From d15c0c7ac118cb23241b002e7206221283e36e2d Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Wed, 17 Oct 2012 18:25:06 +0000
Subject: Add a loop vectorizer.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166112 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 801 +++++++++++++++++++++++++++++
 1 file changed, 801 insertions(+)
 create mode 100644 lib/Transforms/Vectorize/LoopVectorize.cpp

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
new file mode 100644
index 0000000..60405e7
--- /dev/null
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -0,0 +1,801 @@
+//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a simple loop vectorizer. We currently only support single block
+// loops. We have a very simple and restrictive legality check: we need to read
+// and write from disjoint memory locations. We still don't have a cost model.
+// This pass has three parts:
+// 1. The main loop pass that drives the different parts.
+// 2. LoopVectorizationLegality - A helper class that checks for the legality
+//    of the vectorization.
+// 3. SingleBlockLoopVectorizer - A helper class that performs the actual
+//    widening of instructions.
+//
+//===----------------------------------------------------------------------===//
+#define LV_NAME "loop-vectorize"
+#define DEBUG_TYPE LV_NAME
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Value.h"
+#include "llvm/Function.h"
+#include "llvm/Module.h"
+#include "llvm/Type.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/DataLayout.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+using namespace llvm;
+
+// Hidden option: the vector width used for every loop; defaults to 4.
+static cl::opt<unsigned> DefaultVectorizationFactor(
+    "default-loop-vectorize-width", cl::Hidden, cl::init(4),
+    cl::desc("Set the default loop vectorization width"));
+
+namespace {
+
+/// Vectorize a simple loop. This class performs the widening of simple single
+/// basic block loops into vectors. It does not perform any
+/// vectorization-legality checks, and just does it.  It widens the vectors
+/// to a given vectorization factor (VF).
+class SingleBlockLoopVectorizer {
+public:
+  /// Ctor. Records the loop to widen, the analyses to use and the
+  /// vectorization factor. No IR is changed until vectorize() is called.
+  SingleBlockLoopVectorizer(Loop *OrigLoop, ScalarEvolution *Se, LoopInfo *Li,
+                            unsigned VecWidth):
+  Orig(OrigLoop), SE(Se), LI(Li), VF(VecWidth),
+   Builder(0), Induction(0), OldInduction(0) { }
+
+  /// Frees the builder allocated by copyEmptyLoop(); deleting null is fine.
+  ~SingleBlockLoopVectorizer() { delete Builder; }
+
+  // Perform the actual loop widening (vectorization).
+  void vectorize() {
+    // Create a new empty loop. Unlink the old loop and connect the new one.
+    copyEmptyLoop();
+    // Widen each instruction in the old loop to a new one in the new loop.
+    vectorizeLoop();
+    // Delete the old loop.
+    deleteOldLoop();
+  }
+
+private:
+  // Builder is owned through a raw pointer, so a copy would delete it twice.
+  // Copying is therefore disabled: declared private and never defined.
+  SingleBlockLoopVectorizer(const SingleBlockLoopVectorizer &);
+  void operator=(const SingleBlockLoopVectorizer &);
+
+  /// Create an empty loop, based on the loop ranges of the old loop.
+  void copyEmptyLoop();
+  /// Copy and widen the instructions from the old loop.
+  void vectorizeLoop();
+  /// Delete the old loop.
+  void deleteOldLoop();
+
+  /// Scalarize \p Instr: emit a sequence of VF scalar copies of it.
+  void scalarizeInstruction(Instruction *Instr);
+
+  /// Create a broadcast instruction. This method generates a broadcast
+  /// instruction (shuffle) for loop invariant values and for the induction
+  /// value. If this is the induction variable then we extend it to N, N+1, ...
+  /// this is needed because each iteration in the loop corresponds to a SIMD
+  /// element.
+  Value *getBroadcastInstrs(Value *V);
+
+  /// This is a helper function used by getBroadcastInstrs. It adds 0, 1, 2 ..
+  /// for each element in the vector. Starting from zero.
+  Value *getConsecutiveVector(Value* Val);
+
+  /// Check that the GEP operands are all uniform except for the last index
+  /// which has to be the induction variable.
+  bool isConsecutiveGep(GetElementPtrInst *Gep);
+
+  /// When we go over instructions in the basic block we rely on previous
+  /// values within the current basic block or on loop invariant values.
+  /// When we widen (vectorize) values we place them in the map. If the values
+  /// are not within the map, they have to be loop invariant, so we simply
+  /// broadcast them into a vector.
+  Value *getVectorValue(Value *V);
+
+  /// The original loop.
+  Loop *Orig;
+  // Scev analysis to use.
+  ScalarEvolution *SE;
+  // Loop Info.
+  LoopInfo *LI;
+  // The vectorization factor to use.
+  unsigned VF;
+  // The builder that we use. Allocated by copyEmptyLoop, freed by the dtor.
+  IRBuilder<> *Builder;
+  // --- Vectorization state ---
+  /// The new Induction variable which was added to the new block.
+  Instruction *Induction;
+  /// The induction variable of the old basic block.
+  Instruction *OldInduction;
+  // Maps scalars to widened vectors.
+  DenseMap<Value*, Value*> WidenMap;
+};
+
+
+/// Perform the vectorization legality check. This class does not look at the
+/// profitability of vectorization, only the legality. At the moment the checks
+/// are very simple and focus on single basic block loops with a constant
+/// iteration count and no reductions.
+class LoopVectorizationLegality {
+public:
+  /// \p Dl may be null; the pass obtains it with getAnalysisIfAvailable.
+  LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl)
+    : TheLoop(Lp), SE(Se), DL(Dl) { }
+
+  /// Compute the largest vectorization factor that is *legal* for the loop.
+  /// Profitability is not considered and the value may be large. A result
+  /// of 1 means that the loop was rejected.
+  unsigned getLoopMaxVF();
+
+private:
+  // Tell whether a pointer value is known to be disjoint.
+  // Example: Alloca, Global, NoAlias.
+  bool isKnownDisjoint(Value* Val);
+
+  /// Instruction-by-instruction check of a single basic block loop. The
+  /// caller has already looked at the trip count of the loop, so only the
+  /// individual instructions remain to be inspected.
+  bool canVectorizeBlock(BasicBlock &BB);
+
+  /// The loop under evaluation.
+  Loop *TheLoop;
+  /// Scalar evolution, used for trip counts and recurrences.
+  ScalarEvolution *SE;
+  /// Target data layout.
+  DataLayout *DL;
+};
+
+/// Loop pass driver: checks the legality of innermost loops and widens them.
+struct LoopVectorize : public LoopPass {
+  static char ID; // Pass identification, replacement for typeid
+  LoopVectorize() : LoopPass(ID) {
+    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
+  }
+
+  AliasAnalysis *AA;    // Fetched in runOnLoop; not read by the code below.
+  ScalarEvolution *SE;  // Trip counts and induction recurrences.
+  DataLayout *DL;       // May be null: fetched with getAnalysisIfAvailable.
+  LoopInfo *LI;         // Updated when the old loop block is replaced.
+
+  /// Vectorize \p L if it is legal to do so. Returns true if \p L was changed.
+  virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
+    // Only vectorize innermost loops.
+    if (!L->empty())
+      return false;
+    AA = &getAnalysis<AliasAnalysis>();
+    SE = &getAnalysis<ScalarEvolution>();
+    DL = getAnalysisIfAvailable<DataLayout>();
+    LI = &getAnalysis<LoopInfo>();
+
+    BasicBlock *Header = L->getHeader();
+    DEBUG(dbgs() << "LV: Checking a loop in \"" <<
+          Header->getParent()->getName() << "\"\n");
+
+    // Check if it is legal to vectorize the loop.
+    LoopVectorizationLegality LVL(L, SE, DL);
+    unsigned MaxVF = LVL.getLoopMaxVF();
+
+    // A width of 0 would divide by zero below, and a width of 1 would accept
+    // the MaxVF == 1 that getLoopMaxVF() returns for loops it rejected.
+    const unsigned VF = DefaultVectorizationFactor;
+    if (VF < 2 || MaxVF < VF || (MaxVF % VF)) {
+      DEBUG(dbgs() << "LV: non-vectorizable MaxVF ("<< MaxVF << ").\n");
+      return false;
+    }
+
+    DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< MaxVF << ").\n");
+    // We decided that it is *legal* to vectorize the loop. Do it.
+    SingleBlockLoopVectorizer LB(L, SE, LI, VF);
+    LB.vectorize();
+
+    // The loop is now vectorized. Remove it from the LPM queue.
+    LPM.deleteLoopFromQueue(L);
+    return true;
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    LoopPass::getAnalysisUsage(AU);
+    AU.addRequiredID(LoopSimplifyID);
+    AU.addRequired<AliasAnalysis>();
+    AU.addRequired<LoopInfo>();
+    AU.addRequired<ScalarEvolution>();
+  }
+
+};
+
+Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
+  // Users of the old induction variable must see the new one instead.
+  Value *Scalar = (V == OldInduction) ? Induction : V;
+
+  // Types and constants: i32 0 as the insert position, and an all-zero
+  // <VF x i32> shuffle mask that selects lane 0 for every result lane.
+  Type *Int32Ty = IntegerType::getInt32Ty(Scalar->getContext());
+  Type *WideTy = VectorType::get(Scalar->getType(), VF);
+  Value *Undef = UndefValue::get(WideTy);
+  Constant *Lane0 = ConstantInt::get(Int32Ty, 0);
+  Value *SplatMask = ConstantAggregateZero::get(VectorType::get(Int32Ty, VF));
+
+  // Put the scalar in lane 0 of an undef vector, then splat that lane.
+  Value *InLane0 = Builder->CreateInsertElement(Undef, Scalar, Lane0);
+  Value *Splat = Builder->CreateShuffleVector(InLane0, Undef, SplatMask,
+                                              "broadcast");
+  // A plain value is done once it has been splatted.
+  if (Scalar != Induction)
+    return Splat;
+  // The induction variable additionally gets 0, 1, 2, ... added lane by lane,
+  // because each SIMD lane stands for one consecutive scalar iteration.
+  return getConsecutiveVector(Splat);
+}
+
+Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) {
+  Type *WideTy = Val->getType();
+  assert(WideTy->isVectorTy() && "Must be a vector");
+  Type *ElemTy = WideTy->getScalarType();
+  assert(ElemTy->isIntegerTy() && "Elem must be an integer");
+
+  // Build the constant vector <0, 1, ..., NumLanes - 1> with the same element
+  // type as Val.
+  const unsigned NumLanes = cast<VectorType>(WideTy)->getNumElements();
+  SmallVector<Constant*, 8> Lanes;
+  for (unsigned Lane = 0; Lane != NumLanes; ++Lane)
+    Lanes.push_back(ConstantInt::get(ElemTy, Lane));
+
+  Constant *Offsets = ConstantVector::get(Lanes);
+  assert(Offsets->getType() == WideTy && "Invalid consecutive vec");
+
+  // Lane k of the result is lane k of Val plus k.
+  return Builder->CreateAdd(Val, Offsets, "induction");
+}
+
+
+bool SingleBlockLoopVectorizer::isConsecutiveGep(GetElementPtrInst *Gep) {
+  // Callers pass the result of a dyn_cast, so a missing GEP is expected.
+  if (!Gep)
+    return false;
+
+  unsigned NumOperands = Gep->getNumOperands();
+  Value *LastIndex = Gep->getOperand(NumOperands - 1);
+
+  // The wide load/store is emitted by rewriting the last index to the new
+  // induction variable. That is only the same address when the last index is
+  // the old induction variable itself: an index such as i+1 also steps by one
+  // but would be rewritten to i. Such GEPs are left to be scalarized.
+  if (LastIndex != OldInduction)
+    return false;
+
+  // Check that all of the gep operands are uniform except for the last.
+  for (unsigned i = 0; i < NumOperands - 1; ++i)
+    if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), Orig))
+      return false;
+
+  // The memory is consecutive only if the induction variable steps by one.
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LastIndex));
+  if (!AR)
+    return false;
+  return AR->getStepRecurrence(*SE)->isOne();
+}
+
+Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
+  // Already widened values come from the map; anything else is broadcast.
+  DenseMap<Value*, Value*>::iterator Found = WidenMap.find(V);
+  return Found != WidenMap.end() ? Found->second : getBroadcastInstrs(V);
+}
+
+void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
+  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
+  // Emits VF scalar clones of Instr, one per SIMD lane (see the loop below).
+  SmallVector<Value*, 8> Params; // One entry per operand: vector or scalar.
+
+  // Find all of the vectorized parameters.
+  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+    Value *SrcOp = Instr->getOperand(op);
+
+    // If we are accessing the old induction variable, use the widened new one.
+    if (SrcOp == OldInduction) {
+      Params.push_back(getBroadcastInstrs(Induction));
+      continue;
+    }
+
+    // Try using previously calculated values.
+    Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
+
+    // If the src is an instruction that appeared earlier in the basic block
+    // then it should already be vectorized.
+    if (SrcInst && SrcInst->getParent() == Instr->getParent()) {
+      assert(WidenMap.count(SrcInst) && "Source operand is unavailable");
+      // The parameter is a vector value from earlier.
+      Params.push_back(WidenMap[SrcInst]);
+    } else {
+      // The parameter is a scalar from outside the loop. Maybe even a constant.
+      Params.push_back(SrcOp);
+    }
+  }
+
+  assert(Params.size() == Instr->getNumOperands() &&
+         "Invalid number of operands");
+
+  // Does this instruction produce a value?
+  bool IsVoidRetTy = Instr->getType()->isVoidTy();
+  Value *VecResults = 0;
+
+  // If we have a return value, start from an undef vector. The results of the
+  // scalarized instructions are inserted into this vector one lane at a time.
+  if (!IsVoidRetTy)
+    VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF));
+
+  // Emit one scalar clone for each of the VF lanes.
+  for (unsigned i = 0; i < VF; ++i) {
+    Instruction *Cloned = Instr->clone();
+    if (!IsVoidRetTy)
+      Cloned->setName(Instr->getName() + ".cloned");
+    // Replace the operands of the cloned instructions with extracted scalars.
+    for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+      Value *Op = Params[op];
+      // Param is a vector. Need to extract lane i; scalars are used as they are.
+      if (Op->getType()->isVectorTy())
+        Op = Builder->CreateExtractElement(Op, Builder->getInt32(i));
+      Cloned->setOperand(op, Op);
+    }
+
+    // Place the clonsed scalar in the new loop.
+    Builder->Insert(Cloned);
+
+    // If the original scalar returns a value we need to place it in a vector
+    // so that future users will be able to use it (lane i holds clone i).
+    if (!IsVoidRetTy)
+      VecResults = Builder->CreateInsertElement(VecResults, Cloned,
+                                               Builder->getInt32(i));
+  }
+  // Record the packed result so that later users find it in WidenMap.
+  if (!IsVoidRetTy)
+    WidenMap[Instr] = VecResults;
+}
+
+void SingleBlockLoopVectorizer::copyEmptyLoop() {
+  assert(Orig->getNumBlocks() == 1 && "Invalid loop");
+  BasicBlock *PH = Orig->getLoopPreheader();
+  BasicBlock *ExitBlock = Orig->getExitBlock();
+  assert(ExitBlock && "Invalid loop exit");
+  // Builds the skeleton of the vector loop: index PHI, increment, exit test.
+  // Create a new single-basic block loop, placed before the exit block.
+  BasicBlock *BB = BasicBlock::Create(PH->getContext(), "vectorizedloop",
+                                      PH->getParent(), ExitBlock);
+
+  // The old induction variable is taken to be the header's first instruction.
+  BasicBlock *OldBasicBlock = Orig->getHeader();
+  PHINode *OldInd = dyn_cast<PHINode>(OldBasicBlock->begin());
+  assert(OldInd && "We must have a single phi node.");
+  Type *IdxTy = OldInd->getType(); // The new index reuses the old index type.
+
+  // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
+  // inside the loop.
+  Builder = new IRBuilder<>(BB);
+  Builder->SetInsertPoint(BB);
+
+  // Generate the induction variable.
+  PHINode *Phi = Builder->CreatePHI(IdxTy, 2, "index");
+  Constant *Zero = ConstantInt::get(IdxTy, 0); // Start value of the new index.
+  Constant *Step = ConstantInt::get(IdxTy, VF); // Advance VF per iteration.
+
+  // Find the loop boundaries.
+  const SCEV *ExitCount = SE->getExitCount(Orig, Orig->getHeader());
+  assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
+
+  // Get the trip count from the count by adding 1.
+  ExitCount = SE->getAddExpr(ExitCount,
+                             SE->getConstant(ExitCount->getType(), 1));
+
+  // Expand the trip count and place the new instructions in the preheader.
+  // Notice that the pre-header does not change, only the loop body.
+  SCEVExpander Exp(*SE, "induction");
+  Instruction *Loc = Orig->getLoopPreheader()->getTerminator();
+  if (ExitCount->getType() != Phi->getType())
+    ExitCount = SE->getSignExtendExpr(ExitCount, Phi->getType());
+  Value *Count = Exp.expandCodeFor(ExitCount, Phi->getType(), Loc);
+  // NOTE(review): the extension above assumes ExitCount is narrower than Phi.
+  // Create index + VF and fill the PHINode (zero on entry, Next on backedge).
+  Value *Next = Builder->CreateAdd(Phi, Step, "index.next");
+  Phi->addIncoming(Zero, PH);
+  Phi->addIncoming(Next, BB);
+  // Leave the loop once the next index equals the trip count.
+  Value *ICmp = Builder->CreateICmpEQ(Next, Count);
+  Builder->CreateCondBr(ICmp, ExitBlock, BB);
+  // Redirect the preheader into the new loop.
+  PH->getTerminator()->setSuccessor(0, BB);
+  Builder->SetInsertPoint(BB->getFirstInsertionPt());
+
+  // Save the indiction variables.
+  Induction = Phi;
+  OldInduction = OldInd;
+}
+
+void SingleBlockLoopVectorizer::vectorizeLoop() {
+  BasicBlock &BB = *Orig->getHeader(); // Old body; output goes via Builder.
+
+  // For each instruction in the old loop.
+  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
+  Instruction *Inst = it;
+
+    switch (Inst->getOpcode()) {
+      case Instruction::PHI:
+      case Instruction::Br:
+        // Nothing to do for PHIs and BR, since we already took care of the
+        // loop control flow instructions.
+        continue;
+
+      case Instruction::Add:
+      case Instruction::FAdd:
+      case Instruction::Sub:
+      case Instruction::FSub:
+      case Instruction::Mul:
+      case Instruction::FMul:
+      case Instruction::UDiv:
+      case Instruction::SDiv:
+      case Instruction::FDiv:
+      case Instruction::URem:
+      case Instruction::SRem:
+      case Instruction::FRem:
+      case Instruction::Shl:
+      case Instruction::LShr:
+      case Instruction::AShr:
+      case Instruction::And:
+      case Instruction::Or:
+      case Instruction::Xor: {
+        // Widen binops: the same opcode applied to the vector operands.
+        BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
+        Value *A = getVectorValue(Inst->getOperand(0));
+        Value *B = getVectorValue(Inst->getOperand(1));
+        // Use this vector value for all users of the original instruction.
+        WidenMap[Inst] = Builder->CreateBinOp(BinOp->getOpcode(), A, B);
+        break;
+      }
+      case Instruction::Select: {
+        // Widen selects. All three operands, condition included, are vectors.
+        Value *A = getVectorValue(Inst->getOperand(0));
+        Value *B = getVectorValue(Inst->getOperand(1));
+        Value *C = getVectorValue(Inst->getOperand(2));
+        WidenMap[Inst] = Builder->CreateSelect(A, B, C);
+        break;
+      }
+
+      case Instruction::ICmp:
+      case Instruction::FCmp: {
+        // Widen compares. Generate vector compares with the same predicate.
+        bool FCmp = (Inst->getOpcode() == Instruction::FCmp);
+        CmpInst *Cmp = dyn_cast<CmpInst>(Inst);
+        Value *A = getVectorValue(Inst->getOperand(0));
+        Value *B = getVectorValue(Inst->getOperand(1));
+        if (FCmp)
+          WidenMap[Inst] = Builder->CreateFCmp(Cmp->getPredicate(), A, B);
+        else
+          WidenMap[Inst] = Builder->CreateICmp(Cmp->getPredicate(), A, B);
+        break;
+      }
+
+      case Instruction::Store: {
+        // Attempt to issue a wide store.
+        StoreInst *SI = dyn_cast<StoreInst>(Inst);
+        Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF);
+        Value *Ptr = SI->getPointerOperand();
+        unsigned Alignment = SI->getAlignment();
+        GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+        // Not a consecutive GEP (or not a GEP at all). Scalarize the store.
+        if (!isConsecutiveGep(Gep)) {
+          scalarizeInstruction(Inst);
+          break;
+        }
+        // Create the new GEP with the new induction variable, then store the
+        // whole vector. NOTE(review): scalar alignment is reused - confirm.
+        GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+        unsigned NumOperands = Gep->getNumOperands();
+        Gep2->setOperand(NumOperands - 1, Induction);
+        Ptr = Builder->Insert(Gep2);
+        Ptr = Builder->CreateBitCast(Ptr, StTy->getPointerTo());
+        Value *Val = getVectorValue(SI->getValueOperand());
+        Builder->CreateStore(Val, Ptr)->setAlignment(Alignment);
+        break;
+      }
+      case Instruction::Load: {
+        // Attempt to issue a wide load.
+        LoadInst *LI = dyn_cast<LoadInst>(Inst);
+        Type *RetTy = VectorType::get(LI->getType(), VF);
+        Value *Ptr = LI->getPointerOperand();
+        unsigned Alignment = LI->getAlignment();
+        GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+
+        // Not a consecutive GEP (or not a GEP at all). Scalarize the load.
+        if (!isConsecutiveGep(Gep)) {
+          scalarizeInstruction(Inst);
+          break;
+        }
+        // Create the new GEP with the new induction variable, then load the
+        // whole vector. NOTE(review): scalar alignment is reused - confirm.
+        GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+        unsigned NumOperands = Gep->getNumOperands();
+        Gep2->setOperand(NumOperands - 1, Induction);
+        Ptr = Builder->Insert(Gep2);
+        Ptr = Builder->CreateBitCast(Ptr, RetTy->getPointerTo());
+        LI = Builder->CreateLoad(Ptr);
+        LI->setAlignment(Alignment);
+        // Use this vector value for all users of the load.
+        WidenMap[Inst] = LI;
+        break;
+      }
+      case Instruction::ZExt:
+      case Instruction::SExt:
+      case Instruction::FPToUI:
+      case Instruction::FPToSI:
+      case Instruction::FPExt:
+      case Instruction::PtrToInt:
+      case Instruction::IntToPtr:
+      case Instruction::SIToFP:
+      case Instruction::UIToFP:
+      case Instruction::Trunc:
+      case Instruction::FPTrunc:
+      case Instruction::BitCast: {
+        // Vectorize casts: same cast opcode, destination widened to VF lanes.
+        CastInst *CI = dyn_cast<CastInst>(Inst);
+        Value *A = getVectorValue(Inst->getOperand(0));
+        Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
+        WidenMap[Inst] = Builder->CreateCast(CI->getOpcode(), A, DestTy);
+        break;
+      }
+
+      default:
+        /// All other instructions are unsupported. Scalarize them.
+        scalarizeInstruction(Inst);
+        break;
+    }// end of switch.
+  }// end of for_each instr.
+}
+
+void SingleBlockLoopVectorizer::deleteOldLoop() {
+  // The scalar body, and the vector block that holds the new induction PHI.
+  BasicBlock *OldBody = Orig->getHeader();
+  BasicBlock *NewBody = Induction->getParent();
+
+  // Drop cached SCEV facts, then swap the blocks in the loop structures.
+  SE->forgetLoop(Orig);
+  LI->removeBlock(OldBody);
+  Orig->addBasicBlockToLoop(NewBody, LI->getBase());
+  DeleteDeadBlock(OldBody);
+}
+
+unsigned LoopVectorizationLegality::getLoopMaxVF() {
+  // A factor of one is the answer for every loop that we reject.
+  const unsigned Scalar = 1;
+
+  // LoopSimplify is required by the pass, so a preheader is expected here.
+  BasicBlock *Preheader = TheLoop->getLoopPreheader();
+  if (Preheader == 0) {
+    assert(false && "No preheader!!");
+    DEBUG(dbgs() << "LV: Loop not normalized." << "\n");
+    return Scalar;
+  }
+
+  // Only loops made of exactly one basic block are handled.
+  const unsigned BlockCount = TheLoop->getNumBlocks();
+  if (BlockCount != 1) {
+    DEBUG(dbgs() << "LV: Too many blocks:" << BlockCount << "\n");
+    return Scalar;
+  }
+
+  // With a single block, the header is the whole loop body.
+  BasicBlock *Body = TheLoop->getHeader();
+  DEBUG(dbgs() << "LV: Found a loop: " << Body->getName() << "\n");
+
+  // The trip multiple bounds the factor. Below 2, skip scanning the block.
+  const unsigned Multiple = SE->getSmallConstantTripMultiple(TheLoop, Body);
+  if (Multiple < 2) {
+    DEBUG(dbgs() << "LV: Can't find a vectorizable loop structure\n");
+    return Scalar;
+  }
+
+  // Inspect every instruction, including the memory dependences.
+  if (!canVectorizeBlock(*Body)) {
+    DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
+    return Scalar;
+  }
+
+  DEBUG(dbgs() << "LV: We can vectorize this loop! VF="<<Multiple<<"\n");
+  return Multiple;
+}
+
+bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
+  // Accept the block only if it has a single integer induction PHI that
+  // starts at zero and steps by one, only simple loads and stores, no calls,
+  // no values used outside the block, and provably disjoint memory accesses.
+  // Holds the read and write pointers that we find.
+  typedef SmallVector<Value*, 10> ValueVector;
+  ValueVector Reads;
+  ValueVector Writes;
+
+  unsigned NumPhis = 0;
+  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
+    Instruction *I = it;
+    PHINode *Phi = dyn_cast<PHINode>(I);
+    if (Phi) {
+      NumPhis++;
+      // We only look at integer phi nodes.
+      if (!Phi->getType()->isIntegerTy()) {
+        DEBUG(dbgs() << "LV: Found an non-int PHI.\n");
+        return false;
+      }
+
+      // Only one PHI, the induction variable, is supported.
+      if (NumPhis > 1) {
+        DEBUG(dbgs() << "LV: Found more than one PHI.\n");
+        return false;
+      }
+
+      // This should not happen because the loop should be normalized.
+      if (Phi->getNumIncomingValues() != 2) {
+        DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
+        return false;
+      }
+
+      // Check that the PHI is consecutive and starts at zero.
+      const SCEV *PhiScev = SE->getSCEV(Phi);
+      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
+      if (!AR) {
+        DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
+        return false;
+      }
+      const SCEV *Step = AR->getStepRecurrence(*SE);
+      const SCEV *Start = AR->getStart();
+      if (!Step->isOne() || !Start->isZero()) {
+        DEBUG(dbgs() << "LV: PHI does not start at zero or steps by one.\n");
+        return false;
+      }
+    }
+
+    // IF this is a load, record its pointer. If it is not a load, abort.
+    // Notice that we don't handle function calls that read or write.
+    if (I->mayReadFromMemory()) {
+      LoadInst *Ld = dyn_cast<LoadInst>(I);
+      if (!Ld) return false;
+      if (!Ld->isSimple()) {
+        DEBUG(dbgs() << "LV: Found a non-simple load.\n");
+        return false;
+      }
+      GetUnderlyingObjects(Ld->getPointerOperand(), Reads, DL);
+    }
+
+    // Record store pointers. Abort on all other instructions that write to
+    // memory.
+    if (I->mayWriteToMemory()) {
+      StoreInst *St = dyn_cast<StoreInst>(I);
+      if (!St) return false;
+      if (!St->isSimple()) {
+        DEBUG(dbgs() << "LV: Found a non-simple store.\n");
+        return false;
+      }
+      GetUnderlyingObjects(St->getPointerOperand(), Writes, DL);
+    }
+
+    // We still don't handle functions. Print the whole call rather than the
+    // callee name: an indirect call has no called function to ask for a name.
+    CallInst *CI = dyn_cast<CallInst>(I);
+    if (CI) {
+      DEBUG(dbgs() << "LV: Found a call site:" << *CI << "\n");
+      return false;
+    }
+
+    // We do not re-vectorize vectors.
+    if (!VectorType::isValidElementType(I->getType()) &&
+        !I->getType()->isVoidTy()) {
+      DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
+      return false;
+    }
+    // Check that all of the users of this instruction are inside the BB.
+    for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
+         UI != UE; ++UI) {
+      Instruction *U = cast<Instruction>(*UI);
+      BasicBlock *Parent = U->getParent();
+      if (Parent != &BB) {
+        DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
+        return false;
+      }
+    }
+  } // next instr.
+
+  // Check that the underlying objects of the reads and writes are either
+  // disjoint memory locations, or that they are no-alias arguments.
+  ValueVector::iterator r, re, w, we;
+  for (r = Reads.begin(), re = Reads.end(); r != re; ++r) {
+    if (!isKnownDisjoint(*r)) {
+      DEBUG(dbgs() << "LV: Found a bad read Ptr: "<< **r << "\n");
+      return false;
+    }
+  }
+
+  for (w = Writes.begin(), we = Writes.end(); w != we; ++w) {
+    if (!isKnownDisjoint(*w)) {
+      DEBUG(dbgs() << "LV: Found a bad write Ptr: "<< **w << "\n");
+      return false;
+    }
+  }
+
+  // Check that no underlying object is written through more than once.
+  SmallPtrSet<Value*, 8> BasePointers;
+  for (w = Writes.begin(), we = Writes.end(); w != we; ++w) {
+    if (BasePointers.count(*w)) {
+      DEBUG(dbgs() << "LV: Multiple writes to the same index :"<< **w << "\n");
+      return false;
+    }
+    BasePointers.insert(*w);
+  }
+
+  // Sort the writes vector so that we can use a binary search.
+  std::sort(Writes.begin(), Writes.end());
+  // Check that the reads and the writes are disjoint.
+  for (r = Reads.begin(), re = Reads.end(); r != re; ++r) {
+    if (std::binary_search(Writes.begin(), Writes.end(), *r)) {
+      DEBUG(dbgs() << "Vectorizer: Found a read/write ptr:"<< **r << "\n");
+      return false;
+    }
+  }
+
+  // All is okay.
+  return true;
+}
+
+/// Tells whether \p Val is a base object that the legality check treats as
+/// disjoint: a global value, an alloca, or an argument marked NoAlias.
+bool LoopVectorizationLegality::isKnownDisjoint(Value* Val) {
+  assert(Val && "Invalid value");
+  // Globals and stack slots are accepted outright.
+  if (isa<GlobalValue>(Val) || isa<AllocaInst>(Val))
+    return true;
+
+  // Everything else must be an argument carrying the NoAlias attribute.
+  if (Argument *Arg = dyn_cast<Argument>(Val))
+    return Arg->hasNoAliasAttr();
+  return false;
+}
+
+} // namespace
+
+char LoopVectorize::ID = 0; // Definition of the static member declared above.
+static const char lv_name[] = "Loop Vectorization"; // Descriptive pass name.
+INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify) // NOTE(review): LoopInfo is required too.
+INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
+
+namespace llvm {
+
+/// Returns a newly allocated instance of the loop vectorization pass.
+Pass *createLoopVectorizePass() { return new LoopVectorize(); }
+
+} // namespace llvm
+
-- 
cgit v1.1


From a5401d67304b9814fc13521563a9634f37f2047b Mon Sep 17 00:00:00 2001
From: Roman Divacky <rdivacky@freebsd.org>
Date: Wed, 17 Oct 2012 21:07:35 +0000
Subject: Fix some typos and wrong indenting.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166128 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 60405e7..ce44db0 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -365,7 +365,7 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
       Cloned->setOperand(op, Op);
     }
 
-    // Place the clonsed scalar in the new loop.
+    // Place the cloned scalar in the new loop.
     Builder->Insert(Cloned);
 
     // If the original scalar returns a value we need to place it in a vector
@@ -432,7 +432,7 @@ void SingleBlockLoopVectorizer::copyEmptyLoop() {
   PH->getTerminator()->setSuccessor(0, BB);
   Builder->SetInsertPoint(BB->getFirstInsertionPt());
 
-  // Save the indiction variables.
+  // Save the induction variables.
   Induction = Phi;
   OldInduction = OldInd;
 }
@@ -442,7 +442,7 @@ void SingleBlockLoopVectorizer::vectorizeLoop() {
 
   // For each instruction in the old loop.
   for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
-  Instruction *Inst = it;
+    Instruction *Inst = it;
 
     switch (Inst->getOpcode()) {
       case Instruction::PHI:
@@ -677,7 +677,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
       }
     }
 
-    // IF this is a load, record its pointer. If it is not a load, abort.
+    // If this is a load, record its pointer. If it is not a load, abort.
     // Notice that we don't handle function calls that read or write.
     if (I->mayReadFromMemory()) {
       LoadInst *Ld = dyn_cast<LoadInst>(I);
-- 
cgit v1.1


From c2358eb361ee3304e553c0d283c5c3a44f28950f Mon Sep 17 00:00:00 2001
From: Jakub Staszak <kubastaszak@gmail.com>
Date: Wed, 17 Oct 2012 23:06:37 +0000
Subject: Remove redundant SetInsertPoint call.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166138 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 1 -
 1 file changed, 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index ce44db0..b256155 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -398,7 +398,6 @@ void SingleBlockLoopVectorizer::copyEmptyLoop() {
   // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
   // inside the loop.
   Builder = new IRBuilder<>(BB);
-  Builder->SetInsertPoint(BB);
 
   // Generate the induction variable.
   PHINode *Phi = Builder->CreatePHI(IdxTy, 2, "index");
-- 
cgit v1.1


From 6dc0050f6d42520ad92e6786551f27ee3f365ecd Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi <geek4civic@gmail.com>
Date: Wed, 17 Oct 2012 23:40:15 +0000
Subject: LoopVectorize.cpp: Fix a warning. [-Wunused-variable]

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166153 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index b256155..f84e392 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -195,9 +195,8 @@ struct LoopVectorize : public LoopPass {
     DL = getAnalysisIfAvailable<DataLayout>();
     LI = &getAnalysis<LoopInfo>();
 
-    BasicBlock *Header = L->getHeader();
     DEBUG(dbgs() << "LV: Checking a loop in \"" <<
-          Header->getParent()->getName() << "\"\n");
+          L->getHeader()->getParent()->getName() << "\"\n");
 
     // Check if it is legal to vectorize the loop.
     LoopVectorizationLegality LVL(L, SE, DL);
-- 
cgit v1.1


From 1953ace81d3121808967a7fa47ad6d631499933d Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Thu, 18 Oct 2012 05:29:12 +0000
Subject: Vectorizer: Add support for loops with an unknown count. For example:

     for (i=0; i<n; i++){
        a[i] = b[i+1] + c[i+3];
     }



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166165 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 227 ++++++++++++++++++++---------
 1 file changed, 159 insertions(+), 68 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index f84e392..80fdad3 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -28,6 +28,8 @@
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Value.h"
 #include "llvm/Function.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/Verifier.h"
 #include "llvm/Module.h"
 #include "llvm/Type.h"
 #include "llvm/ADT/SmallVector.h"
@@ -65,8 +67,8 @@ public:
 
   /// Ctor.
   SingleBlockLoopVectorizer(Loop *OrigLoop, ScalarEvolution *Se, LoopInfo *Li,
-                            unsigned VecWidth):
-  Orig(OrigLoop), SE(Se), LI(Li), VF(VecWidth),
+                            LPPassManager *Lpm, unsigned VecWidth):
+  Orig(OrigLoop), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth),
    Builder(0), Induction(0), OldInduction(0) { }
 
   ~SingleBlockLoopVectorizer() {
@@ -76,20 +78,20 @@ public:
   // Perform the actual loop widening (vectorization).
   void vectorize() {
     ///Create a new empty loop. Unlink the old loop and connect the new one.
-    copyEmptyLoop();
+    createEmptyLoop();
     /// Widen each instruction in the old loop to a new one in the new loop.
     vectorizeLoop();
-    // Delete the old loop.
-    deleteOldLoop();
+    // register the new loop.
+    cleanup();
  }
 
 private:
   /// Create an empty loop, based on the loop ranges of the old loop.
-  void copyEmptyLoop();
+  void createEmptyLoop();
   /// Copy and widen the instructions from the old loop.
   void vectorizeLoop();
-  /// Delete the old loop.
-  void deleteOldLoop();
+  /// Insert the new loop to the loop hierarchy and pass manager.
+  void cleanup();
 
   /// This instruction is un-vectorizable. Implement it as a sequence
   /// of scalars.
@@ -123,6 +125,8 @@ private:
   ScalarEvolution *SE;
   // Loop Info.
   LoopInfo *LI;
+  // Loop Pass Manager.
+  LPPassManager *LPM;
   // The vectorization factor to use.
   unsigned VF;
 
@@ -132,9 +136,9 @@ private:
   // --- Vectorization state ---
 
   /// The new Induction variable which was added to the new block.
-  Instruction *Induction;
+  PHINode *Induction;
   /// The induction variable of the old basic block.
-  Instruction *OldInduction;
+  PHINode *OldInduction;
   // Maps scalars to widened vectors.
   DenseMap<Value*, Value*> WidenMap;
 };
@@ -184,6 +188,7 @@ struct LoopVectorize : public LoopPass {
   ScalarEvolution *SE;
   DataLayout *DL;
   LoopInfo *LI;
+  DominatorTree *DT;
 
   virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
     // Only vectorize innermost loops.
@@ -194,6 +199,7 @@ struct LoopVectorize : public LoopPass {
     SE = &getAnalysis<ScalarEvolution>();
     DL = getAnalysisIfAvailable<DataLayout>();
     LI = &getAnalysis<LoopInfo>();
+    DT = &getAnalysis<DominatorTree>();
 
     DEBUG(dbgs() << "LV: Checking a loop in \"" <<
           L->getHeader()->getParent()->getName() << "\"\n");
@@ -203,8 +209,7 @@ struct LoopVectorize : public LoopPass {
     unsigned MaxVF = LVL.getLoopMaxVF();
 
     // Check that we can vectorize using the chosen vectorization width.
-    if ((MaxVF < DefaultVectorizationFactor) ||
-        (MaxVF % DefaultVectorizationFactor)) {
+    if (MaxVF < DefaultVectorizationFactor) {
       DEBUG(dbgs() << "LV: non-vectorizable MaxVF ("<< MaxVF << ").\n");
       return false;
     }
@@ -212,11 +217,10 @@ struct LoopVectorize : public LoopPass {
     DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< MaxVF << ").\n");
 
     // If we decided that is is *legal* to vectorizer the loop. Do it.
-    SingleBlockLoopVectorizer LB(L, SE, LI, DefaultVectorizationFactor);
+    SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, DefaultVectorizationFactor);
     LB.vectorize();
 
-    // The loop is now vectorized. Remove it from LMP.
-    LPM.deleteLoopFromQueue(L);
+    DEBUG(verifyFunction(*L->getHeader()->getParent()));
     return true;
   }
 
@@ -226,6 +230,7 @@ struct LoopVectorize : public LoopPass {
     AU.addRequired<AliasAnalysis>();
     AU.addRequired<LoopInfo>();
     AU.addRequired<ScalarEvolution>();
+    AU.addRequired<DominatorTree>();
   }
 
 };
@@ -327,7 +332,7 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
     Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
 
     // If the src is an instruction that appeared earlier in the basic block
-    // then it should already be vectorized. 
+    // then it should already be vectorized.
     if (SrcInst && SrcInst->getParent() == Instr->getParent()) {
       assert(WidenMap.count(SrcInst) && "Source operand is unavailable");
       // The parameter is a vector value from earlier.
@@ -378,28 +383,71 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
     WidenMap[Instr] = VecResults;
 }
 
-void SingleBlockLoopVectorizer::copyEmptyLoop() {
-  assert(Orig->getNumBlocks() == 1 && "Invalid loop");
-  BasicBlock *PH = Orig->getLoopPreheader();
+void SingleBlockLoopVectorizer::createEmptyLoop() {
+  /*
+   In this function we generate a new loop. The new loop will contain
+   the vectorized instructions while the old loop will continue to run the
+   scalar remainder.
+
+   [  ] <-- vector loop bypass.
+  /  |
+ /   v
+|   [ ]     <-- vector pre header.
+|    |
+|    v
+|   [  ] \
+|   [  ]_|   <-- vector loop.
+|    |
+ \   v
+   >[ ]   <--- middle-block.
+  /  |
+ /   v
+|   [ ]     <--- new preheader.
+|    |
+|    v
+|   [ ] \
+|   [ ]_|   <-- old scalar loop to handle remainder. ()
+ \   |
+  \  v
+   >[ ]     <-- exit block.
+   ...
+   */
+
+  // This is the original scalar-loop preheader.
+  BasicBlock *BypassBlock = Orig->getLoopPreheader();
   BasicBlock *ExitBlock = Orig->getExitBlock();
-  assert(ExitBlock && "Invalid loop exit");
+  assert(ExitBlock && "Must have an exit block");
+
+  BasicBlock *ScalarBody = Orig->getHeader();
+  assert(Orig->getNumBlocks() == 1 && "Invalid loop");
+  assert(ScalarBody && BypassBlock && "Invalid loop structure");
+
+  BasicBlock *VectorPH =
+      BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
+  BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(),
+                                                 "vector.body");
 
-  // Create a new single-basic block loop.
-  BasicBlock *BB = BasicBlock::Create(PH->getContext(), "vectorizedloop",
-                                      PH->getParent(), ExitBlock);
+  BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(),
+                                                  "middle.block");
+
+
+  BasicBlock *ScalarPH =
+          MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(),
+                                       "scalar.preheader");
 
   // Find the induction variable.
   BasicBlock *OldBasicBlock = Orig->getHeader();
-  PHINode *OldInd = dyn_cast<PHINode>(OldBasicBlock->begin());
-  assert(OldInd && "We must have a single phi node.");
-  Type *IdxTy = OldInd->getType();
+  OldInduction = dyn_cast<PHINode>(OldBasicBlock->begin());
+  assert(OldInduction && "We must have a single phi node.");
+  Type *IdxTy = OldInduction->getType();
 
   // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
   // inside the loop.
-  Builder = new IRBuilder<>(BB);
+  Builder = new IRBuilder<>(VecBody);
+  Builder->SetInsertPoint(VecBody->getFirstInsertionPt());
 
   // Generate the induction variable.
-  PHINode *Phi = Builder->CreatePHI(IdxTy, 2, "index");
+  Induction = Builder->CreatePHI(IdxTy, 2, "index");
   Constant *Zero = ConstantInt::get(IdxTy, 0);
   Constant *Step = ConstantInt::get(IdxTy, VF);
 
@@ -407,32 +455,78 @@ void SingleBlockLoopVectorizer::copyEmptyLoop() {
   const SCEV *ExitCount = SE->getExitCount(Orig, Orig->getHeader());
   assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
 
-  // Get the trip count from the count by adding 1.
+  // Get the total trip count from the count by adding 1.
   ExitCount = SE->getAddExpr(ExitCount,
                              SE->getConstant(ExitCount->getType(), 1));
 
   // Expand the trip count and place the new instructions in the preheader.
   // Notice that the pre-header does not change, only the loop body.
   SCEVExpander Exp(*SE, "induction");
-  Instruction *Loc = Orig->getLoopPreheader()->getTerminator();
-  if (ExitCount->getType() != Phi->getType())
-    ExitCount = SE->getSignExtendExpr(ExitCount, Phi->getType());
-  Value *Count = Exp.expandCodeFor(ExitCount, Phi->getType(), Loc);
-  
+  Instruction *Loc = BypassBlock->getTerminator();
+
+  // We may need to extend the index in case there is a type mismatch.
+  // We know that the count starts at zero and does not overflow.
+  // We are using Zext because it should be less expensive.
+  if (ExitCount->getType() != Induction->getType())
+    ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy);
+
+  // Count holds the overall loop count (N).
+  Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc);
+  // Now we need to generate the expression for N - (N % VF), which is
+  // the part that the vectorized body will execute.
+  Constant *CIVF = ConstantInt::get(IdxTy, VF);
+  Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc);
+  Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc);
+
+  // Now, compare the new count to zero. If it is zero, jump to the scalar part.
+  Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
+                               CountRoundDown, ConstantInt::getNullValue(IdxTy),
+                               "cmp.zero", Loc);
+  BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc);
+  // Remove the old terminator.
+  Loc->eraseFromParent();
+
+  // Add a check in the middle block to see if we have completed
+  // all of the iterations in the first vector loop.
+  // If (N - N%VF) == N, then we *don't* need to run the remainder.
+  Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
+                                CountRoundDown, "cmp.n",
+                                MiddleBlock->getTerminator());
+
+  BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
+  // Remove the old terminator.
+  MiddleBlock->getTerminator()->eraseFromParent();
+
   // Create i+1 and fill the PHINode.
-  Value *Next = Builder->CreateAdd(Phi, Step, "index.next");
-  Phi->addIncoming(Zero, PH);
-  Phi->addIncoming(Next, BB);
+  Value *NextIdx = Builder->CreateAdd(Induction, Step, "index.next");
+  Induction->addIncoming(Zero, VectorPH);
+  Induction->addIncoming(NextIdx, VecBody);
   // Create the compare.
-  Value *ICmp = Builder->CreateICmpEQ(Next, Count);
-  Builder->CreateCondBr(ICmp, ExitBlock, BB);
-  // Fix preheader.
-  PH->getTerminator()->setSuccessor(0, BB);
-  Builder->SetInsertPoint(BB->getFirstInsertionPt());
-
-  // Save the induction variables.
-  Induction = Phi;
-  OldInduction = OldInd;
+  Value *ICmp = Builder->CreateICmpEQ(NextIdx, CountRoundDown);
+  Builder->CreateCondBr(ICmp, MiddleBlock, VecBody);
+
+  // Now we have two terminators. Remove the old one from the block.
+  VecBody->getTerminator()->eraseFromParent();
+
+  // Fix the scalar body iteration count.
+  unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH);
+  OldInduction->setIncomingValue(BlockIdx, CountRoundDown);
+
+  // Get ready to start creating new instructions into the vectorized body.
+  Builder->SetInsertPoint(VecBody->getFirstInsertionPt());
+
+  // Register the new loop.
+  Loop* Lp = new Loop();
+  LPM->insertLoop(Lp, Orig->getParentLoop());
+
+  Lp->addBasicBlockToLoop(VecBody, LI->getBase());
+
+  Loop *ParentLoop = Orig->getParentLoop();
+  if (ParentLoop) {
+    ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
+    ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
+    ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
+  }
 }
 
 void SingleBlockLoopVectorizer::vectorizeLoop() {
@@ -575,16 +669,9 @@ void SingleBlockLoopVectorizer::vectorizeLoop() {
   }// end of for_each instr.
 }
 
-void SingleBlockLoopVectorizer::deleteOldLoop() {
+void SingleBlockLoopVectorizer::cleanup() {
   // The original basic block.
-  BasicBlock *BB = Orig->getHeader();
   SE->forgetLoop(Orig);
-
-  LI->removeBlock(BB);
-  Orig->addBasicBlockToLoop(Induction->getParent(), LI->getBase());
-
-  // Remove the old loop block.
-  DeleteDeadBlock(BB);
 }
 
 unsigned LoopVectorizationLegality::getLoopMaxVF() {
@@ -605,26 +692,25 @@ unsigned LoopVectorizationLegality::getLoopMaxVF() {
   BasicBlock *BB = TheLoop->getHeader();
   DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n");
 
-  // Find the max vectorization factor.
-  unsigned MaxVF = SE->getSmallConstantTripMultiple(TheLoop, BB);
-
-
-  // Perform an early check. Do not scan the block if we did not find a loop.
-  if (MaxVF < 2) {
-    DEBUG(dbgs() << "LV: Can't find a vectorizable loop structure\n");
-    return 1;
-  }
-
   // Go over each instruction and look at memory deps.
   if (!canVectorizeBlock(*BB)) {
     DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
     return 1;
   }
 
-  DEBUG(dbgs() << "LV: We can vectorize this loop! VF="<<MaxVF<<"\n");
-  
-  // Okay! We can vectorize. Return the max trip multiple.
-  return MaxVF;
+  // ScalarEvolution needs to be able to find the exit count.
+  const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
+  if (ExitCount == SE->getCouldNotCompute()) {
+    DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
+    return 1;
+  }
+
+  DEBUG(dbgs() << "LV: We can vectorize this loop!\n");
+
+  // Okay! We can vectorize. At this point we don't have any other mem analysis
+  // which may limit our maximum vectorization factor, so just return the
+  // maximum SIMD size.
+  return DefaultVectorizationFactor;
 }
 
 bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
@@ -725,6 +811,11 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
     }
   } // next instr.
 
+  if (NumPhis != 1) {
+      DEBUG(dbgs() << "LV: Did not find a Phi node.\n");
+      return false;
+  }
+
   // Check that the underlying objects of the reads and writes are either
   // disjoint memory locations, or that they are no-alias arguments.
   ValueVector::iterator r, re, w, we;
-- 
cgit v1.1


From d22d5f9122de2c8527f1d8890ce2802a89e0db0c Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Thu, 18 Oct 2012 05:33:02 +0000
Subject: Remove the use of dominators and AA.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166167 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 80fdad3..6ce303c 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -28,7 +28,6 @@
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Value.h"
 #include "llvm/Function.h"
-#include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/Verifier.h"
 #include "llvm/Module.h"
 #include "llvm/Type.h"
@@ -143,7 +142,6 @@ private:
   DenseMap<Value*, Value*> WidenMap;
 };
 
-
 /// Perform the vectorization legality check. This class does not look at the
 /// profitability of vectorization, only the legality. At the moment the checks
 /// are very simple and focus on single basic block loops with a constant
@@ -184,22 +182,18 @@ struct LoopVectorize : public LoopPass {
     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
   }
 
-  AliasAnalysis *AA;
   ScalarEvolution *SE;
   DataLayout *DL;
   LoopInfo *LI;
-  DominatorTree *DT;
 
   virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
     // Only vectorize innermost loops.
     if (!L->empty())
       return false;
 
-    AA = &getAnalysis<AliasAnalysis>();
     SE = &getAnalysis<ScalarEvolution>();
     DL = getAnalysisIfAvailable<DataLayout>();
     LI = &getAnalysis<LoopInfo>();
-    DT = &getAnalysis<DominatorTree>();
 
     DEBUG(dbgs() << "LV: Checking a loop in \"" <<
           L->getHeader()->getParent()->getName() << "\"\n");
@@ -227,10 +221,8 @@ struct LoopVectorize : public LoopPass {
   virtual void getAnalysisUsage(AnalysisUsage &AU) const {
     LoopPass::getAnalysisUsage(AU);
     AU.addRequiredID(LoopSimplifyID);
-    AU.addRequired<AliasAnalysis>();
     AU.addRequired<LoopInfo>();
     AU.addRequired<ScalarEvolution>();
-    AU.addRequired<DominatorTree>();
   }
 
 };
-- 
cgit v1.1


From 3141d2db037bb3c31a1efd50118df45c1aebbb51 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Thu, 18 Oct 2012 06:09:21 +0000
Subject: remove unused variable to fix a warning.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166170 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6ce303c..e3b8fc7 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -410,9 +410,8 @@ void SingleBlockLoopVectorizer::createEmptyLoop() {
   BasicBlock *ExitBlock = Orig->getExitBlock();
   assert(ExitBlock && "Must have an exit block");
 
-  BasicBlock *ScalarBody = Orig->getHeader();
   assert(Orig->getNumBlocks() == 1 && "Invalid loop");
-  assert(ScalarBody && BypassBlock && "Invalid loop structure");
+  assert(BypassBlock && "Invalid loop structure");
 
   BasicBlock *VectorPH =
       BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
-- 
cgit v1.1


From bef36ac2a47dd647af332b8594031df6fe2522c5 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Thu, 18 Oct 2012 17:31:49 +0000
Subject: When looking for a vector representation of a scalar, do a single
 lookup. Also, cache the result of the broadcast instruction. No functionality
 change.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166191 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index e3b8fc7..cb67163 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -63,7 +63,6 @@ namespace {
 /// to a given vectorization factor (VF).
 class SingleBlockLoopVectorizer {
 public:
-
   /// Ctor.
   SingleBlockLoopVectorizer(Loop *OrigLoop, ScalarEvolution *Se, LoopInfo *Li,
                             LPPassManager *Lpm, unsigned VecWidth):
@@ -118,6 +117,8 @@ private:
   /// broadcast them into a vector.
   Value *getVectorValue(Value *V);
 
+  typedef DenseMap<Value*, Value*> ValueMap;
+
   /// The original loop.
   Loop *Orig;
   // Scev analysis to use.
@@ -139,7 +140,7 @@ private:
   /// The induction variable of the old basic block.
   PHINode *OldInduction;
   // Maps scalars to widened vectors.
-  DenseMap<Value*, Value*> WidenMap;
+  ValueMap WidenMap;
 };
 
 /// Perform the vectorization legality check. This class does not look at the
@@ -284,8 +285,8 @@ bool SingleBlockLoopVectorizer::isConsecutiveGep(GetElementPtrInst *Gep) {
     if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), Orig))
       return false;
 
-  // The last operand has to be the induction in order to emit
-  // a wide load/store.
+  // We can emit wide load/stores only if the last index is the induction
+  // variable.
   const SCEV *Last = SE->getSCEV(LastIndex);
   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
     const SCEV *Step = AR->getStepRecurrence(*SE);
@@ -300,9 +301,15 @@ bool SingleBlockLoopVectorizer::isConsecutiveGep(GetElementPtrInst *Gep) {
 }
 
 Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
-  if (WidenMap.count(V))
-    return WidenMap[V];
-  return getBroadcastInstrs(V);
+  // If we saved a vectorized copy of V, use it.
+  ValueMap::iterator it = WidenMap.find(V);
+  if (it != WidenMap.end())
+     return it->second;
+
+  // Broadcast V and save the value for future uses.
+  Value *B = getBroadcastInstrs(V);
+  WidenMap[V] = B;
+  return B;
 }
 
 void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
-- 
cgit v1.1


From b943d9d497175ce44cca7b7bb14b83a86dba7d76 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Thu, 18 Oct 2012 18:34:50 +0000
Subject: Avoid reconstructing the pointer set when searching for duplicated
 read/write pointers.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166205 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index cb67163..9bbd9ab 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -166,7 +166,7 @@ private:
 
   // Check if a pointer value is known to be disjoint.
   // Example: Alloca, Global, NoAlias.
-  bool isKnownDisjoint(Value* Val);
+  bool isidentifiedSafeObject(Value* Val);
 
   /// The loop that we evaluate.
   Loop *TheLoop;
@@ -818,34 +818,31 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
   // disjoint memory locations, or that they are no-alias arguments.
   ValueVector::iterator r, re, w, we;
   for (r = Reads.begin(), re = Reads.end(); r != re; ++r) {
-    if (!isKnownDisjoint(*r)) {
+    if (!isidentifiedSafeObject(*r)) {
       DEBUG(dbgs() << "LV: Found a bad read Ptr: "<< **r << "\n");
       return false;
     }
   }
 
   for (w = Writes.begin(), we = Writes.end(); w != we; ++w) {
-    if (!isKnownDisjoint(*w)) {
+    if (!isidentifiedSafeObject(*w)) {
       DEBUG(dbgs() << "LV: Found a bad write Ptr: "<< **w << "\n");
       return false;
     }
   }
 
   // Check that there are no multiple write locations to the same pointer.
-  SmallPtrSet<Value*, 8> BasePointers;
+  SmallPtrSet<Value*, 8> WritePointerSet;
   for (w = Writes.begin(), we = Writes.end(); w != we; ++w) {
-    if (BasePointers.count(*w)) {
+    if (!WritePointerSet.insert(*w)) {
       DEBUG(dbgs() << "LV: Multiple writes to the same index :"<< **w << "\n");
       return false;
     }
-    BasePointers.insert(*w);
   }
 
-  // Sort the writes vector so that we can use a binary search.
-  std::sort(Writes.begin(), Writes.end());
   // Check that the reads and the writes are disjoint.
   for (r = Reads.begin(), re = Reads.end(); r != re; ++r) {
-    if (std::binary_search(Writes.begin(), Writes.end(), *r)) {
+    if (WritePointerSet.count(*r)) {
       DEBUG(dbgs() << "Vectorizer: Found a read/write ptr:"<< **r << "\n");
       return false;
     }
@@ -857,7 +854,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
 
 /// Checks if the value is a Global variable or if it is an Arguments
 /// marked with the NoAlias attribute.
-bool LoopVectorizationLegality::isKnownDisjoint(Value* Val) {
+bool LoopVectorizationLegality::isidentifiedSafeObject(Value* Val) {
   assert(Val && "Invalid value");
   if (dyn_cast<GlobalValue>(Val))
     return true;
-- 
cgit v1.1


From 6220fb16c1d08534e770785f47717ee80baac517 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Thu, 18 Oct 2012 21:45:31 +0000
Subject: fix a naming typo

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166232 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 9bbd9ab..7af4fa1 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -166,7 +166,7 @@ private:
 
   // Check if a pointer value is known to be disjoint.
   // Example: Alloca, Global, NoAlias.
-  bool isidentifiedSafeObject(Value* Val);
+  bool isIdentifiedSafeObject(Value* Val);
 
   /// The loop that we evaluate.
   Loop *TheLoop;
@@ -427,8 +427,6 @@ void SingleBlockLoopVectorizer::createEmptyLoop() {
 
   BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(),
                                                   "middle.block");
-
-
   BasicBlock *ScalarPH =
           MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(),
                                        "scalar.preheader");
@@ -818,14 +816,14 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
   // disjoint memory locations, or that they are no-alias arguments.
   ValueVector::iterator r, re, w, we;
   for (r = Reads.begin(), re = Reads.end(); r != re; ++r) {
-    if (!isidentifiedSafeObject(*r)) {
+    if (!isIdentifiedSafeObject(*r)) {
       DEBUG(dbgs() << "LV: Found a bad read Ptr: "<< **r << "\n");
       return false;
     }
   }
 
   for (w = Writes.begin(), we = Writes.end(); w != we; ++w) {
-    if (!isidentifiedSafeObject(*w)) {
+    if (!isIdentifiedSafeObject(*w)) {
       DEBUG(dbgs() << "LV: Found a bad write Ptr: "<< **w << "\n");
       return false;
     }
@@ -854,7 +852,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
 
 /// Checks if the value is a Global variable or if it is an Arguments
 /// marked with the NoAlias attribute.
-bool LoopVectorizationLegality::isidentifiedSafeObject(Value* Val) {
+bool LoopVectorizationLegality::isIdentifiedSafeObject(Value* Val) {
   assert(Val && "Invalid value");
   if (dyn_cast<GlobalValue>(Val))
     return true;
-- 
cgit v1.1


From ebd3f27c7e2c6c3a1b76786da0d0205a5fdb1ef5 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Thu, 18 Oct 2012 23:21:01 +0000
Subject: cleanup the comment.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166247 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7af4fa1..f5c9bb3 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -405,7 +405,7 @@ void SingleBlockLoopVectorizer::createEmptyLoop() {
 |    |
 |    v
 |   [ ] \
-|   [ ]_|   <-- old scalar loop to handle remainder. ()
+|   [ ]_|   <-- old scalar loop to handle remainder.
  \   |
   \  v
    >[ ]     <-- exit block.
-- 
cgit v1.1


From 89e7b356f270e29c2e9e18c6bbd30e5925585f06 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Fri, 19 Oct 2012 01:24:18 +0000
Subject: vectorizer: Add support for reading and writing from the same memory
 location.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166255 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index f5c9bb3..5152ec1 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -715,6 +715,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
   ValueVector Reads;
   ValueVector Writes;
 
+  SmallPtrSet<Value*, 16> AnalyzedPtrs;
   unsigned NumPhis = 0;
   for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
     Instruction *I = it;
@@ -766,7 +767,10 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
         DEBUG(dbgs() << "LV: Found a non-simple load.\n");
         return false;
       }
-      GetUnderlyingObjects(Ld->getPointerOperand(), Reads, DL);
+
+      Value* Ptr = Ld->getPointerOperand();
+      if (AnalyzedPtrs.insert(Ptr))
+        GetUnderlyingObjects(Ptr, Reads, DL);
     }
 
     // Record store pointers. Abort on all other instructions that write to
@@ -778,7 +782,10 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
         DEBUG(dbgs() << "LV: Found a non-simple store.\n");
         return false;
       }
-      GetUnderlyingObjects(St->getPointerOperand(), Writes, DL);
+
+      Value* Ptr = St->getPointerOperand();
+      if (AnalyzedPtrs.insert(Ptr))
+        GetUnderlyingObjects(St->getPointerOperand(), Writes, DL);
     }
 
     // We still don't handle functions.
-- 
cgit v1.1


From 17f68c52d2f3b6b2472b07492d76520df230afcd Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <benny.kra@googlemail.com>
Date: Fri, 19 Oct 2012 08:42:02 +0000
Subject: LoopVectorize: Keep the IRBuilder on the stack.

No functionality change.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166274 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 77 ++++++++++++++----------------
 1 file changed, 36 insertions(+), 41 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5152ec1..1602e29 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -67,11 +67,7 @@ public:
   SingleBlockLoopVectorizer(Loop *OrigLoop, ScalarEvolution *Se, LoopInfo *Li,
                             LPPassManager *Lpm, unsigned VecWidth):
   Orig(OrigLoop), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth),
-   Builder(0), Induction(0), OldInduction(0) { }
-
-  ~SingleBlockLoopVectorizer() {
-    delete Builder;
-  }
+  Builder(Se->getContext()), Induction(0), OldInduction(0) { }
 
   // Perform the actual loop widening (vectorization).
   void vectorize() {
@@ -81,7 +77,7 @@ public:
     vectorizeLoop();
     // register the new loop.
     cleanup();
- }
+  }
 
 private:
   /// Create an empty loop, based on the loop ranges of the old loop.
@@ -131,7 +127,7 @@ private:
   unsigned VF;
 
   // The builder that we use
-  IRBuilder<> *Builder;
+  IRBuilder<> Builder;
 
   // --- Vectorization state ---
 
@@ -241,10 +237,10 @@ Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
   Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF));
   Value *UndefVal = UndefValue::get(VTy);
   // Insert the value into a new vector.
-  Value *SingleElem = Builder->CreateInsertElement(UndefVal, V, Zero);
+  Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero);
   // Broadcast the scalar into all locations in the vector.
-  Value *Shuf = Builder->CreateShuffleVector(SingleElem, UndefVal, Zeros,
-                                             "broadcast");
+  Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros,
+                                            "broadcast");
   // We are accessing the induction variable. Make sure to promote the
   // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes.
   if (V == Induction)
@@ -269,7 +265,7 @@ Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) {
   // Add the consecutive indices to the vector value.
   Constant *Cv = ConstantVector::get(Indices);
   assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
-  return Builder->CreateAdd(Val, Cv, "induction");
+  return Builder.CreateAdd(Val, Cv, "induction");
 }
 
 
@@ -304,7 +300,7 @@ Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
   // If we saved a vectorized copy of V, use it.
   ValueMap::iterator it = WidenMap.find(V);
   if (it != WidenMap.end())
-     return it->second;
+    return it->second;
 
   // Broadcast V and save the value for future uses.
   Value *B = getBroadcastInstrs(V);
@@ -364,18 +360,18 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
       Value *Op = Params[op];
       // Param is a vector. Need to extract the right lane.
       if (Op->getType()->isVectorTy())
-        Op = Builder->CreateExtractElement(Op, Builder->getInt32(i));
+        Op = Builder.CreateExtractElement(Op, Builder.getInt32(i));
       Cloned->setOperand(op, Op);
     }
 
     // Place the cloned scalar in the new loop.
-    Builder->Insert(Cloned);
+    Builder.Insert(Cloned);
 
     // If the original scalar returns a value we need to place it in a vector
     // so that future users will be able to use it.
     if (!IsVoidRetTy)
-      VecResults = Builder->CreateInsertElement(VecResults, Cloned,
-                                               Builder->getInt32(i));
+      VecResults = Builder.CreateInsertElement(VecResults, Cloned,
+                                               Builder.getInt32(i));
   }
 
   if (!IsVoidRetTy)
@@ -421,15 +417,15 @@ void SingleBlockLoopVectorizer::createEmptyLoop() {
   assert(BypassBlock && "Invalid loop structure");
 
   BasicBlock *VectorPH =
-      BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
+    BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
   BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(),
-                                                 "vector.body");
+                                                  "vector.body");
 
   BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(),
-                                                  "middle.block");
+                                                     "middle.block");
   BasicBlock *ScalarPH =
-          MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(),
-                                       "scalar.preheader");
+    MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(),
+                                 "scalar.preheader");
 
   // Find the induction variable.
   BasicBlock *OldBasicBlock = Orig->getHeader();
@@ -439,11 +435,10 @@ void SingleBlockLoopVectorizer::createEmptyLoop() {
 
   // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
   // inside the loop.
-  Builder = new IRBuilder<>(VecBody);
-  Builder->SetInsertPoint(VecBody->getFirstInsertionPt());
+  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
 
   // Generate the induction variable.
-  Induction = Builder->CreatePHI(IdxTy, 2, "index");
+  Induction = Builder.CreatePHI(IdxTy, 2, "index");
   Constant *Zero = ConstantInt::get(IdxTy, 0);
   Constant *Step = ConstantInt::get(IdxTy, VF);
 
@@ -494,12 +489,12 @@ void SingleBlockLoopVectorizer::createEmptyLoop() {
   MiddleBlock->getTerminator()->eraseFromParent();
 
   // Create i+1 and fill the PHINode.
-  Value *NextIdx = Builder->CreateAdd(Induction, Step, "index.next");
+  Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
   Induction->addIncoming(Zero, VectorPH);
   Induction->addIncoming(NextIdx, VecBody);
   // Create the compare.
-  Value *ICmp = Builder->CreateICmpEQ(NextIdx, CountRoundDown);
-  Builder->CreateCondBr(ICmp, MiddleBlock, VecBody);
+  Value *ICmp = Builder.CreateICmpEQ(NextIdx, CountRoundDown);
+  Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);
 
   // Now we have two terminators. Remove the old one from the block.
   VecBody->getTerminator()->eraseFromParent();
@@ -509,7 +504,7 @@ void SingleBlockLoopVectorizer::createEmptyLoop() {
   OldInduction->setIncomingValue(BlockIdx, CountRoundDown);
 
   // Get ready to start creating new instructions into the vectorized body.
-  Builder->SetInsertPoint(VecBody->getFirstInsertionPt());
+  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
 
   // Register the new loop.
   Loop* Lp = new Loop();
@@ -562,7 +557,7 @@ void SingleBlockLoopVectorizer::vectorizeLoop() {
         Value *A = getVectorValue(Inst->getOperand(0));
         Value *B = getVectorValue(Inst->getOperand(1));
         // Use this vector value for all users of the original instruction.
-        WidenMap[Inst] = Builder->CreateBinOp(BinOp->getOpcode(), A, B);
+        WidenMap[Inst] = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
         break;
       }
       case Instruction::Select: {
@@ -570,7 +565,7 @@ void SingleBlockLoopVectorizer::vectorizeLoop() {
         Value *A = getVectorValue(Inst->getOperand(0));
         Value *B = getVectorValue(Inst->getOperand(1));
         Value *C = getVectorValue(Inst->getOperand(2));
-        WidenMap[Inst] = Builder->CreateSelect(A, B, C);
+        WidenMap[Inst] = Builder.CreateSelect(A, B, C);
         break;
       }
 
@@ -582,9 +577,9 @@ void SingleBlockLoopVectorizer::vectorizeLoop() {
         Value *A = getVectorValue(Inst->getOperand(0));
         Value *B = getVectorValue(Inst->getOperand(1));
         if (FCmp)
-          WidenMap[Inst] = Builder->CreateFCmp(Cmp->getPredicate(), A, B);
+          WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
         else
-          WidenMap[Inst] = Builder->CreateICmp(Cmp->getPredicate(), A, B);
+          WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B);
         break;
       }
 
@@ -605,10 +600,10 @@ void SingleBlockLoopVectorizer::vectorizeLoop() {
         GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
         unsigned NumOperands = Gep->getNumOperands();
         Gep2->setOperand(NumOperands - 1, Induction);
-        Ptr = Builder->Insert(Gep2);
-        Ptr = Builder->CreateBitCast(Ptr, StTy->getPointerTo());
+        Ptr = Builder.Insert(Gep2);
+        Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
         Value *Val = getVectorValue(SI->getValueOperand());
-        Builder->CreateStore(Val, Ptr)->setAlignment(Alignment);
+        Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
         break;
       }
       case Instruction::Load: {
@@ -629,9 +624,9 @@ void SingleBlockLoopVectorizer::vectorizeLoop() {
         GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
         unsigned NumOperands = Gep->getNumOperands();
         Gep2->setOperand(NumOperands - 1, Induction);
-        Ptr = Builder->Insert(Gep2);
-        Ptr = Builder->CreateBitCast(Ptr, RetTy->getPointerTo());
-        LI = Builder->CreateLoad(Ptr);
+        Ptr = Builder.Insert(Gep2);
+        Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
+        LI = Builder.CreateLoad(Ptr);
         LI->setAlignment(Alignment);
         // Use this vector value for all users of the load.
         WidenMap[Inst] = LI;
@@ -653,7 +648,7 @@ void SingleBlockLoopVectorizer::vectorizeLoop() {
         CastInst *CI = dyn_cast<CastInst>(Inst);
         Value *A = getVectorValue(Inst->getOperand(0));
         Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
-        WidenMap[Inst] = Builder->CreateCast(CI->getOpcode(), A, DestTy);
+        WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy);
         break;
       }
 
@@ -815,8 +810,8 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
   } // next instr.
 
   if (NumPhis != 1) {
-      DEBUG(dbgs() << "LV: Did not find a Phi node.\n");
-      return false;
+    DEBUG(dbgs() << "LV: Did not find a Phi node.\n");
+    return false;
   }
 
   // Check that the underlying objects of the reads and writes are either
-- 
cgit v1.1


From 5dbe64e2bc2e4b96654703e85f909536df7ddb84 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Fri, 19 Oct 2012 23:05:40 +0000
Subject: Vectorizer: Add support for loop reductions.

For example:

  for (i=0; i<n; i++)
   sum += A[i] +  B[i] + i;



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166351 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 466 ++++++++++++++++++++++++-----
 1 file changed, 392 insertions(+), 74 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1602e29..309b60f 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -10,6 +10,8 @@
 // This is a simple loop vectorizer. We currently only support single block
 // loops. We have a very simple and restrictive legality check: we need to read
 // and write from disjoint memory locations. We still don't have a cost model.
+// We do support integer reductions.
+//
 // This pass has three parts:
 // 1. The main loop pass that drives the different parts.
 // 2. LoopVectorizationLegality - A helper class that checks for the legality
@@ -54,9 +56,11 @@ static cl::opt<unsigned>
 DefaultVectorizationFactor("default-loop-vectorize-width",
                           cl::init(4), cl::Hidden,
                           cl::desc("Set the default loop vectorization width"));
-
 namespace {
 
+// Forward declaration.
+class LoopVectorizationLegality;
+
 /// Vectorize a simple loop. This class performs the widening of simple single
 /// basic block loops into vectors. It does not perform any
 /// vectorization-legality checks, and just does it.  It widens the vectors
@@ -67,23 +71,28 @@ public:
   SingleBlockLoopVectorizer(Loop *OrigLoop, ScalarEvolution *Se, LoopInfo *Li,
                             LPPassManager *Lpm, unsigned VecWidth):
   Orig(OrigLoop), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth),
-  Builder(Se->getContext()), Induction(0), OldInduction(0) { }
+   Builder(0), Induction(0), OldInduction(0) { }
+
+  ~SingleBlockLoopVectorizer() {
+    delete Builder;
+  }
 
   // Perform the actual loop widening (vectorization).
-  void vectorize() {
+  void vectorize(LoopVectorizationLegality *Legal) {
     ///Create a new empty loop. Unlink the old loop and connect the new one.
     createEmptyLoop();
     /// Widen each instruction in the old loop to a new one in the new loop.
-    vectorizeLoop();
+    /// Use the Legality module to find the induction and reduction variables.
+   vectorizeLoop(Legal);
     // register the new loop.
     cleanup();
-  }
+ }
 
 private:
   /// Create an empty loop, based on the loop ranges of the old loop.
   void createEmptyLoop();
   /// Copy and widen the instructions from the old loop.
-  void vectorizeLoop();
+  void vectorizeLoop(LoopVectorizationLegality *Legal);
   /// Insert the new loop to the loop hierarchy and pass manager.
   void cleanup();
 
@@ -113,6 +122,10 @@ private:
   /// broadcast them into a vector.
   Value *getVectorValue(Value *V);
 
+  /// Get a uniform vector of constant integers. We use this to get
+  /// vectors of ones and zeros for the reduction code.
+  Constant* getUniformVector(unsigned Val, Type* ScalarTy);
+
   typedef DenseMap<Value*, Value*> ValueMap;
 
   /// The original loop.
@@ -127,10 +140,21 @@ private:
   unsigned VF;
 
   // The builder that we use
-  IRBuilder<> Builder;
+  IRBuilder<> *Builder;
 
   // --- Vectorization state ---
 
+  /// Middle Block between the vector and the scalar.
+  BasicBlock *LoopMiddleBlock;
+  ///The ExitBlock of the scalar loop.
+  BasicBlock *LoopExitBlock;
+  ///The vector loop body.
+  BasicBlock *LoopVectorBody;
+  ///The scalar loop body.
+  BasicBlock *LoopScalarBody;
+  ///The first bypass block.
+  BasicBlock *LoopBypassBlock;
+
   /// The new Induction variable which was added to the new block.
   PHINode *Induction;
   /// The induction variable of the old basic block.
@@ -146,7 +170,23 @@ private:
 class LoopVectorizationLegality {
 public:
   LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl):
-  TheLoop(Lp), SE(Se), DL(Dl) { }
+  TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { }
+
+  /// This represents the kinds of reductions that we support.
+  enum ReductionKind {
+    IntegerAdd, ///< Sum of numbers.
+    IntegerMult, ///< Product of numbers.
+    NoReduction ///< Not a reduction.
+  };
+
+  // Holds a pairing of reduction instruction and the reduction kind.
+  typedef std::pair<Instruction*, ReductionKind> ReductionPair;
+
+  /// ReductionList contains the reduction variables
+  /// as well as a single EXIT (from the block) value and the kind of
+  /// reduction variable.
+  /// Notice that the EXIT instruction can also be the PHI itself.
+  typedef DenseMap<PHINode*, ReductionPair> ReductionList;
 
   /// Returns the maximum vectorization factor that we *can* use to vectorize
   /// this loop. This does not mean that it is profitable to vectorize this
@@ -154,6 +194,12 @@ public:
   /// can vectorize to any SIMD width below this number.
   unsigned getLoopMaxVF();
 
+  /// Returns the Induction variable.
+  PHINode *getInduction() {return Induction;}
+
+  /// Returns the reduction variables found in the loop.
+  ReductionList *getReductionVars() { return &Reductions; }
+
 private:
   /// Check if a single basic block loop is vectorizable.
   /// At this point we know that this is a loop with a constant trip count
@@ -164,12 +210,32 @@ private:
   // Example: Alloca, Global, NoAlias.
   bool isIdentifiedSafeObject(Value* Val);
 
+  /// Returns True, if 'Phi' is the kind of reduction variable for type
+  /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
+  bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
+  /// Checks if a constant matches the reduction kind.
+  /// Sums start at zero. Products start at one.
+  bool isReductionConstant(Value *V, ReductionKind Kind);
+  /// Returns true if the instruction I can be a reduction variable of type
+  /// 'Kind'.
+  bool isReductionInstr(Instruction *I, ReductionKind Kind);
+
   /// The loop that we evaluate.
   Loop *TheLoop;
   /// Scev analysis.
   ScalarEvolution *SE;
   /// DataLayout analysis.
   DataLayout *DL;
+
+  //  ---  vectorization state --- //
+
+  /// Holds the induction variable.
+  PHINode *Induction;
+  /// Holds the reduction variables.
+  ReductionList Reductions;
+  /// Allowed outside users. This holds the reduction
+  /// vars which can be accessed from outside the loop.
+  SmallPtrSet<Value*, 4> AllowedExit;
 };
 
 struct LoopVectorize : public LoopPass {
@@ -184,6 +250,7 @@ struct LoopVectorize : public LoopPass {
   LoopInfo *LI;
 
   virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
+
     // Only vectorize innermost loops.
     if (!L->empty())
       return false;
@@ -209,7 +276,7 @@ struct LoopVectorize : public LoopPass {
 
     // If we decided that is is *legal* to vectorizer the loop. Do it.
     SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, DefaultVectorizationFactor);
-    LB.vectorize();
+    LB.vectorize(&LVL);
 
     DEBUG(verifyFunction(*L->getHeader()->getParent()));
     return true;
@@ -218,6 +285,7 @@ struct LoopVectorize : public LoopPass {
   virtual void getAnalysisUsage(AnalysisUsage &AU) const {
     LoopPass::getAnalysisUsage(AU);
     AU.addRequiredID(LoopSimplifyID);
+    AU.addRequiredID(LCSSAID);
     AU.addRequired<LoopInfo>();
     AU.addRequired<ScalarEvolution>();
   }
@@ -237,10 +305,10 @@ Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
   Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF));
   Value *UndefVal = UndefValue::get(VTy);
   // Insert the value into a new vector.
-  Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero);
+  Value *SingleElem = Builder->CreateInsertElement(UndefVal, V, Zero);
   // Broadcast the scalar into all locations in the vector.
-  Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros,
-                                            "broadcast");
+  Value *Shuf = Builder->CreateShuffleVector(SingleElem, UndefVal, Zeros,
+                                             "broadcast");
   // We are accessing the induction variable. Make sure to promote the
   // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes.
   if (V == Induction)
@@ -265,7 +333,7 @@ Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) {
   // Add the consecutive indices to the vector value.
   Constant *Cv = ConstantVector::get(Indices);
   assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
-  return Builder.CreateAdd(Val, Cv, "induction");
+  return Builder->CreateAdd(Val, Cv, "induction");
 }
 
 
@@ -297,10 +365,11 @@ bool SingleBlockLoopVectorizer::isConsecutiveGep(GetElementPtrInst *Gep) {
 }
 
 Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
+  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
   // If we saved a vectorized copy of V, use it.
   ValueMap::iterator it = WidenMap.find(V);
   if (it != WidenMap.end())
-    return it->second;
+     return it->second;
 
   // Broadcast V and save the value for future uses.
   Value *B = getBroadcastInstrs(V);
@@ -308,6 +377,17 @@ Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
   return B;
 }
 
+Constant*
+SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
+  SmallVector<Constant*, 8> Indices;
+  // Create VF copies of the constant Val, each of type ScalarTy.
+  for (unsigned i = 0; i < VF; ++i)
+    Indices.push_back(ConstantInt::get(ScalarTy, Val));
+
+  // Pack the identical elements into a single constant vector.
+  return ConstantVector::get(Indices);
+}
+
 void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
   // Holds vector parameters or scalars, in case of uniform vals.
@@ -360,18 +440,18 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
       Value *Op = Params[op];
       // Param is a vector. Need to extract the right lane.
       if (Op->getType()->isVectorTy())
-        Op = Builder.CreateExtractElement(Op, Builder.getInt32(i));
+        Op = Builder->CreateExtractElement(Op, Builder->getInt32(i));
       Cloned->setOperand(op, Op);
     }
 
     // Place the cloned scalar in the new loop.
-    Builder.Insert(Cloned);
+    Builder->Insert(Cloned);
 
     // If the original scalar returns a value we need to place it in a vector
     // so that future users will be able to use it.
     if (!IsVoidRetTy)
-      VecResults = Builder.CreateInsertElement(VecResults, Cloned,
-                                               Builder.getInt32(i));
+      VecResults = Builder->CreateInsertElement(VecResults, Cloned,
+                                               Builder->getInt32(i));
   }
 
   if (!IsVoidRetTy)
@@ -417,16 +497,15 @@ void SingleBlockLoopVectorizer::createEmptyLoop() {
   assert(BypassBlock && "Invalid loop structure");
 
   BasicBlock *VectorPH =
-    BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
+      BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
   BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(),
-                                                  "vector.body");
+                                                 "vector.body");
 
   BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(),
-                                                     "middle.block");
+                                                  "middle.block");
   BasicBlock *ScalarPH =
-    MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(),
-                                 "scalar.preheader");
-
+          MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(),
+                                       "scalar.preheader");
   // Find the induction variable.
   BasicBlock *OldBasicBlock = Orig->getHeader();
   OldInduction = dyn_cast<PHINode>(OldBasicBlock->begin());
@@ -435,10 +514,11 @@ void SingleBlockLoopVectorizer::createEmptyLoop() {
 
   // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
   // inside the loop.
-  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
+  Builder = new IRBuilder<>(VecBody);
+  Builder->SetInsertPoint(VecBody->getFirstInsertionPt());
 
   // Generate the induction variable.
-  Induction = Builder.CreatePHI(IdxTy, 2, "index");
+  Induction = Builder->CreatePHI(IdxTy, 2, "index");
   Constant *Zero = ConstantInt::get(IdxTy, 0);
   Constant *Step = ConstantInt::get(IdxTy, VF);
 
@@ -489,12 +569,12 @@ void SingleBlockLoopVectorizer::createEmptyLoop() {
   MiddleBlock->getTerminator()->eraseFromParent();
 
   // Create i+1 and fill the PHINode.
-  Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
+  Value *NextIdx = Builder->CreateAdd(Induction, Step, "index.next");
   Induction->addIncoming(Zero, VectorPH);
   Induction->addIncoming(NextIdx, VecBody);
   // Create the compare.
-  Value *ICmp = Builder.CreateICmpEQ(NextIdx, CountRoundDown);
-  Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);
+  Value *ICmp = Builder->CreateICmpEQ(NextIdx, CountRoundDown);
+  Builder->CreateCondBr(ICmp, MiddleBlock, VecBody);
 
   // Now we have two terminators. Remove the old one from the block.
   VecBody->getTerminator()->eraseFromParent();
@@ -504,7 +584,7 @@ void SingleBlockLoopVectorizer::createEmptyLoop() {
   OldInduction->setIncomingValue(BlockIdx, CountRoundDown);
 
   // Get ready to start creating new instructions into the vectorized body.
-  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
+  Builder->SetInsertPoint(VecBody->getFirstInsertionPt());
 
   // Register the new loop.
   Loop* Lp = new Loop();
@@ -518,22 +598,52 @@ void SingleBlockLoopVectorizer::createEmptyLoop() {
     ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
     ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
   }
+
+  // Save the state.
+  LoopMiddleBlock = MiddleBlock;
+  LoopExitBlock = ExitBlock;
+  LoopVectorBody = VecBody;
+  LoopScalarBody = OldBasicBlock;
+  LoopBypassBlock = BypassBlock;
 }
 
-void SingleBlockLoopVectorizer::vectorizeLoop() {
+void
+SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
+  typedef SmallVector<PHINode*, 4> PhiVector;
   BasicBlock &BB = *Orig->getHeader();
 
+  // In order to support reduction variables we need to be able to vectorize
+  // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
+  // stages. First, we create a new vector PHI node with no incoming edges.
+  // We use this value when we vectorize all of the instructions that use the
+  // PHI. Next, after all of the instructions in the block are complete we
+  // add the new incoming edges to the PHI. At this point all of the
+  // instructions in the basic block are vectorized, so we can use them to
+  // construct the PHI.
+  PhiVector PHIsToFix;
+
   // For each instruction in the old loop.
   for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
     Instruction *Inst = it;
 
     switch (Inst->getOpcode()) {
-      case Instruction::PHI:
       case Instruction::Br:
         // Nothing to do for PHIs and BR, since we already took care of the
         // loop control flow instructions.
         continue;
-
+      case Instruction::PHI:{
+        PHINode* P = cast<PHINode>(Inst);
+        // Special handling for the induction var.
+        if (OldInduction == Inst)
+          continue;
+        // This is phase I of vectorizing PHIs.
+        // This has to be a reduction variable.
+        assert(Legal->getReductionVars()->count(P) && "Not a Reduction");
+        Type *VecTy = VectorType::get(Inst->getType(), VF);
+        WidenMap[Inst] = Builder->CreatePHI(VecTy, 2, "vec.phi");
+        PHIsToFix.push_back(P);
+        continue;
+      }
       case Instruction::Add:
       case Instruction::FAdd:
       case Instruction::Sub:
@@ -557,15 +667,17 @@ void SingleBlockLoopVectorizer::vectorizeLoop() {
         Value *A = getVectorValue(Inst->getOperand(0));
         Value *B = getVectorValue(Inst->getOperand(1));
         // Use this vector value for all users of the original instruction.
-        WidenMap[Inst] = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
+        WidenMap[Inst] = Builder->CreateBinOp(BinOp->getOpcode(), A, B);
         break;
       }
       case Instruction::Select: {
         // Widen selects.
+        // TODO: If the selector is loop invariant we can issue a select
+        // instruction with a scalar condition.
         Value *A = getVectorValue(Inst->getOperand(0));
         Value *B = getVectorValue(Inst->getOperand(1));
         Value *C = getVectorValue(Inst->getOperand(2));
-        WidenMap[Inst] = Builder.CreateSelect(A, B, C);
+        WidenMap[Inst] = Builder->CreateSelect(A, B, C);
         break;
       }
 
@@ -577,9 +689,9 @@ void SingleBlockLoopVectorizer::vectorizeLoop() {
         Value *A = getVectorValue(Inst->getOperand(0));
         Value *B = getVectorValue(Inst->getOperand(1));
         if (FCmp)
-          WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
+          WidenMap[Inst] = Builder->CreateFCmp(Cmp->getPredicate(), A, B);
         else
-          WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B);
+          WidenMap[Inst] = Builder->CreateICmp(Cmp->getPredicate(), A, B);
         break;
       }
 
@@ -600,10 +712,10 @@ void SingleBlockLoopVectorizer::vectorizeLoop() {
         GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
         unsigned NumOperands = Gep->getNumOperands();
         Gep2->setOperand(NumOperands - 1, Induction);
-        Ptr = Builder.Insert(Gep2);
-        Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
+        Ptr = Builder->Insert(Gep2);
+        Ptr = Builder->CreateBitCast(Ptr, StTy->getPointerTo());
         Value *Val = getVectorValue(SI->getValueOperand());
-        Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
+        Builder->CreateStore(Val, Ptr)->setAlignment(Alignment);
         break;
       }
       case Instruction::Load: {
@@ -624,9 +736,9 @@ void SingleBlockLoopVectorizer::vectorizeLoop() {
         GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
         unsigned NumOperands = Gep->getNumOperands();
         Gep2->setOperand(NumOperands - 1, Induction);
-        Ptr = Builder.Insert(Gep2);
-        Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
-        LI = Builder.CreateLoad(Ptr);
+        Ptr = Builder->Insert(Gep2);
+        Ptr = Builder->CreateBitCast(Ptr, RetTy->getPointerTo());
+        LI = Builder->CreateLoad(Ptr);
         LI->setAlignment(Alignment);
         // Use this vector value for all users of the load.
         WidenMap[Inst] = LI;
@@ -648,7 +760,7 @@ void SingleBlockLoopVectorizer::vectorizeLoop() {
         CastInst *CI = dyn_cast<CastInst>(Inst);
         Value *A = getVectorValue(Inst->getOperand(0));
         Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
-        WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy);
+        WidenMap[Inst] = Builder->CreateCast(CI->getOpcode(), A, DestTy);
         break;
       }
 
@@ -658,6 +770,102 @@ void SingleBlockLoopVectorizer::vectorizeLoop() {
         break;
     }// end of switch.
   }// end of for_each instr.
+
+  // At this point every instruction in the original loop is widened to
+  // a vector form. We are almost done. Now, we need to fix the PHI nodes
+  // that we vectorized. The PHI nodes are currently empty because we did
+  // not want to introduce cycles. Notice that the remaining PHI nodes
+  // that we need to fix are reduction variables.
+
+  // Create the 'reduced' values for each of the reduction vars.
+  // The reduced values are the vector values that we scalarize and combine
+  // after the loop is finished.
+  for (PhiVector::iterator it = PHIsToFix.begin(), e = PHIsToFix.end();
+       it != e; ++it) {
+    PHINode *RdxPhi = *it;
+    PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
+    assert(VecRdxPhi && "Unable to recover vectorized PHI");
+
+    // Find the reduction variable.
+    assert(Legal->getReductionVars()->count(RdxPhi) &&
+           "Unable to find the reduction variable");
+    LoopVectorizationLegality::ReductionPair ReductionVar =
+      (*Legal->getReductionVars())[RdxPhi];
+
+    // This is the vector-clone of the value that leaves the loop.
+    Value *VectorExit = getVectorValue(ReductionVar.first);
+    Type *VecTy = VectorExit->getType();
+
+    // This is the kind of reduction.
+    LoopVectorizationLegality::ReductionKind RdxKind = ReductionVar.second;
+    // Find the reduction identity variable.
+    // Zero for addition. One for Multiplication.
+    unsigned IdentitySclr =
+      (RdxKind == LoopVectorizationLegality::IntegerAdd ? 0 : 1);
+    Constant *Identity = getUniformVector(IdentitySclr, VecTy->getScalarType());
+
+    // Fix the vector-loop phi.
+    // We created the induction variable so we know that the
+    // preheader is the first entry.
+    BasicBlock *VecPreheader = Induction->getIncomingBlock(0);
+    VecRdxPhi->addIncoming(Identity, VecPreheader);
+    unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
+    Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx));
+    VecRdxPhi->addIncoming(Val, LoopVectorBody);
+
+    // Before each round, move the insertion point right between
+    // the PHIs and the values we are going to write.
+    // This allows us to write both PHINodes and the extractelement
+    // instructions.
+    Builder->SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
+
+    // This PHINode contains the vectorized reduction variable, or
+    // the identity vector, if we bypass the vector loop.
+    PHINode *NewPhi = Builder->CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
+    NewPhi->addIncoming(Identity, LoopBypassBlock);
+    NewPhi->addIncoming(getVectorValue(ReductionVar.first), LoopVectorBody);
+
+    // Extract the first scalar.
+    Value *Scalar0 =
+      Builder->CreateExtractElement(NewPhi, Builder->getInt32(0));
+    // Extract and combine (add or multiply) the remaining vector elements.
+    for (unsigned i=1; i < VF; ++i) {
+      Value *Scalar1 =
+        Builder->CreateExtractElement(NewPhi, Builder->getInt32(i));
+      if (RdxKind == LoopVectorizationLegality::IntegerAdd) {
+        Scalar0 = Builder->CreateAdd(Scalar0, Scalar1);
+      } else {
+        Scalar0 = Builder->CreateMul(Scalar0, Scalar1);
+      }
+    }
+
+    // Now, we need to fix the users of the reduction variable
+    // inside and outside of the scalar remainder loop.
+    // We know that the loop is in LCSSA form. We need to update the
+    // PHI nodes in the exit blocks.
+    for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
+         LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
+      PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
+      if (!LCSSAPhi) continue;
+
+      // Each LCSSA PHI has one incoming edge, or two if we already fixed it.
+      assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
+
+      // Found the exit PHI of this reduction; add the middle-block edge.
+      if (LCSSAPhi->getIncomingValue(0) == ReductionVar.first) {
+        // Add an edge carrying the reduced value from the middle block.
+        LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock);
+        break;
+      }
+    }// end of the LCSSA phi scan.
+
+    // Fix the scalar loop reduction variable with the incoming reduction sum
+    // from the vector body and from the backedge value.
+    int SelfEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
+    int IncomingEdgeBlockIdx = (SelfEdgeBlockIdx ? 0 : 1); // The other block.
+    (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, Scalar0);
+    (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, ReductionVar.first);
+  }// end of for each redux variable.
 }
 
 void SingleBlockLoopVectorizer::cleanup() {
@@ -710,31 +918,35 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
   ValueVector Reads;
   ValueVector Writes;
 
-  SmallPtrSet<Value*, 16> AnalyzedPtrs;
-  unsigned NumPhis = 0;
   for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
     Instruction *I = it;
 
     PHINode *Phi = dyn_cast<PHINode>(I);
     if (Phi) {
-      NumPhis++;
+      // This should not happen because the loop should be normalized.
+      if (Phi->getNumIncomingValues() != 2) {
+        DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
+        return false;
+      }
       // We only look at integer phi nodes.
       if (!Phi->getType()->isIntegerTy()) {
         DEBUG(dbgs() << "LV: Found an non-int PHI.\n");
         return false;
       }
-
-      // If we found an induction variable.
-      if (NumPhis > 1) {
-        DEBUG(dbgs() << "LV: Found more than one PHI.\n");
-        return false;
+      if (AddReductionVar(Phi, IntegerAdd)) {
+        DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
+        continue;
       }
-
-      // This should not happen because the loop should be normalized.
-      if (Phi->getNumIncomingValues() != 2) {
-        DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
+      if (AddReductionVar(Phi, IntegerMult)) {
+        DEBUG(dbgs() << "LV: Found an Mult reduction PHI."<< *Phi <<"\n");
+        continue;
+      }
+      if (Induction) {
+        DEBUG(dbgs() << "LV: Found too many PHIs.\n");
         return false;
       }
+      // Found the induction variable.
+      Induction = Phi;
 
       // Check that the PHI is consecutive and starts at zero.
       const SCEV *PhiScev = SE->getSCEV(Phi);
@@ -751,7 +963,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
         DEBUG(dbgs() << "LV: PHI does not start at zero or steps by one.\n");
         return false;
       }
-    }
+    }// end of PHI handling
 
     // If this is a load, record its pointer. If it is not a load, abort.
     // Notice that we don't handle function calls that read or write.
@@ -764,8 +976,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
       }
 
       Value* Ptr = Ld->getPointerOperand();
-      if (AnalyzedPtrs.insert(Ptr))
-        GetUnderlyingObjects(Ptr, Reads, DL);
+      GetUnderlyingObjects(Ptr, Reads, DL);
     }
 
     // Record store pointers. Abort on all other instructions that write to
@@ -779,8 +990,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
       }
 
       Value* Ptr = St->getPointerOperand();
-      if (AnalyzedPtrs.insert(Ptr))
-        GetUnderlyingObjects(St->getPointerOperand(), Writes, DL);
+      GetUnderlyingObjects(Ptr, Writes, DL);
     }
 
     // We still don't handle functions.
@@ -797,21 +1007,26 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
       DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
       return false;
     }
-    //Check that all of the users of the loop are inside the BB.
-    for (Value::use_iterator it = I->use_begin(), e = I->use_end();
-         it != e; ++it) {
-      Instruction *U = cast<Instruction>(*it);
-      BasicBlock *Parent = U->getParent();
-      if (Parent != &BB) {
-        DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
-        return false;
-      }
+
+    // Reduction instructions are allowed to have exit users.
+    // All other instructions must not have external users.
+    if (!AllowedExit.count(I))
+      //Check that all of the users of the loop are inside the BB.
+      for (Value::use_iterator it = I->use_begin(), e = I->use_end();
+           it != e; ++it) {
+        Instruction *U = cast<Instruction>(*it);
+        // This user may be a reduction exit value.
+        BasicBlock *Parent = U->getParent();
+        if (Parent != &BB) {
+          DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
+          return false;
+        }
     }
   } // next instr.
 
-  if (NumPhis != 1) {
-    DEBUG(dbgs() << "LV: Did not find a Phi node.\n");
-    return false;
+  if (!Induction) {
+      DEBUG(dbgs() << "LV: Did not find an induction var.\n");
+      return false;
   }
 
   // Check that the underlying objects of the reads and writes are either
@@ -866,6 +1081,110 @@ bool LoopVectorizationLegality::isIdentifiedSafeObject(Value* Val) {
   return A->hasNoAliasAttr();
 }
 
+/// Try to recognize Phi as the head of a reduction of kind Kind. On success
+/// the exit instruction is recorded in AllowedExit and Reductions.
+bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
+                                                    ReductionKind Kind) {
+  if (Phi->getNumIncomingValues() != 2)
+    return false;
+
+  // Find the possible incoming reduction variable.
+  BasicBlock *BB = Phi->getParent();
+  int SelfEdgeIdx = Phi->getBasicBlockIndex(BB);
+  int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry.
+  Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx);
+
+  // We must have a constant that starts the reduction.
+  if (!isReductionConstant(RdxStart, Kind))
+    return false;
+
+  // ExitInstruction is the single value which is used outside the loop.
+  // We only allow for a single reduction value to be used outside the loop.
+  Instruction *ExitInstruction = 0;
+
+  // Iter walks the use chain, starting at the PHI node. All users must be
+  // instructions which can be used as reduction variables (such as ADD). The
+  // chain may have a single out-of-block user, each link may have only one
+  // block-local user, and the cycle must end with the original PHI.
+  Instruction *Iter = Phi;
+  while (true) {
+    // Any reduction instr must be of one of the allowed kinds.
+    if (!isReductionInstr(Iter, Kind))
+      return false;
+    // An instruction with no users is a broken chain. Without this check the
+    // scan below would never advance Iter and this loop would not terminate.
+    if (Iter->use_empty())
+      return false;
+
+    bool FoundInBlockUser = false; // Did we find a user inside this block?
+    bool FoundStartPHI = false;    // Did we reach the initial PHI node?
+    for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
+         it != e; ++it) {
+      Instruction *U = cast<Instruction>(*it);
+      // We already know that the PHI is a user.
+      if (U == Phi) {
+        FoundStartPHI = true;
+        continue;
+      }
+      // Check if we found the exit user.
+      BasicBlock *Parent = U->getParent();
+      if (Parent != BB) {
+        // We must have a single exit instruction.
+        if (ExitInstruction != 0)
+          return false;
+        ExitInstruction = Iter;
+      }
+      // We can't have multiple inside users.
+      if (FoundInBlockUser)
+        return false;
+      FoundInBlockUser = true;
+      Iter = U;
+    }
+
+    // We reached the original phi node. This is a reduction only if some
+    // instruction has out-of-loop users; otherwise the cycle is dead code and
+    // we must bail out instead of re-scanning the same instruction forever.
+    if (FoundStartPHI) {
+      if (!ExitInstruction)
+        return false;
+      AllowedExit.insert(ExitInstruction);
+      Reductions[Phi] = std::make_pair(ExitInstruction, Kind);
+      return true;
+    }
+  }
+}
+
+/// True iff V is the constant identity for Kind: 0 for IntegerAdd, 1 for
+/// IntegerMult.
+bool LoopVectorizationLegality::isReductionConstant(Value *V,
+                                                    ReductionKind Kind) {
+  ConstantInt *Start = dyn_cast<ConstantInt>(V);
+  if (!Start)
+    return false;
+  if (Kind == IntegerAdd)
+    return Start->isZero();
+  return Kind == IntegerMult && Start->isOne();
+}
+
+/// Returns true if I may appear on the use chain of a reduction of kind Kind.
+bool LoopVectorizationLegality::isReductionInstr(Instruction *I,
+                                                 ReductionKind Kind) {
+  switch (I->getOpcode()) {
+  default:
+    return false;
+  case Instruction::PHI:
+    // The walk in AddReductionVar starts at the PHI itself.
+    return true;
+  case Instruction::Add:
+  case Instruction::Sub:
+    // NOTE(review): Sub is only sound for s - x, not x - s; not checked here.
+    return Kind == IntegerAdd;
+  case Instruction::Mul:
+    // UDiv/SDiv are rejected: lane partials cannot be recombined with Mul.
+    return Kind == IntegerMult;
+  }
+}
+
 } // namespace
 
 char LoopVectorize::ID = 0;
@@ -880,6 +1199,5 @@ namespace llvm {
   Pass *createLoopVectorizePass() {
     return new LoopVectorize();
   }
-
 }
 
-- 
cgit v1.1


From 4ade92b6145a0d298b3c5ae1af79aca2598a327d Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Fri, 19 Oct 2012 23:27:19 +0000
Subject: LoopVectorize: Keep the IRBuilder on the stack.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166354 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 75 ++++++++++++++----------------
 1 file changed, 35 insertions(+), 40 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 309b60f..c0b709a 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -71,11 +71,7 @@ public:
   SingleBlockLoopVectorizer(Loop *OrigLoop, ScalarEvolution *Se, LoopInfo *Li,
                             LPPassManager *Lpm, unsigned VecWidth):
   Orig(OrigLoop), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth),
-   Builder(0), Induction(0), OldInduction(0) { }
-
-  ~SingleBlockLoopVectorizer() {
-    delete Builder;
-  }
+  Builder(Se->getContext()), Induction(0), OldInduction(0) { }
 
   // Perform the actual loop widening (vectorization).
   void vectorize(LoopVectorizationLegality *Legal) {
@@ -140,7 +136,7 @@ private:
   unsigned VF;
 
   // The builder that we use
-  IRBuilder<> *Builder;
+  IRBuilder<> Builder;
 
   // --- Vectorization state ---
 
@@ -305,9 +301,9 @@ Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
   Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF));
   Value *UndefVal = UndefValue::get(VTy);
   // Insert the value into a new vector.
-  Value *SingleElem = Builder->CreateInsertElement(UndefVal, V, Zero);
+  Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero);
   // Broadcast the scalar into all locations in the vector.
-  Value *Shuf = Builder->CreateShuffleVector(SingleElem, UndefVal, Zeros,
+  Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros,
                                              "broadcast");
   // We are accessing the induction variable. Make sure to promote the
   // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes.
@@ -333,7 +329,7 @@ Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) {
   // Add the consecutive indices to the vector value.
   Constant *Cv = ConstantVector::get(Indices);
   assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
-  return Builder->CreateAdd(Val, Cv, "induction");
+  return Builder.CreateAdd(Val, Cv, "induction");
 }
 
 
@@ -440,18 +436,18 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
       Value *Op = Params[op];
       // Param is a vector. Need to extract the right lane.
       if (Op->getType()->isVectorTy())
-        Op = Builder->CreateExtractElement(Op, Builder->getInt32(i));
+        Op = Builder.CreateExtractElement(Op, Builder.getInt32(i));
       Cloned->setOperand(op, Op);
     }
 
     // Place the cloned scalar in the new loop.
-    Builder->Insert(Cloned);
+    Builder.Insert(Cloned);
 
     // If the original scalar returns a value we need to place it in a vector
     // so that future users will be able to use it.
     if (!IsVoidRetTy)
-      VecResults = Builder->CreateInsertElement(VecResults, Cloned,
-                                               Builder->getInt32(i));
+      VecResults = Builder.CreateInsertElement(VecResults, Cloned,
+                                               Builder.getInt32(i));
   }
 
   if (!IsVoidRetTy)
@@ -504,8 +500,8 @@ void SingleBlockLoopVectorizer::createEmptyLoop() {
   BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(),
                                                   "middle.block");
   BasicBlock *ScalarPH =
-          MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(),
-                                       "scalar.preheader");
+    MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(),
+                                 "scalar.preheader");
   // Find the induction variable.
   BasicBlock *OldBasicBlock = Orig->getHeader();
   OldInduction = dyn_cast<PHINode>(OldBasicBlock->begin());
@@ -514,11 +510,10 @@ void SingleBlockLoopVectorizer::createEmptyLoop() {
 
   // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
   // inside the loop.
-  Builder = new IRBuilder<>(VecBody);
-  Builder->SetInsertPoint(VecBody->getFirstInsertionPt());
+  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
 
   // Generate the induction variable.
-  Induction = Builder->CreatePHI(IdxTy, 2, "index");
+  Induction = Builder.CreatePHI(IdxTy, 2, "index");
   Constant *Zero = ConstantInt::get(IdxTy, 0);
   Constant *Step = ConstantInt::get(IdxTy, VF);
 
@@ -569,12 +564,12 @@ void SingleBlockLoopVectorizer::createEmptyLoop() {
   MiddleBlock->getTerminator()->eraseFromParent();
 
   // Create i+1 and fill the PHINode.
-  Value *NextIdx = Builder->CreateAdd(Induction, Step, "index.next");
+  Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
   Induction->addIncoming(Zero, VectorPH);
   Induction->addIncoming(NextIdx, VecBody);
   // Create the compare.
-  Value *ICmp = Builder->CreateICmpEQ(NextIdx, CountRoundDown);
-  Builder->CreateCondBr(ICmp, MiddleBlock, VecBody);
+  Value *ICmp = Builder.CreateICmpEQ(NextIdx, CountRoundDown);
+  Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);
 
   // Now we have two terminators. Remove the old one from the block.
   VecBody->getTerminator()->eraseFromParent();
@@ -584,7 +579,7 @@ void SingleBlockLoopVectorizer::createEmptyLoop() {
   OldInduction->setIncomingValue(BlockIdx, CountRoundDown);
 
   // Get ready to start creating new instructions into the vectorized body.
-  Builder->SetInsertPoint(VecBody->getFirstInsertionPt());
+  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
 
   // Register the new loop.
   Loop* Lp = new Loop();
@@ -640,7 +635,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         // This has to be a reduction variable.
         assert(Legal->getReductionVars()->count(P) && "Not a Reduction");
         Type *VecTy = VectorType::get(Inst->getType(), VF);
-        WidenMap[Inst] = Builder->CreatePHI(VecTy, 2, "vec.phi");
+        WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi");
         PHIsToFix.push_back(P);
         continue;
       }
@@ -667,7 +662,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         Value *A = getVectorValue(Inst->getOperand(0));
         Value *B = getVectorValue(Inst->getOperand(1));
         // Use this vector value for all users of the original instruction.
-        WidenMap[Inst] = Builder->CreateBinOp(BinOp->getOpcode(), A, B);
+        WidenMap[Inst] = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
         break;
       }
       case Instruction::Select: {
@@ -677,7 +672,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         Value *A = getVectorValue(Inst->getOperand(0));
         Value *B = getVectorValue(Inst->getOperand(1));
         Value *C = getVectorValue(Inst->getOperand(2));
-        WidenMap[Inst] = Builder->CreateSelect(A, B, C);
+        WidenMap[Inst] = Builder.CreateSelect(A, B, C);
         break;
       }
 
@@ -689,9 +684,9 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         Value *A = getVectorValue(Inst->getOperand(0));
         Value *B = getVectorValue(Inst->getOperand(1));
         if (FCmp)
-          WidenMap[Inst] = Builder->CreateFCmp(Cmp->getPredicate(), A, B);
+          WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
         else
-          WidenMap[Inst] = Builder->CreateICmp(Cmp->getPredicate(), A, B);
+          WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B);
         break;
       }
 
@@ -712,10 +707,10 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
         unsigned NumOperands = Gep->getNumOperands();
         Gep2->setOperand(NumOperands - 1, Induction);
-        Ptr = Builder->Insert(Gep2);
-        Ptr = Builder->CreateBitCast(Ptr, StTy->getPointerTo());
+        Ptr = Builder.Insert(Gep2);
+        Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
         Value *Val = getVectorValue(SI->getValueOperand());
-        Builder->CreateStore(Val, Ptr)->setAlignment(Alignment);
+        Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
         break;
       }
       case Instruction::Load: {
@@ -736,9 +731,9 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
         unsigned NumOperands = Gep->getNumOperands();
         Gep2->setOperand(NumOperands - 1, Induction);
-        Ptr = Builder->Insert(Gep2);
-        Ptr = Builder->CreateBitCast(Ptr, RetTy->getPointerTo());
-        LI = Builder->CreateLoad(Ptr);
+        Ptr = Builder.Insert(Gep2);
+        Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
+        LI = Builder.CreateLoad(Ptr);
         LI->setAlignment(Alignment);
         // Use this vector value for all users of the load.
         WidenMap[Inst] = LI;
@@ -760,7 +755,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         CastInst *CI = dyn_cast<CastInst>(Inst);
         Value *A = getVectorValue(Inst->getOperand(0));
         Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
-        WidenMap[Inst] = Builder->CreateCast(CI->getOpcode(), A, DestTy);
+        WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy);
         break;
       }
 
@@ -817,25 +812,25 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     // the PHIs and the values we are going to write.
     // This allows us to write both PHINodes and the extractelement
     // instructions.
-    Builder->SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
+    Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
 
     // This PHINode contains the vectorized reduction variable, or
     // the identity vector, if we bypass the vector loop.
-    PHINode *NewPhi = Builder->CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
+    PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
     NewPhi->addIncoming(Identity, LoopBypassBlock);
     NewPhi->addIncoming(getVectorValue(ReductionVar.first), LoopVectorBody);
 
     // Extract the first scalar.
     Value *Scalar0 =
-      Builder->CreateExtractElement(NewPhi, Builder->getInt32(0));
+      Builder.CreateExtractElement(NewPhi, Builder.getInt32(0));
     // Extract and sum the remaining vector elements.
     for (unsigned i=1; i < VF; ++i) {
       Value *Scalar1 =
-        Builder->CreateExtractElement(NewPhi, Builder->getInt32(i));
+        Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
       if (RdxKind == LoopVectorizationLegality::IntegerAdd) {
-        Scalar0 = Builder->CreateAdd(Scalar0, Scalar1);
+        Scalar0 = Builder.CreateAdd(Scalar0, Scalar1);
       } else {
-        Scalar0 = Builder->CreateMul(Scalar0, Scalar1);
+        Scalar0 = Builder.CreateMul(Scalar0, Scalar1);
       }
     }
 
-- 
cgit v1.1


From e6748f91eaa69b91cc5c4081a45e013a251c2726 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sat, 20 Oct 2012 04:59:06 +0000
Subject: Vectorizer: refactor the memory checks to a new function. No
 functionality change.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166366 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 84 ++++++++++++++++++------------
 1 file changed, 51 insertions(+), 33 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index c0b709a..7866fcf 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -202,6 +202,12 @@ private:
   /// and we only need to check individual instructions.
   bool canVectorizeBlock(BasicBlock &BB);
 
+  /// When we vectorize loops we may change the order in which
+  /// we read and write from memory. This method checks if it is
+  /// legal to vectorize the code, considering only memory constrains.
+  /// Returns true if BB is vectorizable
+  bool canVectorizeMemory(BasicBlock &BB;)
+
   // Check if a pointer value is known to be disjoint.
   // Example: Alloca, Global, NoAlias.
   bool isIdentifiedSafeObject(Value* Val);
@@ -908,11 +914,7 @@ unsigned LoopVectorizationLegality::getLoopMaxVF() {
 }
 
 bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
-  // Holds the read and write pointers that we find.
-  typedef SmallVector<Value*, 10> ValueVector;
-  ValueVector Reads;
-  ValueVector Writes;
-
+  // Scan the instructions in the block and look for hazards.
   for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
     Instruction *I = it;
 
@@ -960,34 +962,6 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
       }
     }// end of PHI handling
 
-    // If this is a load, record its pointer. If it is not a load, abort.
-    // Notice that we don't handle function calls that read or write.
-    if (I->mayReadFromMemory()) {
-      LoadInst *Ld = dyn_cast<LoadInst>(I);
-      if (!Ld) return false;
-      if (!Ld->isSimple()) {
-        DEBUG(dbgs() << "LV: Found a non-simple load.\n");
-        return false;
-      }
-
-      Value* Ptr = Ld->getPointerOperand();
-      GetUnderlyingObjects(Ptr, Reads, DL);
-    }
-
-    // Record store pointers. Abort on all other instructions that write to
-    // memory.
-    if (I->mayWriteToMemory()) {
-      StoreInst *St = dyn_cast<StoreInst>(I);
-      if (!St) return false;
-      if (!St->isSimple()) {
-        DEBUG(dbgs() << "LV: Found a non-simple store.\n");
-        return false;
-      }
-
-      Value* Ptr = St->getPointerOperand();
-      GetUnderlyingObjects(Ptr, Writes, DL);
-    }
-
     // We still don't handle functions.
     CallInst *CI = dyn_cast<CallInst>(I);
     if (CI) {
@@ -1024,6 +998,50 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
       return false;
   }
 
+  // If the memory dependencies do not prevent us from
+  // vectorizing, then vectorize.
+  return canVectorizeMemory(BB);
+}
+
+bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
+  // Holds the read and write pointers that we find.
+  typedef SmallVector<Value*, 10> ValueVector;
+  ValueVector Reads;
+  ValueVector Writes;
+
+  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
+    Instruction *I = it;
+
+    // If this is a load, record its pointer. If it is not a load, abort.
+    // Notice that we don't handle function calls that read or write.
+    if (I->mayReadFromMemory()) {
+      LoadInst *Ld = dyn_cast<LoadInst>(I);
+      if (!Ld) return false;
+      if (!Ld->isSimple()) {
+        DEBUG(dbgs() << "LV: Found a non-simple load.\n");
+        return false;
+      }
+
+      Value* Ptr = Ld->getPointerOperand();
+      GetUnderlyingObjects(Ptr, Reads, DL);
+    }
+
+    // Record store pointers. Abort on all other instructions that write to
+    // memory.
+    if (I->mayWriteToMemory()) {
+      StoreInst *St = dyn_cast<StoreInst>(I);
+      if (!St) return false;
+      if (!St->isSimple()) {
+        DEBUG(dbgs() << "LV: Found a non-simple store.\n");
+        return false;
+      }
+
+      Value* Ptr = St->getPointerOperand();
+      GetUnderlyingObjects(Ptr, Writes, DL);
+    }
+  } // next instr.
+
+
   // Check that the underlying objects of the reads and writes are either
   // disjoint memory locations, or that they are no-alias arguments.
   ValueVector::iterator r, re, w, we;
-- 
cgit v1.1


From 71a148223907504c78f90f835131d5e8921011ad Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sat, 20 Oct 2012 05:03:27 +0000
Subject: Fix a typo

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166367 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7866fcf..968d471 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -206,7 +206,7 @@ private:
   /// we read and write from memory. This method checks if it is
   /// legal to vectorize the code, considering only memory constrains.
   /// Returns true if BB is vectorizable
-  bool canVectorizeMemory(BasicBlock &BB;)
+  bool canVectorizeMemory(BasicBlock &BB);
 
   // Check if a pointer value is known to be disjoint.
   // Example: Alloca, Global, NoAlias.
-- 
cgit v1.1


From bf8772ed2cc89a495e2692919331d7a03e76d791 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sat, 20 Oct 2012 08:26:33 +0000
Subject: Vectorize: teach canVectorizeMemory to distinguish between A[i]+=x
 and A[B[i]]+=x. If the pointer is consecutive then it is safe to read and
 write. If the pointer is non-loop-consecutive then it is unsafe to vectorize
 it because we may hit an ordering issue.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166371 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 211 +++++++++++++++++++----------
 1 file changed, 137 insertions(+), 74 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 968d471..c11c66f 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -76,7 +76,7 @@ public:
   // Perform the actual loop widening (vectorization).
   void vectorize(LoopVectorizationLegality *Legal) {
     ///Create a new empty loop. Unlink the old loop and connect the new one.
-    createEmptyLoop();
+    createEmptyLoop(Legal);
     /// Widen each instruction in the old loop to a new one in the new loop.
     /// Use the Legality module to find the induction and reduction variables.
    vectorizeLoop(Legal);
@@ -86,7 +86,7 @@ public:
 
 private:
   /// Create an empty loop, based on the loop ranges of the old loop.
-  void createEmptyLoop();
+  void createEmptyLoop(LoopVectorizationLegality *Legal);
   /// Copy and widen the instructions from the old loop.
   void vectorizeLoop(LoopVectorizationLegality *Legal);
   /// Insert the new loop to the loop hierarchy and pass manager.
@@ -107,10 +107,6 @@ private:
   /// for each element in the vector. Starting from zero.
   Value *getConsecutiveVector(Value* Val);
 
-  /// Check that the GEP operands are all uniform except for the last index
-  /// which has to be the induction variable.
-  bool isConsecutiveGep(GetElementPtrInst *Gep);
-
   /// When we go over instructions in the basic block we rely on previous
   /// values within the current basic block or on loop invariant values.
   /// When we widen (vectorize) values we place them in the map. If the values
@@ -196,6 +192,10 @@ public:
   /// Returns the reduction variables found in the loop.
   ReductionList *getReductionVars() { return &Reductions; }
 
+  /// Check that the GEP operands are all uniform except for the last index
+  /// which has to be the induction variable.
+  bool isConsecutiveGep(Value *Ptr);
+
 private:
   /// Check if a single basic block loop is vectorizable.
   /// At this point we know that this is a loop with a constant trip count
@@ -221,6 +221,8 @@ private:
   /// Returns true if the instruction I can be a reduction variable of type
   /// 'Kind'.
   bool isReductionInstr(Instruction *I, ReductionKind Kind);
+  /// Returns True, if 'Phi' is an induction variable.
+  bool isInductionVariable(PHINode *Phi);
 
   /// The loop that we evaluate.
   Loop *TheLoop;
@@ -338,8 +340,8 @@ Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) {
   return Builder.CreateAdd(Val, Cv, "induction");
 }
 
-
-bool SingleBlockLoopVectorizer::isConsecutiveGep(GetElementPtrInst *Gep) {
+bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) {
+  GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
   if (!Gep)
     return false;
 
@@ -348,7 +350,7 @@ bool SingleBlockLoopVectorizer::isConsecutiveGep(GetElementPtrInst *Gep) {
 
   // Check that all of the gep indices are uniform except for the last.
   for (unsigned i = 0; i < NumOperands - 1; ++i)
-    if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), Orig))
+    if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
       return false;
 
   // We can emit wide load/stores only of the last index is the induction
@@ -460,7 +462,7 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
     WidenMap[Instr] = VecResults;
 }
 
-void SingleBlockLoopVectorizer::createEmptyLoop() {
+void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   /*
    In this function we generate a new loop. The new loop will contain
    the vectorized instructions while the old loop will continue to run the
@@ -510,7 +512,7 @@ void SingleBlockLoopVectorizer::createEmptyLoop() {
                                  "scalar.preheader");
   // Find the induction variable.
   BasicBlock *OldBasicBlock = Orig->getHeader();
-  OldInduction = dyn_cast<PHINode>(OldBasicBlock->begin());
+  OldInduction = Legal->getInduction();
   assert(OldInduction && "We must have a single phi node.");
   Type *IdxTy = OldInduction->getType();
 
@@ -637,7 +639,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         // Special handling for the induction var.
         if (OldInduction == Inst)
           continue;
-        // This is phase I of vectorizing PHIs.
+        // This is phase one of vectorizing PHIs.
         // This has to be a reduction variable.
         assert(Legal->getReductionVars()->count(P) && "Not a Reduction");
         Type *VecTy = VectorType::get(Inst->getType(), VF);
@@ -704,7 +706,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         unsigned Alignment = SI->getAlignment();
         GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
         // This store does not use GEPs.
-        if (!isConsecutiveGep(Gep)) {
+        if (!Legal->isConsecutiveGep(Gep)) {
           scalarizeInstruction(Inst);
           break;
         }
@@ -728,7 +730,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
 
         // We don't have a gep. Scalarize the load.
-        if (!isConsecutiveGep(Gep)) {
+        if (!Legal->isConsecutiveGep(Gep)) {
           scalarizeInstruction(Inst);
           break;
         }
@@ -930,6 +932,16 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
         DEBUG(dbgs() << "LV: Found an non-int PHI.\n");
         return false;
       }
+
+      if (isInductionVariable(Phi)) {
+        if (Induction) {
+          DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
+          return false;
+        }
+        DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n");
+        Induction = Phi;
+        continue;
+      }
       if (AddReductionVar(Phi, IntegerAdd)) {
         DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
         continue;
@@ -938,28 +950,6 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
         DEBUG(dbgs() << "LV: Found an Mult reduction PHI."<< *Phi <<"\n");
         continue;
       }
-      if (Induction) {
-        DEBUG(dbgs() << "LV: Found too many PHIs.\n");
-        return false;
-      }
-      // Found the induction variable.
-      Induction = Phi;
-
-      // Check that the PHI is consecutive and starts at zero.
-      const SCEV *PhiScev = SE->getSCEV(Phi);
-      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
-      if (!AR) {
-        DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
-        return false;
-      }
-
-      const SCEV *Step = AR->getStepRecurrence(*SE);
-      const SCEV *Start = AR->getStart();
-
-      if (!Step->isOne() || !Start->isZero()) {
-        DEBUG(dbgs() << "LV: PHI does not start at zero or steps by one.\n");
-        return false;
-      }
     }// end of PHI handling
 
     // We still don't handle functions.
@@ -1004,16 +994,19 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
 }
 
 bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
-  // Holds the read and write pointers that we find.
-  typedef SmallVector<Value*, 10> ValueVector;
-  ValueVector Reads;
-  ValueVector Writes;
+  typedef SmallVector<Value*, 16> ValueVector;
+  typedef SmallPtrSet<Value*, 16> ValueSet;
+  // Holds the Load and Store *instructions*.
+  ValueVector Loads;
+  ValueVector Stores;
 
+  // Scan the BB and collect legal loads and stores.
   for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
     Instruction *I = it;
 
-    // If this is a load, record its pointer. If it is not a load, abort.
-    // Notice that we don't handle function calls that read or write.
+    // If this is a load, save it. If this instruction can read from memory
+    // but is not a load, then we quit. Notice that we don't handle function
+    // calls that read or write.
     if (I->mayReadFromMemory()) {
       LoadInst *Ld = dyn_cast<LoadInst>(I);
       if (!Ld) return false;
@@ -1021,13 +1014,11 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
         DEBUG(dbgs() << "LV: Found a non-simple load.\n");
         return false;
       }
-
-      Value* Ptr = Ld->getPointerOperand();
-      GetUnderlyingObjects(Ptr, Reads, DL);
+      Loads.push_back(Ld);
+      continue;
     }
 
-    // Record store pointers. Abort on all other instructions that write to
-    // memory.
+    // Save store instructions. Abort if other instructions write to memory.
     if (I->mayWriteToMemory()) {
       StoreInst *St = dyn_cast<StoreInst>(I);
       if (!St) return false;
@@ -1035,45 +1026,99 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
         DEBUG(dbgs() << "LV: Found a non-simple store.\n");
         return false;
       }
-
-      Value* Ptr = St->getPointerOperand();
-      GetUnderlyingObjects(Ptr, Writes, DL);
+      Stores.push_back(St);
     }
   } // next instr.
 
+  // Now we have two lists that hold the loads and the stores.
+  // Next, we find the pointers that they use.
 
-  // Check that the underlying objects of the reads and writes are either
-  // disjoint memory locations, or that they are no-alias arguments.
-  ValueVector::iterator r, re, w, we;
-  for (r = Reads.begin(), re = Reads.end(); r != re; ++r) {
-    if (!isIdentifiedSafeObject(*r)) {
-      DEBUG(dbgs() << "LV: Found a bad read Ptr: "<< **r << "\n");
-      return false;
-    }
+  // Check if we see any stores. If there are no stores, then we don't
+  // care if the pointers are *restrict*.
+  if (!Stores.size()) {
+        DEBUG(dbgs() << "LV: Found a read-only loop!\n");
+        return true;
   }
 
-  for (w = Writes.begin(), we = Writes.end(); w != we; ++w) {
-    if (!isIdentifiedSafeObject(*w)) {
-      DEBUG(dbgs() << "LV: Found a bad write Ptr: "<< **w << "\n");
-      return false;
-    }
+  // Holds the read and read-write *pointers* that we find.
+  ValueVector Reads;
+  ValueVector ReadWrites;
+
+  // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
+  // multiple times on the same object. If the ptr is accessed twice, once
+  // for read and once for write, it will only appear once (on the write
+  // list). This is okay, since we are going to check for conflicts between
+  // writes and between reads and writes, but not between reads and reads.
+  ValueSet Seen;
+
+  ValueVector::iterator I, IE;
+  for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
+    StoreInst *ST = dyn_cast<StoreInst>(*I);
+    assert(ST && "Bad StoreInst");
+    Value* Ptr = ST->getPointerOperand();
+    // If we did *not* see this pointer before, insert it to
+    // the read-write list. At this phase it is only a 'write' list.
+    if (Seen.insert(Ptr))
+      ReadWrites.push_back(Ptr);
   }
 
-  // Check that there are no multiple write locations to the same pointer.
-  SmallPtrSet<Value*, 8> WritePointerSet;
-  for (w = Writes.begin(), we = Writes.end(); w != we; ++w) {
-    if (!WritePointerSet.insert(*w)) {
-      DEBUG(dbgs() << "LV: Multiple writes to the same index :"<< **w << "\n");
-      return false;
+  for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
+    LoadInst *LD = dyn_cast<LoadInst>(*I);
+    assert(LD && "Bad LoadInst");
+    Value* Ptr = LD->getPointerOperand();
+    // If we did *not* see this pointer before, insert it to the
+    // read list. If we *did* see it before, then it is already in
+    // the read-write list. This allows us to vectorize expressions
+    // such as A[i] += x;  Because the address of A[i] is a read-write
+    // pointer. This only works if the index of A[i] is consecutive.
+    // If the address of i is unknown (for example A[B[i]]) then we may
+    // read a few words, modify, and write a few words, and some of the
+    // words may be written to the same address.
+    if (Seen.insert(Ptr) || !isConsecutiveGep(Ptr))
+      Reads.push_back(Ptr);
+  }
+
+  // Now that the pointers are in two lists (Reads and ReadWrites), we
+  // can check that there are no conflicts between each of the writes and
+  // between the writes to the reads.
+  ValueSet WriteObjects;
+  ValueVector TempObjects;
+
+  // Check that the read-writes do not conflict with other read-write
+  // pointers.
+  for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) {
+    GetUnderlyingObjects(*I, TempObjects, DL);
+    for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
+         it != e; ++it) {
+      if (!isIdentifiedSafeObject(*it)) {
+        DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
+        return false;
+      }
+      if (!WriteObjects.insert(*it)) {
+        DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
+              << **it <<"\n");
+        return false;
+      }
     }
+    TempObjects.clear();
   }
 
-  // Check that the reads and the writes are disjoint.
-  for (r = Reads.begin(), re = Reads.end(); r != re; ++r) {
-    if (WritePointerSet.count(*r)) {
-      DEBUG(dbgs() << "Vectorizer: Found a read/write ptr:"<< **r << "\n");
-      return false;
+  /// Check that the reads don't conflict with the read-writes.
+  for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) {
+    GetUnderlyingObjects(*I, TempObjects, DL);
+    for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
+         it != e; ++it) {
+      if (!isIdentifiedSafeObject(*it)) {
+        DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
+        return false;
+      }
+      if (WriteObjects.count(*it)) {
+        DEBUG(dbgs() << "LV: Found a possible read/write reorder:"
+              << **it <<"\n");
+        return false;
+      }
     }
+    TempObjects.clear();
   }
 
   // All is okay.
@@ -1198,6 +1243,24 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I,
     }
 }
 
+bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
+  // Check that the PHI is consecutive and starts at zero.
+  const SCEV *PhiScev = SE->getSCEV(Phi);
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
+  if (!AR) {
+    DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
+    return false;
+  }
+  const SCEV *Step = AR->getStepRecurrence(*SE);
+  const SCEV *Start = AR->getStart();
+
+  if (!Step->isOne() || !Start->isZero()) {
+    DEBUG(dbgs() << "LV: PHI does not start at zero or steps by one.\n");
+    return false;
+  }
+  return true;
+}
+
 } // namespace
 
 char LoopVectorize::ID = 0;
-- 
cgit v1.1


From ccaccfa8bf070b169847109513610cf41abfd1f3 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sat, 20 Oct 2012 20:45:01 +0000
Subject: Fix an infinite loop in the loop-vectorizer.

PR14134.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166379 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index c11c66f..027fe06 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1175,6 +1175,12 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
     bool FoundInBlockUser = false;
     // Did we reach the initial PHI node ?
     bool FoundStartPHI = false;
+
+    // If the instruction has no users then this is a broken
+    // chain and can't be a reduction variable.
+    if (Iter->use_begin() == Iter->use_end())
+      return false;
+
     // For each of the *users* of iter.
     for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
          it != e; ++it) {
-- 
cgit v1.1


From 5a418ba5f5a6498a25d5eacb0f876d9f358c977b Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sun, 21 Oct 2012 02:38:01 +0000
Subject: Vectorizer: fix a bug in the classification of induction/reduction
 phis.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166384 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 027fe06..76936d5 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -950,6 +950,9 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
         DEBUG(dbgs() << "LV: Found an Mult reduction PHI."<< *Phi <<"\n");
         continue;
       }
+
+      DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
+      return false;
     }// end of PHI handling
 
     // We still don't handle functions.
-- 
cgit v1.1


From f01cad69c1ab1b025c524d1ac16060b2cc0f4668 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sun, 21 Oct 2012 04:04:25 +0000
Subject: Document change. Describe the pass and some papers that inspired the
 design of the pass.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166386 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 76936d5..f32b66d 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7,10 +7,14 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This is a simple loop vectorizer. We currently only support single block
-// loops. We have a very simple and restrictive legality check: we need to read
-// and write from disjoint memory locations. We still don't have a cost model.
-// We do support integer reductions.
+// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
+// and generates target-independent LLVM-IR. Legalization of the IR is done
+// in the codegen. However, the vectorizes uses (will use) the codegen
+// interfaces to generate IR that is likely to result in an optimal binary.
+//
+// The loop vectorizer combines consecutive loop iteration into a single
+// 'wide' iteration. After this transformation the index is incremented
+// by the SIMD vector width, and not by one.
 //
 // This pass has three parts:
 // 1. The main loop pass that drives the different parts.
@@ -18,6 +22,16 @@
 //    of the vectorization.
 // 3. SingleBlockLoopVectorizer - A helper class that performs the actual
 //    widening of instructions.
+//===----------------------------------------------------------------------===//
+//
+// The reduction-variable vectorization is based on the paper:
+//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
+//
+// Variable uniformity checks are inspired by:
+// Karrenberg, R. and Hack, S. Whole Function Vectorization.
+//
+// Other ideas/concepts are from:
+//  A. Zaks and D. Nuzman. Autovectorization in GCC—two years later.
 //
 //===----------------------------------------------------------------------===//
 #define LV_NAME "loop-vectorize"
-- 
cgit v1.1


From c84787262942a400a8de6539ee79f4de72d3f872 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sun, 21 Oct 2012 05:52:51 +0000
Subject: Add support for reduction variables that do not start at zero. This
 is important for nested-loop reductions such as :

In the innermost loop, the reduction variable does not start at zero:

for (i = 0 .. n)
 for (j = 0 .. m)
  sum += ...



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166387 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 118 ++++++++++++++++-------------
 1 file changed, 67 insertions(+), 51 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index f32b66d..5a79c33 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -179,20 +179,36 @@ public:
   TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { }
 
   /// This represents the kinds of reductions that we support.
+  /// We use the enum values to hold the 'identity' value for
+  /// each operand. This value does not change the result if applied.
   enum ReductionKind {
-    IntegerAdd, /// Sum of numbers.
-    IntegerMult, /// Product of numbers.
-    NoReduction /// Not a reduction.
+    NoReduction = -1, /// Not a reduction.
+    IntegerAdd  = 0,  /// Sum of numbers.
+    IntegerMult = 1  /// Product of numbers.
   };
 
-  // Holds a pairing of reduction instruction and the reduction kind.
-  typedef std::pair<Instruction*, ReductionKind> ReductionPair;
+  /// This POD struct holds information about reduction variables.
+  struct ReductionDescriptor {
+    // Default C'tor
+    ReductionDescriptor():
+    StartValue(0), LoopExitInstr(0), Kind(NoReduction) {}
+
+    // C'tor.
+    ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K):
+    StartValue(Start), LoopExitInstr(Exit), Kind(K) {}
+
+    // The starting value of the reduction.
+    // It does not have to be zero!
+    Value *StartValue;
+    // The instruction who's value is used outside the loop.
+    Instruction *LoopExitInstr;
+    // The kind of the reduction.
+    ReductionKind Kind;
+  };
 
-  /// ReductionList contains the reduction variables
-  /// as well as a single EXIT (from the block) value and the kind of
-  /// reduction variable..
-  /// Notice that the EXIT instruction can also be the PHI itself.
-  typedef DenseMap<PHINode*, ReductionPair> ReductionList;
+  /// ReductionList contains the reduction descriptors for all
+  /// of the reductions that were found in the loop.
+  typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
 
   /// Returns the maximum vectorization factor that we *can* use to vectorize
   /// this loop. This does not mean that it is profitable to vectorize this
@@ -229,9 +245,6 @@ private:
   /// Returns True, if 'Phi' is the kind of reduction variable for type
   /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
   bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
-  /// Checks if a constant matches the reduction kind.
-  /// Sums starts with zero. Products start at one.
-  bool isReductionConstant(Value *V, ReductionKind Kind);
   /// Returns true if the instruction I can be a reduction variable of type
   /// 'Kind'.
   bool isReductionInstr(Instruction *I, ReductionKind Kind);
@@ -628,6 +641,8 @@ void
 SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
   typedef SmallVector<PHINode*, 4> PhiVector;
   BasicBlock &BB = *Orig->getHeader();
+  Constant *Zero = ConstantInt::get(
+    IntegerType::getInt32Ty(BB.getContext()), 0);
 
   // In order to support reduction variables we need to be able to vectorize
   // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
@@ -803,29 +818,42 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
     assert(RdxPhi && "Unable to recover vectorized PHI");
 
-    // Find the reduction variable.
+    // Find the reduction variable descriptor.
     assert(Legal->getReductionVars()->count(RdxPhi) &&
            "Unable to find the reduction variable");
-    LoopVectorizationLegality::ReductionPair ReductionVar =
+    LoopVectorizationLegality::ReductionDescriptor RdxDesc =
       (*Legal->getReductionVars())[RdxPhi];
 
+    // We need to generate a reduction vector from the incoming scalar.
+    // To do so, we need to generate the 'identity' vector and overide
+    // one of the elements with the incoming scalar reduction. We need
+    // to do it in the vector-loop preheader.
+    Builder.SetInsertPoint(LoopBypassBlock->getTerminator());
+
     // This is the vector-clone of the value that leaves the loop.
-    Value *VectorExit = getVectorValue(ReductionVar.first);
+    Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
     Type *VecTy = VectorExit->getType();
 
-    // This is the kind of reduction.
-    LoopVectorizationLegality::ReductionKind RdxKind = ReductionVar.second;
-    // Find the reduction identity variable.
-    // Zero for addition. One for Multiplication.
-    unsigned IdentitySclr =
-      (RdxKind == LoopVectorizationLegality::IntegerAdd ? 0 : 1);
-    Constant *Identity = getUniformVector(IdentitySclr, VecTy->getScalarType());
+    // Find the reduction identity variable. The value of the enum is the
+    // identity. Zero for addition. One for Multiplication.
+    unsigned IdentitySclr =  RdxDesc.Kind;
+    Constant *Identity = getUniformVector(IdentitySclr,
+                                          VecTy->getScalarType());
+
+    // This vector is the Identity vector where the first element is the
+    // incoming scalar reduction.
+    Value *VectorStart = Builder.CreateInsertElement(Identity,
+                                                    RdxDesc.StartValue, Zero);
+
 
     // Fix the vector-loop phi.
     // We created the induction variable so we know that the
     // preheader is the first entry.
     BasicBlock *VecPreheader = Induction->getIncomingBlock(0);
-    VecRdxPhi->addIncoming(Identity, VecPreheader);
+
+    // Reductions do not have to start at zero. They can start with
+    // any loop invariant values.
+    VecRdxPhi->addIncoming(VectorStart, VecPreheader);
     unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
     Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx));
     VecRdxPhi->addIncoming(Val, LoopVectorBody);
@@ -837,10 +865,10 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
 
     // This PHINode contains the vectorized reduction variable, or
-    // the identity vector, if we bypass the vector loop.
+    // the initial value vector, if we bypass the vector loop.
     PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
-    NewPhi->addIncoming(Identity, LoopBypassBlock);
-    NewPhi->addIncoming(getVectorValue(ReductionVar.first), LoopVectorBody);
+    NewPhi->addIncoming(VectorStart, LoopBypassBlock);
+    NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody);
 
     // Extract the first scalar.
     Value *Scalar0 =
@@ -849,7 +877,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     for (unsigned i=1; i < VF; ++i) {
       Value *Scalar1 =
         Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
-      if (RdxKind == LoopVectorizationLegality::IntegerAdd) {
+      if (RdxDesc.Kind == LoopVectorizationLegality::IntegerAdd) {
         Scalar0 = Builder.CreateAdd(Scalar0, Scalar1);
       } else {
         Scalar0 = Builder.CreateMul(Scalar0, Scalar1);
@@ -865,11 +893,13 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
       PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
       if (!LCSSAPhi) continue;
 
-      // All PHINodes need to have a single entry edge, or two if we already fixed them.
+      // All PHINodes need to have a single entry edge, or two if
+      // we already fixed them.
       assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
 
-      // We found our reduction value exit-PHI. Update it with the incoming bypass edge.
-      if (LCSSAPhi->getIncomingValue(0) == ReductionVar.first) {
+      // We found our reduction value exit-PHI. Update it with the
+      // incoming bypass edge.
+      if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
         // Add an edge coming from the bypass.
         LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock);
         break;
@@ -881,7 +911,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
     int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block.
     (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
-    (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, ReductionVar.first);
+    (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
   }// end of for each redux variable.
 }
 
@@ -1157,7 +1187,7 @@ bool LoopVectorizationLegality::isIdentifiedSafeObject(Value* Val) {
 }
 
 bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
-                                                    ReductionKind Kind) {
+                                                ReductionKind Kind) {
   if (Phi->getNumIncomingValues() != 2)
     return false;
 
@@ -1167,10 +1197,6 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
   int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry.
   Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx);
 
-  // We must have a constant that starts the reduction.
-  if (!isReductionConstant(RdxStart, Kind))
-    return false;
-
   // ExitInstruction is the single value which is used outside the loop.
   // We only allow for a single reduction value to be used outside the loop.
   // This includes users of the reduction, variables (which form a cycle
@@ -1228,26 +1254,16 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
    if (FoundStartPHI && ExitInstruction) {
      // This instruction is allowed to have out-of-loop users.
      AllowedExit.insert(ExitInstruction);
-     // Mark this as a reduction var.
-     Reductions[Phi] = std::make_pair(ExitInstruction, Kind);
+
+     // Save the description of this reduction variable.
+     ReductionDescriptor RD(RdxStart, ExitInstruction, Kind);
+     Reductions[Phi] = RD;
      return true;
    }
   }
 }
 
 bool
-LoopVectorizationLegality::isReductionConstant(Value *V, ReductionKind Kind) {
-  ConstantInt *CI = dyn_cast<ConstantInt>(V);
-  if (!CI)
-    return false;
-  if (Kind == IntegerMult && CI->isOne())
-    return true;
-  if (Kind == IntegerAdd && CI->isZero())
-    return true;
-  return false;
-}
-
-bool
 LoopVectorizationLegality::isReductionInstr(Instruction *I,
                                             ReductionKind Kind) {
     switch (I->getOpcode()) {
-- 
cgit v1.1


From bb950854acbb5966875763eaae7ab58e48e4f5a9 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sun, 21 Oct 2012 06:49:10 +0000
Subject: Fix a bug in the vectorization of wide load/store operations. We used
 a SCEV to detect that A[X] is consecutive. We assumed that X was the
 induction variable. But X can be any expression that uses the induction, for
 example: X = i + 2;

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166388 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5a79c33..1b6011b 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -740,10 +740,15 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
           break;
         }
 
+        // The last index does not have to be the induction. It can be
+        // consecutive and be a function of the index. For example A[I+1];
+        unsigned NumOperands = Gep->getNumOperands();
+        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
+        LastIndex = Builder.CreateExtractElement(LastIndex, Builder.getInt32(0));
+
         // Create the new GEP with the new induction variable.
         GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
-        unsigned NumOperands = Gep->getNumOperands();
-        Gep2->setOperand(NumOperands - 1, Induction);
+        Gep2->setOperand(NumOperands - 1, LastIndex);
         Ptr = Builder.Insert(Gep2);
         Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
         Value *Val = getVectorValue(SI->getValueOperand());
@@ -764,10 +769,15 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
           break;
         }
 
+        // The last index does not have to be the induction. It can be
+        // consecutive and be a function of the index. For example A[I+1];
+        unsigned NumOperands = Gep->getNumOperands();
+        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
+        LastIndex = Builder.CreateExtractElement(LastIndex, Builder.getInt32(0));
+
         // Create the new GEP with the new induction variable.
         GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
-        unsigned NumOperands = Gep->getNumOperands();
-        Gep2->setOperand(NumOperands - 1, Induction);
+        Gep2->setOperand(NumOperands - 1, LastIndex);
         Ptr = Builder.Insert(Gep2);
         Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
         LI = Builder.CreateLoad(Ptr);
-- 
cgit v1.1


From b75209cb66ab5d7a155af3cffe0e552e69820e2c Mon Sep 17 00:00:00 2001
From: Jakub Staszak <kubastaszak@gmail.com>
Date: Sun, 21 Oct 2012 15:29:19 +0000
Subject: Simplify code. No functionality change.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166392 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1b6011b..7efb003 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1186,14 +1186,13 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
 /// marked with the NoAlias attribute.
 bool LoopVectorizationLegality::isIdentifiedSafeObject(Value* Val) {
   assert(Val && "Invalid value");
-  if (dyn_cast<GlobalValue>(Val))
+  if (isa<GlobalValue>(Val))
     return true;
-  if (dyn_cast<AllocaInst>(Val))
+  if (isa<AllocaInst>(Val))
     return true;
-  Argument *A = dyn_cast<Argument>(Val);
-  if (!A)
-    return false;
-  return A->hasNoAliasAttr();
+  if (Argument *A = dyn_cast<Argument>(Val))
+    return A->hasNoAliasAttr();
+  return false;
 }
 
 bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
-- 
cgit v1.1


From ffe5008fd3013451fd3cfe8dd878c9a1d4abad21 Mon Sep 17 00:00:00 2001
From: Jakub Staszak <kubastaszak@gmail.com>
Date: Sun, 21 Oct 2012 15:36:03 +0000
Subject: Simplify code. No functionality change.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166393 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7efb003..dfa2909 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1230,7 +1230,7 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
 
     // If the instruction has no users then this is a broken
     // chain and can't be a reduction variable.
-    if (Iter->use_begin() == Iter->use_end())
+    if (Iter->use_empty())
       return false;
 
     // For each of the *users* of iter.
-- 
cgit v1.1


From 5bb5a75b1e47443ba9b07a53e6e967d29108bab3 Mon Sep 17 00:00:00 2001
From: Anders Carlsson <andersca@mac.com>
Date: Sun, 21 Oct 2012 16:26:35 +0000
Subject: Avoid an extra hash lookup when inserting a value into the widen map.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166395 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index dfa2909..a041169 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -398,13 +398,13 @@ bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) {
 Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
   // If we saved a vectorized copy of V, use it.
-  ValueMap::iterator it = WidenMap.find(V);
-  if (it != WidenMap.end())
-     return it->second;
+  Value *&MapEntry = WidenMap[V];
+  if (MapEntry)
+    return MapEntry;
 
   // Broadcast V and save the value for future uses.
   Value *B = getBroadcastInstrs(V);
-  WidenMap[V] = B;
+  MapEntry = B;
   return B;
 }
 
-- 
cgit v1.1


From e0fa403e23ab5610471c2dbe5c032327ee1ed091 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Mon, 22 Oct 2012 03:52:53 +0000
Subject: Update the loop vectorizer docs.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166408 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 55 +++++++++++++++++++++---------
 1 file changed, 38 insertions(+), 17 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index a041169..0fbf3e0 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -31,7 +31,7 @@
 // Karrenberg, R. and Hack, S. Whole Function Vectorization.
 //
 // Other ideas/concepts are from:
-//  A. Zaks and D. Nuzman. Autovectorization in GCC—two years later.
+//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
 //
 //===----------------------------------------------------------------------===//
 #define LV_NAME "loop-vectorize"
@@ -75,10 +75,20 @@ namespace {
 // Forward declaration.
 class LoopVectorizationLegality;
 
-/// Vectorize a simple loop. This class performs the widening of simple single
-/// basic block loops into vectors. It does not perform any
-/// vectorization-legality checks, and just does it.  It widens the vectors
-/// to a given vectorization factor (VF).
+/// SingleBlockLoopVectorizer vectorizes loops which contain only one basic
+/// block to a specified vectorization factor (VF).
+/// This class performs the widening of scalars into vectors, or multiple
+/// scalars. This class also implements the following features:
+/// * It inserts an epilogue loop for handling loops that don't have iteration
+///   counts that are known to be a multiple of the vectorization factor.
+/// * It handles the code generation for reduction variables.
+/// * Scalarization (implementation using scalars) of un-vectorizable
+///   instructions.
+/// SingleBlockLoopVectorizer does not perform any vectorization-legality
+/// checks, and relies on the caller to check for the different legality
+/// aspects. The SingleBlockLoopVectorizer relies on the
+/// LoopVectorizationLegality class to provide information about the induction
+/// and reduction variables that were found to a given vectorization factor.
 class SingleBlockLoopVectorizer {
 public:
   /// Ctor.
@@ -169,10 +179,19 @@ private:
   ValueMap WidenMap;
 };
 
-/// Perform the vectorization legality check. This class does not look at the
-/// profitability of vectorization, only the legality. At the moment the checks
-/// are very simple and focus on single basic block loops with a constant
-/// iteration count and no reductions.
+/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
+/// to what vectorization factor.
+/// This class does not look at the profitability of vectorization, only the
+/// legality. This class has two main kinds of checks:
+/// * Memory checks - The code in canVectorizeMemory checks if vectorization
+///   will change the order of memory accesses in a way that will change the
+///   correctness of the program.
+/// * Scalars checks - The code in canVectorizeBlock checks for a number
+///   of different conditions, such as the availability of a single induction
+///   variable, that all types are supported and vectorize-able, etc.
+/// This code reflects the capabilities of SingleBlockLoopVectorizer.
+/// This class is also used by SingleBlockLoopVectorizer for identifying
+/// the induction variable and the different reduction variables.
 class LoopVectorizationLegality {
 public:
   LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl):
@@ -222,8 +241,10 @@ public:
   /// Returns the reduction variables found in the loop.
   ReductionList *getReductionVars() { return &Reductions; }
 
-  /// Check that the GEP operands are all uniform except for the last index
-  /// which has to be the induction variable.
+  /// Check if the pointer returned by this GEP is consecutive
+  /// when the index is vectorized. This happens when the last
+  /// index of the GEP is consecutive, like the induction variable.
+  /// This check allows us to vectorize A[idx] into a wide load/store.
   bool isConsecutiveGep(Value *Ptr);
 
 private:
@@ -281,8 +302,7 @@ struct LoopVectorize : public LoopPass {
   LoopInfo *LI;
 
   virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
-
-    // Only vectorize innermost loops.
+    // We only vectorize innermost loops.
     if (!L->empty())
       return false;
 
@@ -297,7 +317,8 @@ struct LoopVectorize : public LoopPass {
     LoopVectorizationLegality LVL(L, SE, DL);
     unsigned MaxVF = LVL.getLoopMaxVF();
 
-    // Check that we can vectorize using the chosen vectorization width.
+    // Check that we can vectorize this loop using the chosen vectorization
+    // width.
     if (MaxVF < DefaultVectorizationFactor) {
       DEBUG(dbgs() << "LV: non-vectorizable MaxVF ("<< MaxVF << ").\n");
       return false;
@@ -305,7 +326,7 @@ struct LoopVectorize : public LoopPass {
 
     DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< MaxVF << ").\n");
 
-    // If we decided that is is *legal* to vectorizer the loop. Do it.
+    // If we decided that it is *legal* to vectorizer the loop then do it.
     SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, DefaultVectorizationFactor);
     LB.vectorize(&LVL);
 
@@ -461,7 +482,7 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
   if (!IsVoidRetTy)
     VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF));
 
-  // For each scalar that we create.
+  // For each scalar that we create:
   for (unsigned i = 0; i < VF; ++i) {
     Instruction *Cloned = Instr->clone();
     if (!IsVoidRetTy)
@@ -495,7 +516,7 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal
    the vectorized instructions while the old loop will continue to run the
    scalar remainder.
 
-   [  ] <-- vector loop bypass.
+    [ ] <-- vector loop bypass.
   /  |
  /   v
 |   [ ]     <-- vector pre header.
-- 
cgit v1.1


From 565048e78a152aa69b66422037b67c0f627bbfec Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Mon, 22 Oct 2012 04:38:00 +0000
Subject: Vectorizer: optimize the generation of selects. If the condition is
 uniform, generate a scalar-cond select (i1 as selector).

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166409 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0fbf3e0..59e8e5e 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -725,12 +725,22 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
       }
       case Instruction::Select: {
         // Widen selects.
-        // TODO: If the selector is loop invariant we can issue a select
-        // instruction with a scalar condition.
-        Value *A = getVectorValue(Inst->getOperand(0));
-        Value *B = getVectorValue(Inst->getOperand(1));
-        Value *C = getVectorValue(Inst->getOperand(2));
-        WidenMap[Inst] = Builder.CreateSelect(A, B, C);
+        // If the selector is loop invariant we can create a select
+        // instruction with a scalar condition. Otherwise, use vector-select.
+        Value *Cond = Inst->getOperand(0);
+        bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), Orig);
+
+        // The condition can be loop invariant  but still defined inside the
+        // loop. This means that we can't just use the original 'cond' value.
+        // We have to take the 'vectorized' value and pick the first lane.
+        // Instcombine will make this a no-op.
+        Cond = getVectorValue(Cond);
+        if (InvariantCond)
+          Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0));
+
+        Value *Op0 = getVectorValue(Inst->getOperand(1));
+        Value *Op1 = getVectorValue(Inst->getOperand(2));
+        WidenMap[Inst] = Builder.CreateSelect(Cond, Op0, Op1);
         break;
       }
 
-- 
cgit v1.1


From 5f7d81022398f332b222552f5d980c4e3f1c542c Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Mon, 22 Oct 2012 04:53:05 +0000
Subject: Rename a variable.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166410 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 59e8e5e..6fbf342 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -92,9 +92,9 @@ class LoopVectorizationLegality;
 class SingleBlockLoopVectorizer {
 public:
   /// Ctor.
-  SingleBlockLoopVectorizer(Loop *OrigLoop, ScalarEvolution *Se, LoopInfo *Li,
+  SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li,
                             LPPassManager *Lpm, unsigned VecWidth):
-  Orig(OrigLoop), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth),
+  OrigLoop(Orig), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth),
   Builder(Se->getContext()), Induction(0), OldInduction(0) { }
 
   // Perform the actual loop widening (vectorization).
@@ -145,7 +145,7 @@ private:
   typedef DenseMap<Value*, Value*> ValueMap;
 
   /// The original loop.
-  Loop *Orig;
+  Loop *OrigLoop;
   // Scev analysis to use.
   ScalarEvolution *SE;
   // Loop Info.
@@ -541,11 +541,11 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal
    */
 
   // This is the original scalar-loop preheader.
-  BasicBlock *BypassBlock = Orig->getLoopPreheader();
-  BasicBlock *ExitBlock = Orig->getExitBlock();
+  BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
+  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
   assert(ExitBlock && "Must have an exit block");
 
-  assert(Orig->getNumBlocks() == 1 && "Invalid loop");
+  assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop");
   assert(BypassBlock && "Invalid loop structure");
 
   BasicBlock *VectorPH =
@@ -559,7 +559,7 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal
     MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(),
                                  "scalar.preheader");
   // Find the induction variable.
-  BasicBlock *OldBasicBlock = Orig->getHeader();
+  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
   OldInduction = Legal->getInduction();
   assert(OldInduction && "We must have a single phi node.");
   Type *IdxTy = OldInduction->getType();
@@ -574,7 +574,7 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal
   Constant *Step = ConstantInt::get(IdxTy, VF);
 
   // Find the loop boundaries.
-  const SCEV *ExitCount = SE->getExitCount(Orig, Orig->getHeader());
+  const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
   assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
 
   // Get the total trip count from the count by adding 1.
@@ -639,11 +639,11 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal
 
   // Register the new loop.
   Loop* Lp = new Loop();
-  LPM->insertLoop(Lp, Orig->getParentLoop());
+  LPM->insertLoop(Lp, OrigLoop->getParentLoop());
 
   Lp->addBasicBlockToLoop(VecBody, LI->getBase());
 
-  Loop *ParentLoop = Orig->getParentLoop();
+  Loop *ParentLoop = OrigLoop->getParentLoop();
   if (ParentLoop) {
     ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
     ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
@@ -661,7 +661,7 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal
 void
 SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
   typedef SmallVector<PHINode*, 4> PhiVector;
-  BasicBlock &BB = *Orig->getHeader();
+  BasicBlock &BB = *OrigLoop->getHeader();
   Constant *Zero = ConstantInt::get(
     IntegerType::getInt32Ty(BB.getContext()), 0);
 
@@ -728,7 +728,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         // If the selector is loop invariant we can create a select
         // instruction with a scalar condition. Otherwise, use vector-select.
         Value *Cond = Inst->getOperand(0);
-        bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), Orig);
+        bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop);
 
         // The condition can be loop invariant  but still defined inside the
         // loop. This means that we can't just use the original 'cond' value.
@@ -958,7 +958,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
 
 void SingleBlockLoopVectorizer::cleanup() {
   // The original basic block.
-  SE->forgetLoop(Orig);
+  SE->forgetLoop(OrigLoop);
 }
 
 unsigned LoopVectorizationLegality::getLoopMaxVF() {
-- 
cgit v1.1


From 782090aa02484d7d1d335e3a7e209f1132aaa1a7 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Mon, 22 Oct 2012 18:27:56 +0000
Subject: Don't crash if the load/store pointer is not a GEP.

Fix by Shivarama Rao <Shivarama.Rao@amd.com>



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166427 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6fbf342..f944d9b 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -389,7 +389,7 @@ Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) {
 }
 
 bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) {
-  GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+  GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
   if (!Gep)
     return false;
 
-- 
cgit v1.1


From 6457001f31713ff26a707ddef616341052b1b296 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Tue, 23 Oct 2012 18:44:18 +0000
Subject: Use the AliasAnalysis isIdentifiedObj because it also understands
 mallocs and c++ news.

PR14158.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166491 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 21 ++-------------------
 1 file changed, 2 insertions(+), 19 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index f944d9b..bead392 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -259,10 +259,6 @@ private:
   /// Returns true if BB is vectorizable
   bool canVectorizeMemory(BasicBlock &BB);
 
-  // Check if a pointer value is known to be disjoint.
-  // Example: Alloca, Global, NoAlias.
-  bool isIdentifiedSafeObject(Value* Val);
-
   /// Returns True, if 'Phi' is the kind of reduction variable for type
   /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
   bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
@@ -1178,7 +1174,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
     GetUnderlyingObjects(*I, TempObjects, DL);
     for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
          it != e; ++it) {
-      if (!isIdentifiedSafeObject(*it)) {
+      if (!isIdentifiedObject(*it)) {
         DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
         return false;
       }
@@ -1196,7 +1192,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
     GetUnderlyingObjects(*I, TempObjects, DL);
     for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
          it != e; ++it) {
-      if (!isIdentifiedSafeObject(*it)) {
+      if (!isIdentifiedObject(*it)) {
         DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
         return false;
       }
@@ -1213,19 +1209,6 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
   return true;
 }
 
-/// Checks if the value is a Global variable or if it is an Arguments
-/// marked with the NoAlias attribute.
-bool LoopVectorizationLegality::isIdentifiedSafeObject(Value* Val) {
-  assert(Val && "Invalid value");
-  if (isa<GlobalValue>(Val))
-    return true;
-  if (isa<AllocaInst>(Val))
-    return true;
-  if (Argument *A = dyn_cast<Argument>(Val))
-    return A->hasNoAliasAttr();
-  return false;
-}
-
 bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
                                                 ReductionKind Kind) {
   if (Phi->getNumIncomingValues() != 2)
-- 
cgit v1.1


From 2f87640b86315beab8a5671cc23f524e59c58bd3 Mon Sep 17 00:00:00 2001
From: Micah Villmow <villmow@gmail.com>
Date: Wed, 24 Oct 2012 17:20:04 +0000
Subject: Delete a directory that wasn't supposed to be checked in yet.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166591 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index bead392..4cd171a 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -777,7 +777,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
         Gep2->setOperand(NumOperands - 1, LastIndex);
         Ptr = Builder.Insert(Gep2);
-        Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
+        Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo(Ptr->getType()));
         Value *Val = getVectorValue(SI->getValueOperand());
         Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
         break;
@@ -806,7 +806,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
         Gep2->setOperand(NumOperands - 1, LastIndex);
         Ptr = Builder.Insert(Gep2);
-        Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
+        Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo(Ptr->getType()));
         LI = Builder.CreateLoad(Ptr);
         LI->setAlignment(Alignment);
         // Use this vector value for all users of the load.
-- 
cgit v1.1


From b8bce928f4ffdf50eff69334f3e25b27848536b6 Mon Sep 17 00:00:00 2001
From: Micah Villmow <villmow@gmail.com>
Date: Wed, 24 Oct 2012 17:25:11 +0000
Subject: Back out r166591, not sure why this made it through since I cancelled
 the command. Bleh, sorry about this!

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166596 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4cd171a..bead392 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -777,7 +777,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
         Gep2->setOperand(NumOperands - 1, LastIndex);
         Ptr = Builder.Insert(Gep2);
-        Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo(Ptr->getType()));
+        Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
         Value *Val = getVectorValue(SI->getValueOperand());
         Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
         break;
@@ -806,7 +806,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
         Gep2->setOperand(NumOperands - 1, LastIndex);
         Ptr = Builder.Insert(Gep2);
-        Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo(Ptr->getType()));
+        Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
         LI = Builder.CreateLoad(Ptr);
         LI->setAlignment(Alignment);
         // Use this vector value for all users of the load.
-- 
cgit v1.1


From 50bec6f8c494957b00dd225ddf580d3e0b97b871 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Wed, 24 Oct 2012 20:36:32 +0000
Subject: LoopVectorizer: Add a basic cost model which uses the VTTI interface.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166620 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 303 ++++++++++++++++++++++++++---
 1 file changed, 273 insertions(+), 30 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index bead392..6f6685b 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -18,10 +18,13 @@
 //
 // This pass has three parts:
 // 1. The main loop pass that drives the different parts.
-// 2. LoopVectorizationLegality - A helper class that checks for the legality
+// 2. LoopVectorizationLegality - A unit that checks for the legality
 //    of the vectorization.
-// 3. SingleBlockLoopVectorizer - A helper class that performs the actual
+// 3. SingleBlockLoopVectorizer - A unit that performs the actual
 //    widening of instructions.
+// 4. LoopVectorizationCostModel - A unit that checks for the profitability
+//    of vectorization. It decides on the optimal vector width, which
+//    can be one, if vectorization is not profitable.
 //===----------------------------------------------------------------------===//
 //
 // The reduction-variable vectorization is based on the paper:
@@ -51,13 +54,14 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AliasSetTracker.h"
-#include "llvm/Transforms/Scalar.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/TargetTransformInfo.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
@@ -67,13 +71,14 @@
 using namespace llvm;
 
 static cl::opt<unsigned>
-DefaultVectorizationFactor("default-loop-vectorize-width",
-                          cl::init(4), cl::Hidden,
-                          cl::desc("Set the default loop vectorization width"));
+VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
+          cl::desc("Set the default vectorization width. Zero is autoselect."));
+
 namespace {
 
-// Forward declaration.
+// Forward declarations.
 class LoopVectorizationLegality;
+class LoopVectorizationCostModel;
 
 /// SingleBlockLoopVectorizer vectorizes loops which contain only one basic
 /// block to a specified vectorization factor (VF).
@@ -229,11 +234,10 @@ public:
   /// of the reductions that were found in the loop.
   typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
 
-  /// Returns the maximum vectorization factor that we *can* use to vectorize
-  /// this loop. This does not mean that it is profitable to vectorize this
-  /// loop, only that it is legal to do so. This may be a large number. We
-  /// can vectorize to any SIMD width below this number.
-  unsigned getLoopMaxVF();
+  /// Returns true if it is legal to vectorize this loop.
+  /// This does not mean that it is profitable to vectorize this
+  /// loop, only that it is legal to do so.
+  bool canVectorize();
 
   /// Returns the Induction variable.
   PHINode *getInduction() {return Induction;}
@@ -286,6 +290,49 @@ private:
   SmallPtrSet<Value*, 4> AllowedExit;
 };
 
+/// LoopVectorizationCostModel - estimates the expected speedups due to
+/// vectorization.
+/// In many cases vectorization is not profitable. This can happen because
+/// of a number of reasons. In this class we mainly attempt to predict
+/// the expected speedup/slowdowns due to the supported instruction set.
+/// We use the VectorTargetTransformInfo to query the different backends
+/// for the cost of different operations.
+class LoopVectorizationCostModel {
+public:
+  /// C'tor.
+  LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl,
+                             LoopVectorizationLegality *Leg,
+                             const VectorTargetTransformInfo *Vtti):
+  TheLoop(Lp), SE(Se), DL(Dl), Legal(Leg), VTTI(Vtti) { }
+
+  /// Returns the most profitable vectorization factor for the loop: either 1
+  /// (scalar) or a power of two that is smaller or equal to the VF argument.
+  /// Returns 1 when no vector target information (VTTI) is available.
+  unsigned findBestVectorizationFactor(unsigned VF = 4);
+
+private:
+  /// Returns the expected execution cost. The unit of the cost does
+  /// not matter because we use the 'cost' units to compare different
+  /// vector widths. The cost that is returned is *not* normalized by
+  /// the factor width.
+  unsigned expectedCost(unsigned VF);
+
+  /// Returns the execution time cost of an instruction for a given vector
+  /// width. Vector width of one means scalar.
+  unsigned getInstructionCost(Instruction *I, unsigned VF);
+
+  /// The loop that we evaluate.
+  Loop *TheLoop;
+  /// Scev analysis.
+  ScalarEvolution *SE;
+  /// DataLayout analysis.
+  DataLayout *DL;
+  /// Vectorization legality.
+  LoopVectorizationLegality *Legal;
+  /// Vector target information; null when the pass has no TTI.
+  const VectorTargetTransformInfo *VTTI;
+};
+
 struct LoopVectorize : public LoopPass {
   static char ID; // Pass identification, replacement for typeid
 
@@ -296,6 +343,7 @@ struct LoopVectorize : public LoopPass {
   ScalarEvolution *SE;
   DataLayout *DL;
   LoopInfo *LI;
+  TargetTransformInfo *TTI;
 
   virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
     // We only vectorize innermost loops.
@@ -305,25 +353,42 @@ struct LoopVectorize : public LoopPass {
     SE = &getAnalysis<ScalarEvolution>();
     DL = getAnalysisIfAvailable<DataLayout>();
     LI = &getAnalysis<LoopInfo>();
+    TTI = getAnalysisIfAvailable<TargetTransformInfo>();
 
     DEBUG(dbgs() << "LV: Checking a loop in \"" <<
           L->getHeader()->getParent()->getName() << "\"\n");
 
     // Check if it is legal to vectorize the loop.
     LoopVectorizationLegality LVL(L, SE, DL);
-    unsigned MaxVF = LVL.getLoopMaxVF();
-
-    // Check that we can vectorize this loop using the chosen vectorization
-    // width.
-    if (MaxVF < DefaultVectorizationFactor) {
-      DEBUG(dbgs() << "LV: non-vectorizable MaxVF ("<< MaxVF << ").\n");
+    if (!LVL.canVectorize()) {
+      DEBUG(dbgs() << "LV: Not vectorizing.\n");
       return false;
     }
 
-    DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< MaxVF << ").\n");
+    // Select the preferred vectorization factor.
+    unsigned VF = 1;
+    if (VectorizationFactor == 0) {
+      const VectorTargetTransformInfo *VTTI = 0;
+      if (TTI)
+        VTTI = TTI->getVectorTargetTransformInfo();
+      // Use the cost model.
+      LoopVectorizationCostModel CM(L, SE, DL, &LVL, VTTI);
+      VF = CM.findBestVectorizationFactor();
+
+      if (VF == 1) {
+        DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
+        return false;
+      }
+
+    } else {
+      // Use the user command flag.
+      VF = VectorizationFactor;
+    }
+
+    DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ").\n");
 
     // If we decided that it is *legal* to vectorizer the loop then do it.
-    SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, DefaultVectorizationFactor);
+    SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, VF);
     LB.vectorize(&LVL);
 
     DEBUG(verifyFunction(*L->getHeader()->getParent()));
@@ -656,6 +721,13 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal
 
 void
 SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
+  //===------------------------------------------------===//
+  //
+  // Notice: any optimization or new instruction that goes
+  // into the code below should also be implemented in
+  // the cost-model.
+  //
+  //===------------------------------------------------===//
   typedef SmallVector<PHINode*, 4> PhiVector;
   BasicBlock &BB = *OrigLoop->getHeader();
   Constant *Zero = ConstantInt::get(
@@ -957,18 +1029,18 @@ void SingleBlockLoopVectorizer::cleanup() {
   SE->forgetLoop(OrigLoop);
 }
 
-unsigned LoopVectorizationLegality::getLoopMaxVF() {
+bool LoopVectorizationLegality::canVectorize() {
   if (!TheLoop->getLoopPreheader()) {
     assert(false && "No preheader!!");
     DEBUG(dbgs() << "LV: Loop not normalized." << "\n");
-    return  1;
+    return  false;
   }
 
   // We can only vectorize single basic block loops.
   unsigned NumBlocks = TheLoop->getNumBlocks();
   if (NumBlocks != 1) {
     DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n");
-    return 1;
+    return false;
   }
 
   // We need to have a loop header.
@@ -978,22 +1050,22 @@ unsigned LoopVectorizationLegality::getLoopMaxVF() {
   // Go over each instruction and look at memory deps.
   if (!canVectorizeBlock(*BB)) {
     DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
-    return 1;
+    return false;
   }
 
   // ScalarEvolution needs to be able to find the exit count.
   const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
   if (ExitCount == SE->getCouldNotCompute()) {
     DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
-    return 1;
+    return false;
   }
 
   DEBUG(dbgs() << "LV: We can vectorize this loop!\n");
 
   // Okay! We can vectorize. At this point we don't have any other mem analysis
-  // which may limit our maximum vectorization factor, so just return the
-  // maximum SIMD size.
-  return DefaultVectorizationFactor;
+  // which may limit our maximum vectorization factor, so just return true with
+  // no restrictions.
+  return true;
 }
 
 bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
@@ -1323,6 +1395,177 @@ bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
   return true;
 }
 
+unsigned
+LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) {
+  if (!VTTI) {
+    DEBUG(dbgs() << "LV: No vector target information. Not vectorizing. \n");
+    return 1;
+  }
+
+  float Cost = expectedCost(1);
+  unsigned Width = 1;
+  DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n");
+  for (unsigned i=2; i <= VF; i*=2) {
+    // Notice that the vector loop needs to be executed fewer times, so
+    // we need to divide the cost of the vector loops by the width of
+    // the vector elements.
+    float VectorCost = expectedCost(i) / (float)i;
+    DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " <<
+          (int)VectorCost << ".\n");
+    if (VectorCost < Cost) {
+      Cost = VectorCost;
+      Width = i;
+    }
+  }
+
+  DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n");
+  return Width;
+}
+
+unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
+  // We can only estimate the cost of single basic block loops.
+  assert(1 == TheLoop->getNumBlocks() && "Too many blocks in loop");
+
+  BasicBlock *BB = TheLoop->getHeader();
+  unsigned Cost = 0;
+
+  // For each instruction in the old loop.
+  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+    Instruction *Inst = it;
+    Cost += getInstructionCost(Inst, VF);
+  }
+
+  // Return the cost divided by VF, because we will be executing
+  // less iterations of the vector form.
+  return Cost;
+}
+
+unsigned
+LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
+  assert(VTTI && "Invalid vector target transformation info");
+  switch (I->getOpcode()) {
+    case Instruction::Br: {
+      return VTTI->getInstrCost(I->getOpcode());
+    }
+    case Instruction::PHI:
+      // PHIs are handled the same as the binary instructions below.
+    case Instruction::Add:
+    case Instruction::FAdd:
+    case Instruction::Sub:
+    case Instruction::FSub:
+    case Instruction::Mul:
+    case Instruction::FMul:
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::FDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor: {
+      Type *VTy = VectorType::get(I->getType(), VF);
+          return VTTI->getInstrCost(I->getOpcode(), VTy);
+    }
+    case Instruction::Select: {
+      SelectInst *SI = cast<SelectInst>(I);
+      Type *VTy = VectorType::get(I->getType(), VF);
+      const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
+      bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
+      Type *CondTy = SI->getCondition()->getType();
+        if (ScalarCond)
+          CondTy = VectorType::get(CondTy, VF);
+
+      return VTTI->getInstrCost(I->getOpcode(), VTy, CondTy);
+    }
+    case Instruction::ICmp:
+    case Instruction::FCmp: {
+      Type *VTy = VectorType::get(I->getOperand(0)->getType(), VF);
+      return VTTI->getInstrCost(I->getOpcode(), VTy);
+    }
+    case Instruction::Store: {
+      StoreInst *SI = cast<StoreInst>(I);
+      Type *VTy = VectorType::get(SI->getValueOperand()->getType(), VF);
+
+      // Scalarized stores.
+      if (!Legal->isConsecutiveGep(SI->getPointerOperand())) {
+        unsigned Cost = 0;
+        unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, VTy);
+        // The cost of extracting from the vector value.
+        Cost += VF * ExtCost;
+        // The cost of the scalar stores.
+        Cost += VF * VTTI->getInstrCost(I->getOpcode(), VTy->getScalarType());
+        return Cost;
+      }
+
+      // Wide stores.
+      return VTTI->getMemoryOpCost(I->getOpcode(), VTy, SI->getAlignment(),
+                                   SI->getPointerAddressSpace());
+    }
+    case Instruction::Load: {
+      LoadInst *LI = cast<LoadInst>(I);
+      Type *VTy = VectorType::get(I->getType(), VF);
+
+      // Scalarized loads.
+      if (!Legal->isConsecutiveGep(LI->getPointerOperand())) {
+        unsigned Cost = 0;
+        unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, VTy);
+        // The cost of inserting the loaded value into the result vector.
+        Cost += VF * InCost;
+        // The cost of the scalar stores.
+        Cost += VF * VTTI->getInstrCost(I->getOpcode(), VTy->getScalarType());
+        return Cost;
+      }
+
+      // Wide loads.
+      return VTTI->getMemoryOpCost(I->getOpcode(), VTy, LI->getAlignment(),
+                                   LI->getPointerAddressSpace());
+    }
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::FPExt:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::SIToFP:
+    case Instruction::UIToFP:
+    case Instruction::Trunc:
+    case Instruction::FPTrunc:
+    case Instruction::BitCast: {
+      Type *SrcTy = VectorType::get(I->getOperand(0)->getType(), VF);
+      Type *DstTy = VectorType::get(I->getType(), VF);
+      return VTTI->getInstrCost(I->getOpcode(), DstTy, SrcTy);
+    }
+    default: {
+      // We are scalarizing the instruction. Return the cost of the scalar
+      // instruction, plus the cost of insert and extract into vector
+      // elements, times the vector width.
+      unsigned Cost = 0;
+      Type *Ty = I->getType();
+
+      if (!Ty->isVoidTy()) {
+        Type *VTy = VectorType::get(Ty, VF);
+        unsigned InsCost = VTTI->getInstrCost(Instruction::InsertElement, VTy);
+        unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, VTy);
+        Cost += VF * (InsCost + ExtCost);
+      }
+
+      /// We don't have any information on the scalar instruction, but maybe
+      /// the target has.
+      /// TODO: This may be a target-specific intrinsic.
+      /// Need to add API for that.
+      Cost += VF * VTTI->getInstrCost(I->getOpcode(), Ty);
+
+      return Cost;
+    }
+  }// end of switch.
+}
+
+
 } // namespace
 
 char LoopVectorize::ID = 0;
-- 
cgit v1.1


From 7c6de1d6e53c47f2943874ab9d63dddfd19b503d Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Wed, 24 Oct 2012 20:58:40 +0000
Subject: whitespace

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166622 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6f6685b..35f49e4 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1468,7 +1468,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     case Instruction::Or:
     case Instruction::Xor: {
       Type *VTy = VectorType::get(I->getType(), VF);
-          return VTTI->getInstrCost(I->getOpcode(), VTy);
+      return VTTI->getInstrCost(I->getOpcode(), VTy);
     }
     case Instruction::Select: {
       SelectInst *SI = cast<SelectInst>(I);
@@ -1476,8 +1476,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
       const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
       bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
       Type *CondTy = SI->getCondition()->getType();
-        if (ScalarCond)
-          CondTy = VectorType::get(CondTy, VF);
+      if (ScalarCond)
+        CondTy = VectorType::get(CondTy, VF);
 
       return VTTI->getInstrCost(I->getOpcode(), VTy, CondTy);
     }
-- 
cgit v1.1


From 2652c50f74bc4a874c6a2e4b34ff2d52d479183f Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Wed, 24 Oct 2012 23:47:38 +0000
Subject: Implement a basic cost model for vector and scalar instructions.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166642 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 49 +++++++++++++++++++-----------
 1 file changed, 32 insertions(+), 17 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 35f49e4..483b9fc 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -300,10 +300,10 @@ private:
 class LoopVectorizationCostModel {
 public:
   /// C'tor.
-  LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl,
+  LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se,
                              LoopVectorizationLegality *Leg,
                              const VectorTargetTransformInfo *Vtti):
-  TheLoop(Lp), SE(Se), DL(Dl), Legal(Leg), VTTI(Vtti) { }
+  TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { }
 
   /// Returns the most profitable vectorization factor for the loop that is
   /// smaller or equal to the VF argument. This method checks every power
@@ -325,8 +325,7 @@ private:
   Loop *TheLoop;
   /// Scev analysis.
   ScalarEvolution *SE;
-  /// DataLayout analysis.
-  DataLayout *DL;
+
   /// Vectorization legality.
   LoopVectorizationLegality *Legal;
   /// Vector target information.
@@ -372,7 +371,7 @@ struct LoopVectorize : public LoopPass {
       if (TTI)
         VTTI = TTI->getVectorTargetTransformInfo();
       // Use the cost model.
-      LoopVectorizationCostModel CM(L, SE, DL, &LVL, VTTI);
+      LoopVectorizationCostModel CM(L, SE, &LVL, VTTI);
       VF = CM.findBestVectorizationFactor();
 
       if (VF == 1) {
@@ -1432,11 +1431,12 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
   // For each instruction in the old loop.
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
     Instruction *Inst = it;
-    Cost += getInstructionCost(Inst, VF);
+    unsigned C = getInstructionCost(Inst, VF);
+    Cost += C;
+    DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF "<< VF <<
+          " For instruction: "<< *Inst << "\n");
   }
 
-  // Return the cost divided by VF, because we will be executing
-  // less iterations of the vector form.
   return Cost;
 }
 
@@ -1444,11 +1444,13 @@ unsigned
 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   assert(VTTI && "Invalid vector target transformation info");
   switch (I->getOpcode()) {
+    case Instruction::GetElementPtr:
+      return 0;
     case Instruction::Br: {
       return VTTI->getInstrCost(I->getOpcode());
     }
     case Instruction::PHI:
-      // PHIs are handled the same as the binary instructions below.
+      return 0;
     case Instruction::Add:
     case Instruction::FAdd:
     case Instruction::Sub:
@@ -1493,11 +1495,17 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
       // Scalarized stores.
       if (!Legal->isConsecutiveGep(SI->getPointerOperand())) {
         unsigned Cost = 0;
-        unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, VTy);
-        // The cost of extracting from the vector value.
-        Cost += VF * ExtCost;
+        if (VF != 1) {
+          unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
+                                                VTy);
+          // The cost of extracting from the value vector and pointer vector.
+          Cost += VF * (ExtCost * 2);
+        }
         // The cost of the scalar stores.
-        Cost += VF * VTTI->getInstrCost(I->getOpcode(), VTy->getScalarType());
+        Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
+                                           VTy->getScalarType(),
+                                           SI->getAlignment(),
+                                           SI->getPointerAddressSpace());
         return Cost;
       }
 
@@ -1512,11 +1520,18 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
       // Scalarized loads.
       if (!Legal->isConsecutiveGep(LI->getPointerOperand())) {
         unsigned Cost = 0;
-        unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, VTy);
-        // The cost of inserting the loaded value into the result vector.
-        Cost += VF * InCost;
+        if (VF != 1) {
+          unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, VTy);
+          unsigned ExCost = VTTI->getInstrCost(Instruction::ExtractValue, VTy);
+
+          // The cost of inserting the loaded value into the result vector, and
+          // extracting from a vector of pointers.
+          Cost += VF * (InCost + ExCost);
+        }
         // The cost of the scalar stores.
-        Cost += VF * VTTI->getInstrCost(I->getOpcode(), VTy->getScalarType());
+        Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), VTy->getScalarType(),
+                                           LI->getAlignment(),
+                                           LI->getPointerAddressSpace());
         return Cost;
       }
 
-- 
cgit v1.1


From 8dbac7b529cfb73bcd0ceef514e5c1d247cf3baa Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Thu, 25 Oct 2012 00:08:41 +0000
Subject: Add support for additional reduction variables: AND, OR, XOR.

Patch by Paul Redmond <paul.redmond@intel.com>.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166649 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 49 +++++++++++++++++++++++++-----
 1 file changed, 42 insertions(+), 7 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 483b9fc..423c7a4 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -208,7 +208,10 @@ public:
   enum ReductionKind {
     NoReduction = -1, /// Not a reduction.
     IntegerAdd  = 0,  /// Sum of numbers.
-    IntegerMult = 1  /// Product of numbers.
+    IntegerMult = 1,  /// Product of numbers.
+    IntegerOr   = 2,  /// Bitwise or logical OR of numbers.
+    IntegerAnd  = 3,  /// Bitwise or logical AND of numbers.
+    IntegerXor  = 4   /// Bitwise or logical XOR of numbers.
   };
 
   /// This POD struct holds information about reduction variables.
@@ -981,14 +984,28 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     // Extract the first scalar.
     Value *Scalar0 =
       Builder.CreateExtractElement(NewPhi, Builder.getInt32(0));
-    // Extract and sum the remaining vector elements.
+    // Extract and reduce the remaining vector elements.
     for (unsigned i=1; i < VF; ++i) {
       Value *Scalar1 =
         Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
-      if (RdxDesc.Kind == LoopVectorizationLegality::IntegerAdd) {
-        Scalar0 = Builder.CreateAdd(Scalar0, Scalar1);
-      } else {
-        Scalar0 = Builder.CreateMul(Scalar0, Scalar1);
+      switch (RdxDesc.Kind) {
+        case LoopVectorizationLegality::IntegerAdd:
+          Scalar0 = Builder.CreateAdd(Scalar0, Scalar1);
+          break;
+        case LoopVectorizationLegality::IntegerMult:
+          Scalar0 = Builder.CreateMul(Scalar0, Scalar1);
+          break;
+        case LoopVectorizationLegality::IntegerOr:
+          Scalar0 = Builder.CreateOr(Scalar0, Scalar1);
+          break;
+        case LoopVectorizationLegality::IntegerAnd:
+          Scalar0 = Builder.CreateAnd(Scalar0, Scalar1);
+          break;
+        case LoopVectorizationLegality::IntegerXor:
+          Scalar0 = Builder.CreateXor(Scalar0, Scalar1);
+          break;
+        default:
+          llvm_unreachable("Unknown reduction operation");
       }
     }
 
@@ -1099,7 +1116,19 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
         continue;
       }
       if (AddReductionVar(Phi, IntegerMult)) {
-        DEBUG(dbgs() << "LV: Found an Mult reduction PHI."<< *Phi <<"\n");
+        DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
+        continue;
+      }
+      if (AddReductionVar(Phi, IntegerOr)) {
+        DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
+        continue;
+      }
+      if (AddReductionVar(Phi, IntegerAnd)) {
+        DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
+        continue;
+      }
+      if (AddReductionVar(Phi, IntegerXor)) {
+        DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
         continue;
       }
 
@@ -1373,6 +1402,12 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I,
     case Instruction::UDiv:
     case Instruction::SDiv:
       return Kind == IntegerMult;
+    case Instruction::And:
+      return Kind == IntegerAnd;
+    case Instruction::Or:
+      return Kind == IntegerOr;
+    case Instruction::Xor:
+      return Kind == IntegerXor;
     }
 }
 
-- 
cgit v1.1


From 3ef9dfa6858e25015c3e36b2f1a0ba5ebdea80d2 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Thu, 25 Oct 2012 21:03:48 +0000
Subject: LoopVectorize: Teach the cost model to query scalar costs as scalar
 types and not vectors of 1.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166715 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 102 +++++++++++++++++------------
 1 file changed, 61 insertions(+), 41 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 423c7a4..e47baf8 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -324,6 +324,11 @@ private:
   /// width. Vector width of one means scalar.
   unsigned getInstructionCost(Instruction *I, unsigned VF);
 
+  /// A helper function for converting Scalar types to vector types.
+  /// If the incoming type is void, we return void. If the VF is 1, we return
+  /// the scalar type.
+  static Type* ToVectorTy(Type *Scalar, unsigned VF);
+
   /// The loop that we evaluate.
   Loop *TheLoop;
   /// Scev analysis.
@@ -1478,8 +1483,16 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
 unsigned
 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   assert(VTTI && "Invalid vector target transformation info");
+
+  Type *RetTy = I->getType();
+  Type *VectorTy = ToVectorTy(RetTy, VF);
+
+  // TODO: We need to estimate the cost of intrinsic calls.
   switch (I->getOpcode()) {
     case Instruction::GetElementPtr:
+      // We mark this instruction as zero-cost because scalar GEPs are usually
+      // lowered to the instruction addressing mode. At the moment we don't
+      // generate vector geps.
       return 0;
     case Instruction::Br: {
       return VTTI->getInstrCost(I->getOpcode());
@@ -1504,74 +1517,76 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     case Instruction::And:
     case Instruction::Or:
     case Instruction::Xor: {
-      Type *VTy = VectorType::get(I->getType(), VF);
-      return VTTI->getInstrCost(I->getOpcode(), VTy);
+      return VTTI->getInstrCost(I->getOpcode(), VectorTy);
     }
     case Instruction::Select: {
       SelectInst *SI = cast<SelectInst>(I);
-      Type *VTy = VectorType::get(I->getType(), VF);
       const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
       bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
       Type *CondTy = SI->getCondition()->getType();
       if (ScalarCond)
         CondTy = VectorType::get(CondTy, VF);
 
-      return VTTI->getInstrCost(I->getOpcode(), VTy, CondTy);
+      return VTTI->getInstrCost(I->getOpcode(), VectorTy, CondTy);
     }
     case Instruction::ICmp:
     case Instruction::FCmp: {
-      Type *VTy = VectorType::get(I->getOperand(0)->getType(), VF);
-      return VTTI->getInstrCost(I->getOpcode(), VTy);
+      Type *ValTy = I->getOperand(0)->getType();
+      VectorTy = ToVectorTy(ValTy, VF);
+      return VTTI->getInstrCost(I->getOpcode(), VectorTy);
     }
     case Instruction::Store: {
       StoreInst *SI = cast<StoreInst>(I);
-      Type *VTy = VectorType::get(SI->getValueOperand()->getType(), VF);
+      Type *ValTy = SI->getValueOperand()->getType();
+      VectorTy = ToVectorTy(ValTy, VF);
+
+      if (VF == 1)
+        return VTTI->getMemoryOpCost(I->getOpcode(), ValTy,
+                              SI->getAlignment(), SI->getPointerAddressSpace());
 
       // Scalarized stores.
       if (!Legal->isConsecutiveGep(SI->getPointerOperand())) {
         unsigned Cost = 0;
-        if (VF != 1) {
-          unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
-                                                VTy);
-          // The cost of extracting from the value vector and pointer vector.
-          Cost += VF * (ExtCost * 2);
-        }
+        unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
+                                              ValTy);
+        // The cost of extracting from the value vector.
+        Cost += VF * (ExtCost);
         // The cost of the scalar stores.
         Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
-                                           VTy->getScalarType(),
+                                           ValTy->getScalarType(),
                                            SI->getAlignment(),
                                            SI->getPointerAddressSpace());
         return Cost;
       }
 
       // Wide stores.
-      return VTTI->getMemoryOpCost(I->getOpcode(), VTy, SI->getAlignment(),
+      return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(),
                                    SI->getPointerAddressSpace());
     }
     case Instruction::Load: {
       LoadInst *LI = cast<LoadInst>(I);
-      Type *VTy = VectorType::get(I->getType(), VF);
+
+      if (VF == 1)
+        return VTTI->getMemoryOpCost(I->getOpcode(), RetTy,
+                                     LI->getAlignment(),
+                                     LI->getPointerAddressSpace());
 
       // Scalarized loads.
       if (!Legal->isConsecutiveGep(LI->getPointerOperand())) {
         unsigned Cost = 0;
-        if (VF != 1) {
-          unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, VTy);
-          unsigned ExCost = VTTI->getInstrCost(Instruction::ExtractValue, VTy);
-
-          // The cost of inserting the loaded value into the result vector, and
-          // extracting from a vector of pointers.
-          Cost += VF * (InCost + ExCost);
-        }
+        unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy);
+        // The cost of inserting the loaded value into the result vector.
+        Cost += VF * (InCost);
         // The cost of the scalar stores.
-        Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), VTy->getScalarType(),
+        Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
+                                           RetTy->getScalarType(),
                                            LI->getAlignment(),
                                            LI->getPointerAddressSpace());
         return Cost;
       }
 
       // Wide loads.
-      return VTTI->getMemoryOpCost(I->getOpcode(), VTy, LI->getAlignment(),
+      return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(),
                                    LI->getPointerAddressSpace());
     }
     case Instruction::ZExt:
@@ -1586,35 +1601,40 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     case Instruction::Trunc:
     case Instruction::FPTrunc:
     case Instruction::BitCast: {
-      Type *SrcTy = VectorType::get(I->getOperand(0)->getType(), VF);
-      Type *DstTy = VectorType::get(I->getType(), VF);
-      return VTTI->getInstrCost(I->getOpcode(), DstTy, SrcTy);
+      Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
+      return VTTI->getInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
     }
     default: {
       // We are scalarizing the instruction. Return the cost of the scalar
       // instruction, plus the cost of insert and extract into vector
       // elements, times the vector width.
       unsigned Cost = 0;
-      Type *Ty = I->getType();
 
-      if (!Ty->isVoidTy()) {
-        Type *VTy = VectorType::get(Ty, VF);
-        unsigned InsCost = VTTI->getInstrCost(Instruction::InsertElement, VTy);
-        unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, VTy);
-        Cost += VF * (InsCost + ExtCost);
-      }
+      bool IsVoid = RetTy->isVoidTy();
 
-      /// We don't have any information on the scalar instruction, but maybe
-      /// the target has.
-      /// TODO: This may be a target-specific intrinsic.
-      /// Need to add API for that.
-      Cost += VF * VTTI->getInstrCost(I->getOpcode(), Ty);
+      unsigned InsCost = (IsVoid ? 0 :
+                          VTTI->getInstrCost(Instruction::InsertElement,
+                                             VectorTy));
 
+      unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
+                                            VectorTy);
+
+      // The cost of inserting the results plus extracting each one of the
+      // operands.
+      Cost += VF * (InsCost + ExtCost * I->getNumOperands());
+
+      // The cost of executing VF copies of the scalar instruction.
+      Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy);
       return Cost;
     }
   }// end of switch.
 }
 
+Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
+  if (Scalar->isVoidTy() || VF == 1)
+    return Scalar;
+  return VectorType::get(Scalar, VF);
+}
 
 } // namespace
 
-- 
cgit v1.1


From a5a3a61c5fdcee972791d4e08441ba6edf131b88 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Fri, 26 Oct 2012 23:49:28 +0000
Subject: Refactor the VectorTargetTransformInfo interface.

Add getCostXXX calls for different families of opcodes, such as casts, arithmetic, cmp, etc.

Port the LoopVectorizer to the new API.

The LoopVectorizer now finds instructions which will remain uniform after vectorization. It uses this information when calculating the cost of these instructions.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166836 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 61 +++++++++++++++++++++++++-----
 1 file changed, 52 insertions(+), 9 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index e47baf8..1773812 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -108,7 +108,7 @@ public:
     createEmptyLoop(Legal);
     /// Widen each instruction in the old loop to a new one in the new loop.
     /// Use the Legality module to find the induction and reduction variables.
-   vectorizeLoop(Legal);
+    vectorizeLoop(Legal);
     // register the new loop.
     cleanup();
  }
@@ -254,6 +254,9 @@ public:
   /// This check allows us to vectorize A[idx] into a wide load/store.
   bool isConsecutiveGep(Value *Ptr);
 
+  /// Returns true if this instruction will remain scalar after vectorization.
+  bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);}
+
 private:
   /// Check if a single basic block loop is vectorizable.
   /// At this point we know that this is a loop with a constant trip count
@@ -291,6 +294,9 @@ private:
   /// Allowed outside users. This holds the reduction
   /// vars which can be accessed from outside the loop.
   SmallPtrSet<Value*, 4> AllowedExit;
+  /// This set holds the variables which are known to be uniform after
+  /// vectorization.
+  SmallPtrSet<Instruction*, 4> Uniforms;
 };
 
 /// LoopVectorizationCostModel - estimates the expected speedups due to
@@ -1177,9 +1183,40 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
       return false;
   }
 
-  // If the memory dependencies do not prevent us from
-  // vectorizing, then vectorize.
-  return canVectorizeMemory(BB);
+  // Don't vectorize if the memory dependencies do not allow vectorization.
+  if (!canVectorizeMemory(BB))
+    return false;
+
+  // We now know that the loop is vectorizable!
+  // Collect variables that will remain uniform after vectorization.
+  std::vector<Value*> Worklist;
+
+  // Start with the conditional branch and walk up the block.
+  Worklist.push_back(BB.getTerminator()->getOperand(0));
+
+  while (Worklist.size()) {
+    Instruction *I = dyn_cast<Instruction>(Worklist.back());
+    Worklist.pop_back();
+    // Look at instructions inside this block.
+    if (!I) continue;
+    if (I->getParent() != &BB) continue;
+
+    // Stop when reaching PHI nodes.
+    if (isa<PHINode>(I)) {
+      assert(I == Induction && "Found a uniform PHI that is not the induction");
+      break;
+    }
+
+    // This is a known uniform.
+    Uniforms.insert(I);
+
+    // Insert all operands.
+    for (int i=0, Op = I->getNumOperands(); i < Op; ++i) {
+      Worklist.push_back(I->getOperand(i));
+    }
+  }
+
+  return true;
 }
 
 bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
@@ -1484,9 +1521,15 @@ unsigned
 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   assert(VTTI && "Invalid vector target transformation info");
 
+  // If we know that this instruction will remain uniform, check the cost of
+  // the scalar version.
+  if (Legal->isUniformAfterVectorization(I))
+    VF = 1;
+
   Type *RetTy = I->getType();
   Type *VectorTy = ToVectorTy(RetTy, VF);
 
+
   // TODO: We need to estimate the cost of intrinsic calls.
   switch (I->getOpcode()) {
     case Instruction::GetElementPtr:
@@ -1495,7 +1538,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
       // generate vector geps.
       return 0;
     case Instruction::Br: {
-      return VTTI->getInstrCost(I->getOpcode());
+      return VTTI->getCFInstrCost(I->getOpcode());
     }
     case Instruction::PHI:
       return 0;
@@ -1517,7 +1560,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     case Instruction::And:
     case Instruction::Or:
     case Instruction::Xor: {
-      return VTTI->getInstrCost(I->getOpcode(), VectorTy);
+      return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy);
     }
     case Instruction::Select: {
       SelectInst *SI = cast<SelectInst>(I);
@@ -1527,13 +1570,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
       if (ScalarCond)
         CondTy = VectorType::get(CondTy, VF);
 
-      return VTTI->getInstrCost(I->getOpcode(), VectorTy, CondTy);
+      return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
     }
     case Instruction::ICmp:
     case Instruction::FCmp: {
       Type *ValTy = I->getOperand(0)->getType();
       VectorTy = ToVectorTy(ValTy, VF);
-      return VTTI->getInstrCost(I->getOpcode(), VectorTy);
+      return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy);
     }
     case Instruction::Store: {
       StoreInst *SI = cast<StoreInst>(I);
@@ -1602,7 +1645,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     case Instruction::FPTrunc:
     case Instruction::BitCast: {
       Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
-      return VTTI->getInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
+      return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
     }
     default: {
       // We are scalarizing the instruction. Return the cost of the scalar
-- 
cgit v1.1


From f065a8467785015336432e3e6e584798d8b48d8e Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sat, 27 Oct 2012 04:11:32 +0000
Subject: 1. Fix a bug in getTypeConversion. When a *simple* type is split, we
 need to return the type of the split result. 2. Change the maximum
 vectorization width from 4 to 8. 3. A test for both.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166864 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1773812..be197db 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -317,7 +317,7 @@ public:
   /// Returns the most profitable vectorization factor for the loop that is
   /// smaller or equal to the VF argument. This method checks every power
   /// of two up to VF.
-  unsigned findBestVectorizationFactor(unsigned VF = 4);
+  unsigned findBestVectorizationFactor(unsigned VF = 8);
 
 private:
   /// Returns the expected execution cost. The unit of the cost does
-- 
cgit v1.1


From 369ff7b740c154dc4357c6d063f4957498a1e7b9 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Mon, 29 Oct 2012 21:52:38 +0000
Subject: LoopVectorize: Update and preserve the dominator tree info.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166970 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 46 ++++++++++++++++++++++++------
 1 file changed, 37 insertions(+), 9 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index be197db..dcdf0a3 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -55,6 +55,7 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AliasSetTracker.h"
 #include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -98,8 +99,9 @@ class SingleBlockLoopVectorizer {
 public:
   /// Ctor.
   SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li,
-                            LPPassManager *Lpm, unsigned VecWidth):
-  OrigLoop(Orig), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth),
+                            DominatorTree *dt, LPPassManager *Lpm,
+                            unsigned VecWidth):
+  OrigLoop(Orig), SE(Se), LI(Li), DT(dt), LPM(Lpm), VF(VecWidth),
   Builder(Se->getContext()), Induction(0), OldInduction(0) { }
 
   // Perform the actual loop widening (vectorization).
@@ -110,7 +112,7 @@ public:
     /// Use the Legality module to find the induction and reduction variables.
     vectorizeLoop(Legal);
     // register the new loop.
-    cleanup();
+    updateAnalysis();
  }
 
 private:
@@ -119,7 +121,7 @@ private:
   /// Copy and widen the instructions from the old loop.
   void vectorizeLoop(LoopVectorizationLegality *Legal);
   /// Insert the new loop to the loop hierarchy and pass manager.
-  void cleanup();
+  void updateAnalysis();
 
   /// This instruction is un-vectorizable. Implement it as a sequence
   /// of scalars.
@@ -155,6 +157,8 @@ private:
   ScalarEvolution *SE;
   // Loop Info.
   LoopInfo *LI;
+  // Dominator Tree.
+  DominatorTree *DT;
   // Loop Pass Manager;
   LPPassManager *LPM;
   // The vectorization factor to use.
@@ -165,6 +169,10 @@ private:
 
   // --- Vectorization state ---
 
+  /// The vector-loop preheader.
+  BasicBlock *LoopVectorPreHeader;
+  /// The scalar-loop preheader.
+  BasicBlock *LoopScalarPreHeader;
   /// Middle Block between the vector and the scalar.
   BasicBlock *LoopMiddleBlock;
   ///The ExitBlock of the scalar loop.
@@ -357,6 +365,7 @@ struct LoopVectorize : public LoopPass {
   DataLayout *DL;
   LoopInfo *LI;
   TargetTransformInfo *TTI;
+  DominatorTree *DT;
 
   virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
     // We only vectorize innermost loops.
@@ -367,6 +376,7 @@ struct LoopVectorize : public LoopPass {
     DL = getAnalysisIfAvailable<DataLayout>();
     LI = &getAnalysis<LoopInfo>();
     TTI = getAnalysisIfAvailable<TargetTransformInfo>();
+    DT = &getAnalysis<DominatorTree>();
 
     DEBUG(dbgs() << "LV: Checking a loop in \"" <<
           L->getHeader()->getParent()->getName() << "\"\n");
@@ -401,7 +411,7 @@ struct LoopVectorize : public LoopPass {
     DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ").\n");
 
     // If we decided that it is *legal* to vectorizer the loop then do it.
-    SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, VF);
+    SingleBlockLoopVectorizer LB(L, SE, LI, DT, &LPM, VF);
     LB.vectorize(&LVL);
 
     DEBUG(verifyFunction(*L->getHeader()->getParent()));
@@ -414,6 +424,9 @@ struct LoopVectorize : public LoopPass {
     AU.addRequiredID(LCSSAID);
     AU.addRequired<LoopInfo>();
     AU.addRequired<ScalarEvolution>();
+    AU.addRequired<DominatorTree>();
+    AU.addPreserved<LoopInfo>();
+    AU.addPreserved<DominatorTree>();
   }
 
 };
@@ -725,6 +738,8 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal
   }
 
   // Save the state.
+  LoopVectorPreHeader = VectorPH;
+  LoopScalarPreHeader = ScalarPH;
   LoopMiddleBlock = MiddleBlock;
   LoopExitBlock = ExitBlock;
   LoopVectorBody = VecBody;
@@ -855,8 +870,8 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         // The last index does not have to be the induction. It can be
         // consecutive and be a function of the index. For example A[I+1];
         unsigned NumOperands = Gep->getNumOperands();
-        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
-        LastIndex = Builder.CreateExtractElement(LastIndex, Builder.getInt32(0));
+        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1));
+        LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
 
         // Create the new GEP with the new induction variable.
         GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
@@ -885,7 +900,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         // consecutive and be a function of the index. For example A[I+1];
         unsigned NumOperands = Gep->getNumOperands();
         Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
-        LastIndex = Builder.CreateExtractElement(LastIndex, Builder.getInt32(0));
+        LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
 
         // Create the new GEP with the new induction variable.
         GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
@@ -1051,9 +1066,22 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
   }// end of for each redux variable.
 }
 
-void SingleBlockLoopVectorizer::cleanup() {
+void SingleBlockLoopVectorizer::updateAnalysis() {
   // The original basic block.
   SE->forgetLoop(OrigLoop);
+
+  // Update the dominator tree information.
+  assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) &&
+         "Entry does not dominate exit.");
+
+  DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock);
+  DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
+  DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock);
+  DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
+  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
+  DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
+
+  DEBUG(DT->verifyAnalysis());
 }
 
 bool LoopVectorizationLegality::canVectorize() {
-- 
cgit v1.1


From 803ea1c40aa059adf5b1baef0bf4510483565d07 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Tue, 30 Oct 2012 00:40:39 +0000
Subject: LoopVectorizer: change debug prints: Print the module identifier when
 deciding to vectorize. When deciding not to vectorize do not print the called
 function name because it can be null.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166989 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index dcdf0a3..0e6103e 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -408,7 +408,9 @@ struct LoopVectorize : public LoopPass {
       VF = VectorizationFactor;
     }
 
-    DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ").\n");
+    DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<<
+          L->getHeader()->getParent()->getParent()->getModuleIdentifier()<<
+          "\n");
 
     // If we decided that it is *legal* to vectorizer the loop then do it.
     SingleBlockLoopVectorizer LB(L, SE, LI, DT, &LPM, VF);
@@ -597,7 +599,8 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
     WidenMap[Instr] = VecResults;
 }
 
-void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
+void
+SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   /*
    In this function we generate a new loop. The new loop will contain
    the vectorized instructions while the old loop will continue to run the
@@ -1178,8 +1181,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
     // We still don't handle functions.
     CallInst *CI = dyn_cast<CallInst>(I);
     if (CI) {
-      DEBUG(dbgs() << "LV: Found a call site:"<<
-            CI->getCalledFunction()->getName() << "\n");
+      DEBUG(dbgs() << "LV: Found a call site.\n");
       return false;
     }
 
-- 
cgit v1.1


From cc03331caaeee00b1238654feda8c5a517e48c3a Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Tue, 30 Oct 2012 18:12:36 +0000
Subject: LoopVectorize: Fix a bug in the initialization of reduction
 variables. AND needs to start at all-one while XOR, and OR need to start at
 zero.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167032 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0e6103e..ac82a66 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -211,8 +211,6 @@ public:
   TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { }
 
   /// This represents the kinds of reductions that we support.
-  /// We use the enum values to hold the 'identity' value for
-  /// each operand. This value does not change the result if applied.
   enum ReductionKind {
     NoReduction = -1, /// Not a reduction.
     IntegerAdd  = 0,  /// Sum of numbers.
@@ -523,7 +521,7 @@ SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
   SmallVector<Constant*, 8> Indices;
   // Create a vector of consecutive numbers from zero to VF.
   for (unsigned i = 0; i < VF; ++i)
-    Indices.push_back(ConstantInt::get(ScalarTy, Val));
+    Indices.push_back(ConstantInt::get(ScalarTy, Val, true));
 
   // Add the consecutive indices to the vector value.
   return ConstantVector::get(Indices);
@@ -750,6 +748,23 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   LoopBypassBlock = BypassBlock;
 }
 
+
+static unsigned
+getReductionIdentity(LoopVectorizationLegality::ReductionKind K) {
+  switch (K) {
+  case LoopVectorizationLegality::IntegerXor:
+  case LoopVectorizationLegality::IntegerAdd:
+  case LoopVectorizationLegality::IntegerOr:
+    return 0;
+  case LoopVectorizationLegality::IntegerMult:
+    return 1;
+  case LoopVectorizationLegality::IntegerAnd:
+    return -1;
+  default:
+    llvm_unreachable("Unknown reduction kind");
+  }
+}
+
 void
 SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
   //===------------------------------------------------===//
@@ -974,10 +989,9 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
     Type *VecTy = VectorExit->getType();
 
-    // Find the reduction identity variable. The value of the enum is the
-    // identity. Zero for addition. One for Multiplication.
-    unsigned IdentitySclr =  RdxDesc.Kind;
-    Constant *Identity = getUniformVector(IdentitySclr,
+    // Find the reduction identity variable. Zero for addition, or, xor,
+    // one for multiplication, -1 for And.
+    Constant *Identity = getUniformVector(getReductionIdentity(RdxDesc.Kind),
                                           VecTy->getScalarType());
 
     // This vector is the Identity vector where the first element is the
-- 
cgit v1.1


From e709f5b600fd630c4f58b5dba14c8069a03093ea Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Tue, 30 Oct 2012 18:36:45 +0000
Subject: LoopVectorize: Add support for write-only loops when the write
 destination is a single pointer. Speedup SciMark by 1%

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167035 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index ac82a66..9e05cac 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1348,6 +1348,13 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
       Reads.push_back(Ptr);
   }
 
+  // If we write (or read-write) to a single destination and there are no
+  // other reads in this loop then it is safe to vectorize.
+  if (ReadWrites.size() == 1 && Reads.size() == 0) {
+    DEBUG(dbgs() << "LV: Found a write-only loop!\n");
+    return true;
+  }
+
   // Now that the pointers are in two lists (Reads and ReadWrites), we
   // can check that there are no conflicts between each of the writes and
   // between the writes to the reads.
-- 
cgit v1.1


From a368b89f2842530d07c0ac8e3b533882e165f197 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Tue, 30 Oct 2012 22:06:26 +0000
Subject: Add documentation.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167055 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 9e05cac..431a847 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -749,16 +749,21 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
 }
 
 
+/// This function returns the identity element (or neutral element) for
+/// the operation K.
 static unsigned
 getReductionIdentity(LoopVectorizationLegality::ReductionKind K) {
   switch (K) {
   case LoopVectorizationLegality::IntegerXor:
   case LoopVectorizationLegality::IntegerAdd:
   case LoopVectorizationLegality::IntegerOr:
+    // Adding, Xoring, Oring zero to a number does not change it.
     return 0;
   case LoopVectorizationLegality::IntegerMult:
+    // Multiplying a number by 1 does not change it.
     return 1;
   case LoopVectorizationLegality::IntegerAnd:
+    // AND-ing a number with an all-1 value does not change it.
     return -1;
   default:
     llvm_unreachable("Unknown reduction kind");
-- 
cgit v1.1


From 462d1ca42831df53a3c6435d247776f0b85bd594 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Wed, 31 Oct 2012 00:45:26 +0000
Subject: Add support for loops that don't start with Zero. This is important
 for loops in the LAPACK test-suite. These loops start at 1 because they are
 auto-converted from fortran.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167084 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 31 ++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 431a847..e82dfa2 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -633,6 +633,10 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
   assert(ExitBlock && "Must have an exit block");
 
+  // The loop index does not have to start at Zero. It starts with this value.
+  OldInduction = Legal->getInduction();
+  Value *StartIdx = OldInduction->getIncomingValueForBlock(BypassBlock);
+
   assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop");
   assert(BypassBlock && "Invalid loop structure");
 
@@ -648,7 +652,6 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
                                  "scalar.preheader");
   // Find the induction variable.
   BasicBlock *OldBasicBlock = OrigLoop->getHeader();
-  OldInduction = Legal->getInduction();
   assert(OldInduction && "We must have a single phi node.");
   Type *IdxTy = OldInduction->getType();
 
@@ -658,7 +661,6 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
 
   // Generate the induction variable.
   Induction = Builder.CreatePHI(IdxTy, 2, "index");
-  Constant *Zero = ConstantInt::get(IdxTy, 0);
   Constant *Step = ConstantInt::get(IdxTy, VF);
 
   // Find the loop boundaries.
@@ -682,15 +684,22 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
 
   // Count holds the overall loop count (N).
   Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc);
+
+  // Add the start index to the loop count to get the new end index.
+  Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc);
+
   // Now we need to generate the expression for N - (N % VF), which is
   // the part that the vectorized body will execute.
   Constant *CIVF = ConstantInt::get(IdxTy, VF);
   Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc);
   Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc);
+  Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx,
+                                                     "end.idx.rnd.down", Loc);
 
   // Now, compare the new count to zero. If it is zero, jump to the scalar part.
   Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
-                               CountRoundDown, ConstantInt::getNullValue(IdxTy),
+                               IdxEndRoundDown,
+                               StartIdx,
                                "cmp.zero", Loc);
   BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc);
   // Remove the old terminator.
@@ -699,8 +708,8 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   // Add a check in the middle block to see if we have completed
   // all of the iterations in the first vector loop.
   // If (N - N%VF) == N, then we *don't* need to run the remainder.
-  Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
-                                CountRoundDown, "cmp.n",
+  Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
+                                IdxEndRoundDown, "cmp.n",
                                 MiddleBlock->getTerminator());
 
   BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
@@ -709,10 +718,10 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
 
   // Create i+1 and fill the PHINode.
   Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
-  Induction->addIncoming(Zero, VectorPH);
+  Induction->addIncoming(StartIdx, VectorPH);
   Induction->addIncoming(NextIdx, VecBody);
   // Create the compare.
-  Value *ICmp = Builder.CreateICmpEQ(NextIdx, CountRoundDown);
+  Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);
   Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);
 
   // Now we have two terminators. Remove the old one from the block.
@@ -720,7 +729,7 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
 
   // Fix the scalar body iteration count.
   unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH);
-  OldInduction->setIncomingValue(BlockIdx, CountRoundDown);
+  OldInduction->setIncomingValue(BlockIdx, IdxEndRoundDown);
 
   // Get ready to start creating new instructions into the vectorized body.
   Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
@@ -748,7 +757,6 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   LoopBypassBlock = BypassBlock;
 }
 
-
 /// This function returns the identity element (or neutral element) for
 /// the operation K.
 static unsigned
@@ -1518,10 +1526,9 @@ bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
     return false;
   }
   const SCEV *Step = AR->getStepRecurrence(*SE);
-  const SCEV *Start = AR->getStart();
 
-  if (!Step->isOne() || !Start->isZero()) {
-    DEBUG(dbgs() << "LV: PHI does not start at zero or steps by one.\n");
+  if (!Step->isOne()) {
+    DEBUG(dbgs() << "LV: PHI stride does not equal one.\n");
     return false;
   }
   return true;
-- 
cgit v1.1


From 5b2c4dc5f8f70352f029d595797089821eb39b3c Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Wed, 31 Oct 2012 03:31:07 +0000
Subject: LoopVectorize: Do not vectorize loops with tiny constant trip counts.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167101 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index e82dfa2..fd728bd 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1145,6 +1145,14 @@ bool LoopVectorizationLegality::canVectorize() {
     return false;
   }
 
+  // Do not loop-vectorize loops with a tiny trip count.
+  unsigned TC = SE->getSmallConstantTripCount(TheLoop, BB);
+  if (TC > 0 && TC < 16) {
+    DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
+          "This loop is not worth vectorizing.\n");
+    return false;
+  }
+
   DEBUG(dbgs() << "LV: We can vectorize this loop!\n");
 
   // Okay! We can vectorize. At this point we don't have any other mem analysis
-- 
cgit v1.1


From 2a5401cc75acf38092a7224cadaf895281d41b29 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Wed, 31 Oct 2012 16:14:06 +0000
Subject: Remove enum values since they are not used anymore.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167131 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index fd728bd..40235ef 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -212,12 +212,12 @@ public:
 
   /// This represents the kinds of reductions that we support.
   enum ReductionKind {
-    NoReduction = -1, /// Not a reduction.
-    IntegerAdd  = 0,  /// Sum of numbers.
-    IntegerMult = 1,  /// Product of numbers.
-    IntegerOr   = 2,  /// Bitwise or logical OR of numbers.
-    IntegerAnd  = 3,  /// Bitwise or logical AND of numbers.
-    IntegerXor  = 4   /// Bitwise or logical XOR of numbers.
+    NoReduction, /// Not a reduction.
+    IntegerAdd,  /// Sum of numbers.
+    IntegerMult, /// Product of numbers.
+    IntegerOr,   /// Bitwise or logical OR of numbers.
+    IntegerAnd,  /// Bitwise or logical AND of numbers.
+    IntegerXor   /// Bitwise or logical XOR of numbers.
   };
 
   /// This POD struct holds information about reduction variables.
-- 
cgit v1.1


From 4c1b4b1fe794437cbb245b11650d9e4001c9605e Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Wed, 31 Oct 2012 16:22:16 +0000
Subject: Put the threshold magic number in a variable.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167134 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 40235ef..94e56a1 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -75,6 +75,9 @@ static cl::opt<unsigned>
 VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
           cl::desc("Set the default vectorization width. Zero is autoselect."));
 
+/// We don't vectorize loops with a known constant trip count below this number.
+const int TinyTripCountThreshold = 16;
+
 namespace {
 
 // Forward declarations.
@@ -1147,7 +1150,7 @@ bool LoopVectorizationLegality::canVectorize() {
 
   // Do not loop-vectorize loops with a tiny trip count.
   unsigned TC = SE->getSmallConstantTripCount(TheLoop, BB);
-  if (TC > 0 && TC < 16) {
+  if (TC > 0 && TC < TinyTripCountThreshold) {
     DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
           "This loop is not worth vectorizing.\n");
     return false;
-- 
cgit v1.1


From e57b2cbce652d45ed5516e52ad82991bfa03cfd7 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Wed, 31 Oct 2012 21:40:39 +0000
Subject: LoopVectorize: Preserve NSW, NUW and IsExact flags.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167174 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 94e56a1..c9871e2 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -849,8 +849,19 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
         Value *A = getVectorValue(Inst->getOperand(0));
         Value *B = getVectorValue(Inst->getOperand(1));
+
         // Use this vector value for all users of the original instruction.
-        WidenMap[Inst] = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
+        Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
+        WidenMap[Inst] = V;
+
+        // Update the NSW, NUW and Exact flags.
+        BinaryOperator *VecOp = cast<BinaryOperator>(V);
+        if (isa<OverflowingBinaryOperator>(BinOp)) {
+          VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
+          VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
+        }
+        if (isa<PossiblyExactOperator>(VecOp))
+          VecOp->setIsExact(BinOp->isExact());
         break;
       }
       case Instruction::Select: {
-- 
cgit v1.1


From 6e48f0307758096d06d0e87875294c76df81dec1 Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Fri, 2 Nov 2012 05:24:00 +0000
Subject: Fix sign compare warning. Patch by Mahesha HS.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167282 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index c9871e2..8928087 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -76,7 +76,7 @@ VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
           cl::desc("Set the default vectorization width. Zero is autoselect."));
 
 /// We don't vectorize loops with a known constant trip count below this number.
-const int TinyTripCountThreshold = 16;
+const unsigned TinyTripCountThreshold = 16;
 
 namespace {
 
@@ -1161,7 +1161,7 @@ bool LoopVectorizationLegality::canVectorize() {
 
   // Do not loop-vectorize loops with a tiny trip count.
   unsigned TC = SE->getSmallConstantTripCount(TheLoop, BB);
-  if (TC > 0 && TC < TinyTripCountThreshold) {
+  if (TC > 0u && TC < TinyTripCountThreshold) {
     DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
           "This loop is not worth vectorizing.\n");
     return false;
-- 
cgit v1.1


From b1bf1eeede72b8c93505dd80fdf21aed0e205c7d Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Fri, 9 Nov 2012 07:09:44 +0000
Subject: Add support for memory runtime check. When we can, we calculate array
 bounds. If the arrays are found to be disjoint then we run the vectorized
 version of the loop. If they are not, we run the scalar code.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167608 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 224 ++++++++++++++++++++++++-----
 1 file changed, 191 insertions(+), 33 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8928087..b657993 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -78,6 +78,10 @@ VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
 /// We don't vectorize loops with a known constant trip count below this number.
 const unsigned TinyTripCountThreshold = 16;
 
+/// When performing a runtime memory check, do not check more than this
+/// number of pointers. Notice that the check is quadratic!
+const unsigned RuntimeMemoryCheckThreshold = 2;
+
 namespace {
 
 // Forward declarations.
@@ -242,6 +246,15 @@ public:
     ReductionKind Kind;
   };
 
+  // This POD struct holds information about the memory runtime legality
+  // check that a group of pointers do not overlap.
+  struct RuntimePointerCheck {
+    /// This flag indicates if we need to add the runtime check.
+    bool Need;
+    /// Holds the pointers that we need to check.
+    SmallVector<Value*, 2> Pointers;
+  };
+
   /// ReductionList contains the reduction descriptors for all
   /// of the reductions that were found in the loop.
   typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
@@ -263,9 +276,14 @@ public:
   /// This check allows us to vectorize A[idx] into a wide load/store.
   bool isConsecutiveGep(Value *Ptr);
 
+  /// Returns true if the value V is uniform within the loop.
+  bool isUniform(Value *V);
+
   /// Returns true if this instruction will remain scalar after vectorization.
   bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);}
 
+  /// Returns the information that we collected about runtime memory check.
+  RuntimePointerCheck *getRuntimePointerCheck() {return &PtrRtCheck; }
 private:
   /// Check if a single basic block loop is vectorizable.
   /// At this point we know that this is a loop with a constant trip count
@@ -286,6 +304,8 @@ private:
   bool isReductionInstr(Instruction *I, ReductionKind Kind);
   /// Returns True, if 'Phi' is an induction variable.
   bool isInductionVariable(PHINode *Phi);
+  /// Return true if we
+  bool hasComputableBounds(Value *Ptr);
 
   /// The loop that we evaluate.
   Loop *TheLoop;
@@ -306,6 +326,9 @@ private:
   /// This set holds the variables which are known to be uniform after
   /// vectorization.
   SmallPtrSet<Instruction*, 4> Uniforms;
+  /// We need to check that all of the pointers in this list are disjoint
+  /// at runtime.
+  RuntimePointerCheck PtrRtCheck;
 };
 
 /// LoopVectorizationCostModel - estimates the expected speedups due to
@@ -506,6 +529,10 @@ bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) {
   return false;
 }
 
+bool LoopVectorizationLegality::isUniform(Value *V) {
+  return SE->isLoopInvariant(SE->getSCEV(V), TheLoop);
+}
+
 Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
   // If we saved a vectorized copy of V, use it.
@@ -631,13 +658,29 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
    ...
    */
 
+  OldInduction = Legal->getInduction();
+  assert(OldInduction && "We must have a single phi node.");
+  Type *IdxTy = OldInduction->getType();
+
+  // Find the loop boundaries.
+  const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
+  assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
+
+  // Get the total trip count from the count by adding 1.
+  ExitCount = SE->getAddExpr(ExitCount,
+                             SE->getConstant(ExitCount->getType(), 1));
+  // We may need to extend the index in case there is a type mismatch.
+  // We know that the count starts at zero and does not overflow.
+  // We are using Zext because it should be less expensive.
+  if (ExitCount->getType() != IdxTy)
+    ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy);
+
   // This is the original scalar-loop preheader.
   BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
   assert(ExitBlock && "Must have an exit block");
 
   // The loop index does not have to start at Zero. It starts with this value.
-  OldInduction = Legal->getInduction();
   Value *StartIdx = OldInduction->getIncomingValueForBlock(BypassBlock);
 
   assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop");
@@ -655,8 +698,6 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
                                  "scalar.preheader");
   // Find the induction variable.
   BasicBlock *OldBasicBlock = OrigLoop->getHeader();
-  assert(OldInduction && "We must have a single phi node.");
-  Type *IdxTy = OldInduction->getType();
 
   // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
   // inside the loop.
@@ -666,25 +707,11 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   Induction = Builder.CreatePHI(IdxTy, 2, "index");
   Constant *Step = ConstantInt::get(IdxTy, VF);
 
-  // Find the loop boundaries.
-  const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
-  assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
-
-  // Get the total trip count from the count by adding 1.
-  ExitCount = SE->getAddExpr(ExitCount,
-                             SE->getConstant(ExitCount->getType(), 1));
-
   // Expand the trip count and place the new instructions in the preheader.
   // Notice that the pre-header does not change, only the loop body.
   SCEVExpander Exp(*SE, "induction");
   Instruction *Loc = BypassBlock->getTerminator();
 
-  // We may need to extend the index in case there is a type mismatch.
-  // We know that the count starts at zero and does not overflow.
-  // We are using Zext because it should be less expensive.
-  if (ExitCount->getType() != Induction->getType())
-    ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy);
-
   // Count holds the overall loop count (N).
   Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc);
 
@@ -704,15 +731,85 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
                                IdxEndRoundDown,
                                StartIdx,
                                "cmp.zero", Loc);
+
+  LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
+    Legal->getRuntimePointerCheck();
+  Value *MemoryRuntimeCheck = 0;
+  if (PtrRtCheck->Need) {
+    unsigned NumPointers = PtrRtCheck->Pointers.size();
+    SmallVector<Value* , 2> Starts;
+    SmallVector<Value* , 2> Ends;
+
+    // Use this type for pointer arithmetic.
+    Type* PtrArithTy = PtrRtCheck->Pointers[0]->getType();
+
+    // Compute, for each pointer, the first and last address that it touches.
+    for (unsigned i = 0; i < NumPointers; ++i) {
+      Value *Ptr = PtrRtCheck->Pointers[i];
+      const SCEV *Sc = SE->getSCEV(Ptr);
+      if (SE->isLoopInvariant(Sc, OrigLoop)) {
+        DEBUG(dbgs() << "LV1: Adding RT check for a loop invariant ptr:" <<
+              *Ptr <<"\n");
+        // An invariant pointer covers the degenerate range [Ptr, Ptr].
+        Starts.push_back(Ptr);
+        Ends.push_back(Ptr);
+        continue;
+      }
+      DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");
+      // Legality only records add-recurrences; cast<> asserts on anything else.
+      const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(Sc);
+      const SCEV *Ex = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
+      const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
+      assert(!isa<SCEVCouldNotCompute>(ScEnd) && "Invalid scev range.");
+      Starts.push_back(Exp.expandCodeFor(AR->getStart(), PtrArithTy, Loc));
+      Ends.push_back(Exp.expandCodeFor(ScEnd, PtrArithTy, Loc));
+    }
+
+    // Emit one overlap test per unordered pair of ranges and OR them together.
+    // Ranges i and j conflict when Start[i] <= End[j] and Start[j] <= End[i].
+    for (unsigned i = 0; i < NumPointers; ++i) {
+      for (unsigned j = i + 1; j < NumPointers; ++j) {
+        Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
+                                      Starts[i], Ends[j], "bound0", Loc);
+        Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
+                                      Starts[j], Ends[i], "bound1", Loc);
+        Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1,
+                                                    "found.conflict", Loc);
+        if (MemoryRuntimeCheck)
+          MemoryRuntimeCheck =
+            BinaryOperator::Create(Instruction::Or, MemoryRuntimeCheck,
+                                   IsConflict, "conflict.rdx", Loc);
+        else
+          MemoryRuntimeCheck = IsConflict;
+      }
+    }
+  }// end of need-runtime-check code.
+
+  // If we are using memory runtime checks, include them in.
+  if (MemoryRuntimeCheck) {
+    Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck,
+                                 "CntOrMem", Loc);
+  }
+
   BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc);
   // Remove the old terminator.
   Loc->eraseFromParent();
 
+  // We are going to resume the execution of the scalar loop.
+  // This PHI decides on what number to start. If we come from the
+  // vector loop then we need to start with the end index minus the
+  // index modulo VF. If we come from a bypass edge then we need to start
+  // from the real start.
+  PHINode* ResumeIndex = PHINode::Create(IdxTy, 2, "resume.idx",
+                                         MiddleBlock->getTerminator());
+  ResumeIndex->addIncoming(StartIdx, BypassBlock);
+  ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
+
   // Add a check in the middle block to see if we have completed
   // all of the iterations in the first vector loop.
   // If (N - N%VF) == N, then we *don't* need to run the remainder.
   Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
-                                IdxEndRoundDown, "cmp.n",
+                                ResumeIndex, "cmp.n",
                                 MiddleBlock->getTerminator());
 
   BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
@@ -732,7 +829,7 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
 
   // Fix the scalar body iteration count.
   unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH);
-  OldInduction->setIncomingValue(BlockIdx, IdxEndRoundDown);
+  OldInduction->setIncomingValue(BlockIdx, ResumeIndex);
 
   // Get ready to start creating new instructions into the vectorized body.
   Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
@@ -905,7 +1002,12 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF);
         Value *Ptr = SI->getPointerOperand();
         unsigned Alignment = SI->getAlignment();
+
+        assert(!Legal->isUniform(Ptr) &&
+               "We do not allow storing to uniform addresses");
+
         GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+
         // This store does not use GEPs.
         if (!Legal->isConsecutiveGep(Gep)) {
           scalarizeInstruction(Inst);
@@ -935,8 +1037,9 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         unsigned Alignment = LI->getAlignment();
         GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
 
-        // We don't have a gep. Scalarize the load.
-        if (!Legal->isConsecutiveGep(Gep)) {
+        // If we don't have a gep, or that the pointer is loop invariant,
+        // scalarize the load.
+        if (!Gep || Legal->isUniform(Gep) || !Legal->isConsecutiveGep(Gep)) {
           scalarizeInstruction(Inst);
           break;
         }
@@ -1146,12 +1249,6 @@ bool LoopVectorizationLegality::canVectorize() {
   BasicBlock *BB = TheLoop->getHeader();
   DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n");
 
-  // Go over each instruction and look at memory deps.
-  if (!canVectorizeBlock(*BB)) {
-    DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
-    return false;
-  }
-
   // ScalarEvolution needs to be able to find the exit count.
   const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
   if (ExitCount == SE->getCouldNotCompute()) {
@@ -1167,7 +1264,15 @@ bool LoopVectorizationLegality::canVectorize() {
     return false;
   }
 
-  DEBUG(dbgs() << "LV: We can vectorize this loop!\n");
+  // Go over each instruction and look at memory deps.
+  if (!canVectorizeBlock(*BB)) {
+    DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
+    return false;
+  }
+
+  DEBUG(dbgs() << "LV: We can vectorize this loop" <<
+        (PtrRtCheck.Need ? " (with a runtime bound check)" : "")
+        <<"!\n");
 
   // Okay! We can vectorize. At this point we don't have any other mem analysis
   // which may limit our maximum vectorization factor, so just return true with
@@ -1304,6 +1409,8 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
   // Holds the Load and Store *instructions*.
   ValueVector Loads;
   ValueVector Stores;
+  PtrRtCheck.Pointers.clear();
+  PtrRtCheck.Need = false;
 
   // Scan the BB and collect legal loads and stores.
   for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
@@ -1361,6 +1468,12 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
     StoreInst *ST = dyn_cast<StoreInst>(*I);
     assert(ST && "Bad StoreInst");
     Value* Ptr = ST->getPointerOperand();
+
+    if (isUniform(Ptr)) {
+      DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
+      return false;
+    }
+
     // If we did *not* see this pointer before, insert it to
     // the read-write list. At this phase it is only a 'write' list.
     if (Seen.insert(Ptr))
@@ -1390,6 +1503,39 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
     return true;
   }
 
+  // Find pointers with computable bounds. We are going to use this information
+  // to place a runtime bound check.
+  bool RT = true;
+  for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I)
+    if (hasComputableBounds(*I)) {
+      PtrRtCheck.Pointers.push_back(*I);
+      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
+    } else {
+      RT = false;
+      break;
+    }
+  for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I)
+    if (hasComputableBounds(*I)) {
+      PtrRtCheck.Pointers.push_back(*I);
+      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
+    } else {
+      RT = false;
+      break;
+    }
+
+  // Check that we did not collect too many pointers or find an
+  // unsizeable pointer.
+  if (!RT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) {
+    PtrRtCheck.Pointers.clear();
+    RT = false;
+  }
+
+  PtrRtCheck.Need = RT;
+
+  if (RT) {
+    DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
+  }
+
   // Now that the pointers are in two lists (Reads and ReadWrites), we
   // can check that there are no conflicts between each of the writes and
   // between the writes to the reads.
@@ -1404,12 +1550,12 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
          it != e; ++it) {
       if (!isIdentifiedObject(*it)) {
         DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
-        return false;
+        return RT;
       }
       if (!WriteObjects.insert(*it)) {
         DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
               << **it <<"\n");
-        return false;
+        return RT;
       }
     }
     TempObjects.clear();
@@ -1422,18 +1568,21 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
          it != e; ++it) {
       if (!isIdentifiedObject(*it)) {
         DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
-        return false;
+        return RT;
       }
       if (WriteObjects.count(*it)) {
         DEBUG(dbgs() << "LV: Found a possible read/write reorder:"
               << **it <<"\n");
-        return false;
+        return RT;
       }
     }
     TempObjects.clear();
   }
 
-  // All is okay.
+  // It is safe to vectorize and we don't need any runtime checks.
+  DEBUG(dbgs() << "LV: We don't need a runtime memory check.\n");
+  PtrRtCheck.Pointers.clear();
+  PtrRtCheck.Need = false;
   return true;
 }
 
@@ -1556,6 +1705,15 @@ bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
   return true;
 }
 
+bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
+  // Bounds are computable only when SCEV models the pointer as an
+  // affine add-recurrence.
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr)))
+    return AR->isAffine();
+
+  return false;
+}
+
 unsigned
 LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) {
   if (!VTTI) {
-- 
cgit v1.1


From 9005b8d27d68caeb893d74aa75cd679972d86cef Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sun, 11 Nov 2012 05:15:00 +0000
Subject: Fix a comment typo and add comments.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167684 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index b657993..a7ef248 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -79,7 +79,7 @@ VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
 const unsigned TinyTripCountThreshold = 16;
 
 /// When performing a runtime memory check, do not check more than this
-/// numner of pointers. Notice that the check is quadratic!
+/// number of pointers. Notice that the check is quadratic!
 const unsigned RuntimeMemoryCheckThreshold = 2;
 
 namespace {
@@ -118,7 +118,7 @@ public:
     /// Widen each instruction in the old loop to a new one in the new loop.
     /// Use the Legality module to find the induction and reduction variables.
     vectorizeLoop(Legal);
-    // register the new loop.
+    // Register the new loop and update the analysis passes.
     updateAnalysis();
  }
 
@@ -127,7 +127,8 @@ private:
   void createEmptyLoop(LoopVectorizationLegality *Legal);
   /// Copy and widen the instructions from the old loop.
   void vectorizeLoop(LoopVectorizationLegality *Legal);
-  /// Insert the new loop to the loop hierarchy and pass manager.
+  /// Insert the new loop to the loop hierarchy and pass manager
+  /// and update the analysis passes.
   void updateAnalysis();
 
   /// This instruction is un-vectorizable. Implement it as a sequence
@@ -304,7 +305,7 @@ private:
   bool isReductionInstr(Instruction *I, ReductionKind Kind);
   /// Returns True, if 'Phi' is an induction variable.
   bool isInductionVariable(PHINode *Phi);
-  /// Return true if we
+  /// Return true if we can compute the address bounds of Ptr within the loop.
   bool hasComputableBounds(Value *Ptr);
 
   /// The loop that we evaluate.
-- 
cgit v1.1


From e123fd9c5956962d414693ee8ad83498413af6ce Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Wed, 14 Nov 2012 00:02:13 +0000
Subject: use the getSplat API. Patch by Paul Redmond.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167892 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index a7ef248..12e4db6 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -549,13 +549,7 @@ Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
 
 Constant*
 SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
-  SmallVector<Constant*, 8> Indices;
-  // Create a vector of consecutive numbers from zero to VF.
-  for (unsigned i = 0; i < VF; ++i)
-    Indices.push_back(ConstantInt::get(ScalarTy, Val, true));
-
-  // Add the consecutive indices to the vector value.
-  return ConstantVector::get(Indices);
+  return ConstantVector::getSplat(VF, ConstantInt::get(ScalarTy, Val, true));
 }
 
 void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
-- 
cgit v1.1


From 77d6ae1b01f86cc82593c8f67118b7c2d6fd17c2 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Fri, 16 Nov 2012 06:51:17 +0000
Subject: LoopVectorize: Division reductions generate incorrect code. Remove
 the part of the code that deals with divs. Thanks to Paul Redmond for
 catching this while reviewing the code.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168142 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 2 --
 1 file changed, 2 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 12e4db6..31e0e86 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1671,8 +1671,6 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I,
     case Instruction::Sub:
       return Kind == IntegerAdd;
     case Instruction::Mul:
-    case Instruction::UDiv:
-    case Instruction::SDiv:
       return Kind == IntegerMult;
     case Instruction::And:
       return Kind == IntegerAnd;
-- 
cgit v1.1


From 9a6823516ffd6ed1787d923459c80a6fa4833914 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sat, 17 Nov 2012 00:27:03 +0000
Subject: LoopVectorizer: Add initial support for pointer induction variables
 (for example: *dst++ = *src++). At the moment we still require to have an
 integer induction variable (for example: i++).

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168231 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 159 +++++++++++++++++++++++------
 1 file changed, 126 insertions(+), 33 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 31e0e86..3f1d82c 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -260,6 +260,10 @@ public:
   /// of the reductions that were found in the loop.
   typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
 
+  /// InductionList saves induction variables and maps them to the initial
+  /// value entering the loop.
+  typedef DenseMap<PHINode*, Value*> InductionList;
+
   /// Returns true if it is legal to vectorize this loop.
   /// This does not mean that it is profitable to vectorize this
   /// loop, only that it is legal to do so.
@@ -271,6 +275,9 @@ public:
   /// Returns the reduction variables found in the loop.
   ReductionList *getReductionVars() { return &Reductions; }
 
+  /// Returns the induction variables found in the loop.
+  InductionList *getInductionVars() { return &Inductions; }
+
   /// Check if the pointer returned by this GEP is consecutive
   /// when the index is vectorized. This happens when the last
   /// index of the GEP is consecutive, like the induction variable.
@@ -317,10 +324,16 @@ private:
 
   //  ---  vectorization state --- //
 
-  /// Holds the induction variable.
+  /// Holds the integer induction variable. This is the counter of the
+  /// loop.
   PHINode *Induction;
   /// Holds the reduction variables.
   ReductionList Reductions;
+  /// Holds all of the induction variables that we found in the loop.
+  /// Notice that inductions don't need to start at zero and that induction
+  /// variables can be pointers.
+  InductionList Inductions;
+
   /// Allowed outside users. This holds the reduction
   /// vars which can be accessed from outside the loop.
   SmallPtrSet<Value*, 4> AllowedExit;
@@ -791,14 +804,50 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   Loc->eraseFromParent();
 
   // We are going to resume the execution of the scalar loop.
-  // This PHI decides on what number to start. If we come from the
-  // vector loop then we need to start with the end index minus the
-  // index modulo VF. If we come from a bypass edge then we need to start
-  // from the real start.
-  PHINode* ResumeIndex = PHINode::Create(IdxTy, 2, "resume.idx",
-                                         MiddleBlock->getTerminator());
-  ResumeIndex->addIncoming(StartIdx, BypassBlock);
-  ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
+  // Go over all of the induction variables that we found and fix the
+  // PHIs that are left in the scalar version of the loop.
+  // The starting values of PHI nodes depend on the counter of the last
+  // iteration in the vectorized loop.
+  // If we come from a bypass edge then we need to start from the original start
+  // value.
+
+  // This variable saves the new starting index for the scalar loop.
+  Value *ResumeIndex = 0;
+  LoopVectorizationLegality::InductionList::iterator I, E;
+  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
+  for (I = List->begin(), E = List->end(); I != E; ++I) {
+    PHINode *OrigPhi = I->first;
+    PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val",
+                                           MiddleBlock->getTerminator());
+    Value *EndValue = 0;
+    if (OrigPhi->getType()->isIntegerTy()) {
+      // Handle the integer induction counter:
+      assert(OrigPhi == OldInduction && "Unknown integer PHI");
+      // We know what the end value is.
+      EndValue = IdxEndRoundDown;
+      // We also know which PHI node holds it.
+      ResumeIndex = ResumeVal;
+    } else {
+      // Pointer induction: offset the start pointer by the end index.
+      // NOTE(review): this looks right only if the index starts at 0 - confirm.
+      EndValue = GetElementPtrInst::Create(I->second, IdxEndRoundDown,
+                                           "ptr.ind.end",
+                                           BypassBlock->getTerminator());
+    }
+
+    // The new PHI merges the original incoming value, in case of a bypass,
+    // or the value at the end of the vectorized loop.
+    ResumeVal->addIncoming(I->second, BypassBlock);
+    ResumeVal->addIncoming(EndValue, VecBody);
+
+    // Fix the scalar body PHI; index the edge on OrigPhi, not OldInduction.
+    unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
+    OrigPhi->setIncomingValue(BlockIdx, ResumeVal);
+  }
+
+  // Make sure that we found the index where scalar loop needs to continue.
+  assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
+         "Invalid resume Index");
 
   // Add a check in the middle block to see if we have completed
   // all of the iterations in the first vector loop.
@@ -822,10 +871,6 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   // Now we have two terminators. Remove the old one from the block.
   VecBody->getTerminator()->eraseFromParent();
 
-  // Fix the scalar body iteration count.
-  unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH);
-  OldInduction->setIncomingValue(BlockIdx, ResumeIndex);
-
   // Get ready to start creating new instructions into the vectorized body.
   Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
 
@@ -895,7 +940,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
   // add the new incoming edges to the PHI. At this point all of the
   // instructions in the basic block are vectorized, so we can use them to
   // construct the PHI.
-  PhiVector PHIsToFix;
+  PhiVector RdxPHIsToFix;
 
   // For each instruction in the old loop.
   for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
@@ -911,13 +956,40 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         // Special handling for the induction var.
         if (OldInduction == Inst)
           continue;
-        // This is phase one of vectorizing PHIs.
-        // This has to be a reduction variable.
-        assert(Legal->getReductionVars()->count(P) && "Not a Reduction");
-        Type *VecTy = VectorType::get(Inst->getType(), VF);
-        WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi");
-        PHIsToFix.push_back(P);
-        continue;
+
+        // Handle reduction variables:
+        if (Legal->getReductionVars()->count(P)) {
+          // This is phase one of vectorizing PHIs.
+          Type *VecTy = VectorType::get(Inst->getType(), VF);
+          WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi");
+          RdxPHIsToFix.push_back(P);
+          continue;
+        }
+
+        // Handle pointer inductions:
+        if (Legal->getInductionVars()->count(P)) {
+          Value *StartIdx = Legal->getInductionVars()->lookup(OldInduction);
+          Value *StartPtr = Legal->getInductionVars()->lookup(P);
+          // This is the normalized GEP that starts counting at zero.
+          Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
+                                                   "normalized.idx");
+          // This is the first GEP in the sequence.
+          Value *FirstGep = Builder.CreateGEP(StartPtr, NormalizedIdx,
+                                              "induc.ptr");
+          // This is the vector of results. Notice that we don't generate vector
+          // geps because scalar geps result in better code.
+          Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
+          for (unsigned int i = 0; i < VF; ++i) {
+            Value *SclrGep = Builder.CreateGEP(FirstGep, Builder.getInt32(i),
+                                               "next.gep");
+            VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
+                                                 Builder.getInt32(i),
+                                                 "insert.gep");
+          }
+
+          WidenMap[Inst] = VecVal;
+          continue;
+        }
       }
       case Instruction::Add:
       case Instruction::FAdd:
@@ -1092,7 +1164,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
   // Create the 'reduced' values for each of the induction vars.
   // The reduced values are the vector values that we scalarize and combine
   // after the loop is finished.
-  for (PhiVector::iterator it = PHIsToFix.begin(), e = PHIsToFix.end();
+  for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
        it != e; ++it) {
     PHINode *RdxPhi = *it;
     PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
@@ -1124,7 +1196,6 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     Value *VectorStart = Builder.CreateInsertElement(Identity,
                                                     RdxDesc.StartValue, Zero);
 
-
     // Fix the vector-loop phi.
     // We created the induction variable so we know that the
     // preheader is the first entry.
@@ -1276,23 +1347,33 @@ bool LoopVectorizationLegality::canVectorize() {
 }
 
 bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
+  BasicBlock *PreHeader = TheLoop->getLoopPreheader();
+
   // Scan the instructions in the block and look for hazards.
   for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
     Instruction *I = it;
 
-    PHINode *Phi = dyn_cast<PHINode>(I);
-    if (Phi) {
+    if (PHINode *Phi = dyn_cast<PHINode>(I)) {
       // This should not happen because the loop should be normalized.
       if (Phi->getNumIncomingValues() != 2) {
         DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
         return false;
       }
-      // We only look at integer phi nodes.
-      if (!Phi->getType()->isIntegerTy()) {
-        DEBUG(dbgs() << "LV: Found an non-int PHI.\n");
+
+      // This is the value coming from the preheader.
+      Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
+
+      // We only look at integer and pointer phi nodes.
+      if (Phi->getType()->isPointerTy() && isInductionVariable(Phi)) {
+        DEBUG(dbgs() << "LV: Found a pointer induction variable.\n");
+        Inductions[Phi] = StartValue;
+        continue;
+      } else if (!Phi->getType()->isIntegerTy()) {
+        DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
         return false;
       }
 
+      // Handle integer PHIs:
       if (isInductionVariable(Phi)) {
         if (Induction) {
           DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
@@ -1300,6 +1381,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
         }
         DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n");
         Induction = Phi;
+        Inductions[Phi] = StartValue;
         continue;
       }
       if (AddReductionVar(Phi, IntegerAdd)) {
@@ -1682,6 +1764,11 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I,
 }
 
 bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
+  Type *PhiTy = Phi->getType();
+  // We only handle integer and pointer induction variables.
+  if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
+    return false;
+
   // Check that the PHI is consecutive and starts at zero.
   const SCEV *PhiScev = SE->getSCEV(Phi);
   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
@@ -1691,11 +1778,17 @@ bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
   }
   const SCEV *Step = AR->getStepRecurrence(*SE);
 
-  if (!Step->isOne()) {
-    DEBUG(dbgs() << "LV: PHI stride does not equal one.\n");
-    return false;
-  }
-  return true;
+  // Integer inductions need to have a stride of one.
+  if (PhiTy->isIntegerTy())
+    return Step->isOne();
+
+  // Calculate the pointer stride and check if it is consecutive.
+  const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
+  if (!C) return false;
+
+  assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
+  uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType());
+  return (C->getValue()->equalsInt(Size));
 }
 
 bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
-- 
cgit v1.1


From 0af63ac245eeb0cce206ed4dbc9abdc0d86742cb Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sun, 25 Nov 2012 08:41:35 +0000
Subject: Add support for pointer induction variables even when there is no
 integer induction variable.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168558 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 287 ++++++++++++++++++-----------
 1 file changed, 182 insertions(+), 105 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3f1d82c..f906432 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -106,9 +106,10 @@ class SingleBlockLoopVectorizer {
 public:
   /// Ctor.
   SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li,
-                            DominatorTree *dt, LPPassManager *Lpm,
+                            DominatorTree *dt, DataLayout *dl,
+                            LPPassManager *Lpm,
                             unsigned VecWidth):
-  OrigLoop(Orig), SE(Se), LI(Li), DT(dt), LPM(Lpm), VF(VecWidth),
+  OrigLoop(Orig), SE(Se), LI(Li), DT(dt), DL(dl), LPM(Lpm), VF(VecWidth),
   Builder(Se->getContext()), Induction(0), OldInduction(0) { }
 
   // Perform the actual loop widening (vectorization).
@@ -167,6 +168,8 @@ private:
   LoopInfo *LI;
   // Dominator Tree.
   DominatorTree *DT;
+  // Data Layout;
+  DataLayout *DL;
   // Loop Pass Manager;
   LPPassManager *LPM;
   // The vectorization factor to use.
@@ -250,10 +253,36 @@ public:
   // This POD struct holds information about the memory runtime legality
   // check that a group of pointers do not overlap.
   struct RuntimePointerCheck {
+    RuntimePointerCheck(): Need(false) {}
+
+    /// Reset the state of the pointer runtime information.
+    void reset() {
+      Need = false;
+      Pointers.clear();
+      Starts.clear();
+      Ends.clear();
+    }
+
+    /// Insert a pointer and calculate the start and end SCEVs.
+    void insert_pointer(ScalarEvolution *SE, Loop *Lp, Value *Ptr) {
+      const SCEV *Sc = SE->getSCEV(Ptr);
+      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
+      assert(AR && "Invalid addrec expression");
+      const SCEV *Ex = SE->getExitCount(Lp, Lp->getHeader());
+      const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
+      Pointers.push_back(Ptr);
+      Starts.push_back(AR->getStart());
+      Ends.push_back(ScEnd);
+    }
+
     /// This flag indicates if we need to add the runtime check.
     bool Need;
     /// Holds the pointers that we need to check.
     SmallVector<Value*, 2> Pointers;
+    /// Holds the pointer value at the beginning of the loop.
+    SmallVector<const SCEV*, 2> Starts;
+    /// Holds the pointer value at the end of the loop.
+    SmallVector<const SCEV*, 2> Ends;
   };
 
   /// ReductionList contains the reduction descriptors for all
@@ -278,11 +307,11 @@ public:
   /// Returns the induction variables found in the loop.
   InductionList *getInductionVars() { return &Inductions; }
 
-  /// Check if the pointer returned by this GEP is consecutive
-  /// when the index is vectorized. This happens when the last
-  /// index of the GEP is consecutive, like the induction variable.
+  /// Check if this pointer is consecutive when vectorizing. This happens
+  /// when the last index of the GEP is the induction variable, or when the
+  /// pointer itself is an induction variable.
   /// This check allows us to vectorize A[idx] into a wide load/store.
-  bool isConsecutiveGep(Value *Ptr);
+  bool isConsecutivePtr(Value *Ptr);
 
   /// Returns true if the value V is uniform within the loop.
   bool isUniform(Value *V);
@@ -451,7 +480,7 @@ struct LoopVectorize : public LoopPass {
           "\n");
 
     // If we decided that it is *legal* to vectorizer the loop then do it.
-    SingleBlockLoopVectorizer LB(L, SE, LI, DT, &LPM, VF);
+    SingleBlockLoopVectorizer LB(L, SE, LI, DT, DL, &LPM, VF);
     LB.vectorize(&LVL);
 
     DEBUG(verifyFunction(*L->getHeader()->getParent()));
@@ -472,10 +501,6 @@ struct LoopVectorize : public LoopPass {
 };
 
 Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
-  // Instructions that access the old induction variable
-  // actually want to get the new one.
-  if (V == OldInduction)
-    V = Induction;
   // Create the types.
   LLVMContext &C = V->getContext();
   Type *VTy = VectorType::get(V->getType(), VF);
@@ -515,7 +540,14 @@ Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) {
   return Builder.CreateAdd(Val, Cv, "induction");
 }
 
-bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) {
+bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
+  assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr");
+
+  // If this pointer is an induction variable, it is consecutive.
+  PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
+  if (Phi && getInductionVars()->count(Phi))
+    return true;
+
   GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
   if (!Gep)
     return false;
@@ -576,7 +608,7 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
 
     // If we are accessing the old induction variable, use the new one.
     if (SrcOp == OldInduction) {
-      Params.push_back(getBroadcastInstrs(Induction));
+      Params.push_back(getVectorValue(Induction));
       continue;
     }
 
@@ -666,9 +698,13 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
    ...
    */
 
+  // Some loops have a single integer induction variable, while other loops
+  // don't. One example is c++ iterators that often have multiple pointer
+  // induction variables. In the code below we also support a case where we
+  // don't have a single induction variable.
   OldInduction = Legal->getInduction();
-  assert(OldInduction && "We must have a single phi node.");
-  Type *IdxTy = OldInduction->getType();
+  Type *IdxTy = OldInduction ? OldInduction->getType() :
+    DL->getIntPtrType(SE->getContext());
 
   // Find the loop boundaries.
   const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
@@ -677,19 +713,18 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   // Get the total trip count from the count by adding 1.
   ExitCount = SE->getAddExpr(ExitCount,
                              SE->getConstant(ExitCount->getType(), 1));
-  // We may need to extend the index in case there is a type mismatch.
-  // We know that the count starts at zero and does not overflow.
-  // We are using Zext because it should be less expensive.
-  if (ExitCount->getType() != IdxTy)
-    ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy);
 
   // This is the original scalar-loop preheader.
   BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
   assert(ExitBlock && "Must have an exit block");
 
-  // The loop index does not have to start at Zero. It starts with this value.
-  Value *StartIdx = OldInduction->getIncomingValueForBlock(BypassBlock);
+  // The loop index does not have to start at Zero. Find the original start
+  // value from the induction PHI node. If we don't have an induction variable
+  // then we know that it starts at zero.
+  Value *StartIdx = OldInduction ?
+    OldInduction->getIncomingValueForBlock(BypassBlock):
+    ConstantInt::get(IdxTy, 0);
 
   assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop");
   assert(BypassBlock && "Invalid loop structure");
@@ -721,7 +756,18 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   Instruction *Loc = BypassBlock->getTerminator();
 
   // Count holds the overall loop count (N).
-  Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc);
+  Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), Loc);
+
+  // We may need to extend the index in case there is a type mismatch.
+  // We know that the count starts at zero and does not overflow.
+  if (Count->getType() != IdxTy) {
+    // The exit count can be of pointer type. Convert it to the correct
+    // integer type.
+    if (ExitCount->getType()->isPointerTy())
+      Count = CastInst::CreatePointerCast(Count, IdxTy, "ptrcnt.to.int", Loc);
+    else
+      Count = CastInst::CreateZExtOrBitCast(Count, IdxTy, "zext.cnt", Loc);
+  }
 
   // Add the start index to the loop count to get the new end index.
   Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc);
@@ -734,7 +780,8 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx,
                                                      "end.idx.rnd.down", Loc);
 
-  // Now, compare the new count to zero. If it is zero, jump to the scalar part.
+  // Now, compare the new count to zero. If it is zero skip the vector loop and
+  // jump to the scalar loop.
   Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
                                IdxEndRoundDown,
                                StartIdx,
@@ -762,23 +809,21 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
         Ends.push_back(Ptr);
       } else {
         DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");
-        const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
-        Value *Start = Exp.expandCodeFor(AR->getStart(), PtrArithTy, Loc);
-        const SCEV *Ex = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
-        const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
-        assert(!isa<SCEVCouldNotCompute>(ScEnd) && "Invalid scev range.");
-        Value *End = Exp.expandCodeFor(ScEnd, PtrArithTy, Loc);
+
+        Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i],
+                                         PtrArithTy, Loc);
+        Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
         Starts.push_back(Start);
         Ends.push_back(End);
       }
     }
 
-    for (unsigned i=0; i < NumPointers; ++i) {
-      for (unsigned j=i+1; j < NumPointers; ++j) {
+    for (unsigned i = 0; i < NumPointers; ++i) {
+      for (unsigned j = i+1; j < NumPointers; ++j) {
         Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
-                                      Starts[0], Ends[1], "bound0", Loc);
+                                      Starts[i], Ends[j], "bound0", Loc);
         Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
-                                      Starts[1], Ends[0], "bound1", Loc);
+                                      Starts[j], Ends[i], "bound1", Loc);
         Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1,
                                                     "found.conflict", Loc);
         if (MemoryRuntimeCheck) {
@@ -812,7 +857,7 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   // value.
 
   // This variable saves the new starting index for the scalar loop.
-  Value *ResumeIndex = 0;
+  PHINode *ResumeIndex = 0;
   LoopVectorizationLegality::InductionList::iterator I, E;
   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
   for (I = List->begin(), E = List->end(); I != E; ++I) {
@@ -830,7 +875,7 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
     } else {
       // For pointer induction variables, calculate the offset using
       // the end index.
-      EndValue = GetElementPtrInst::Create(I->second, IdxEndRoundDown,
+      EndValue = GetElementPtrInst::Create(I->second, CountRoundDown,
                                            "ptr.ind.end",
                                            BypassBlock->getTerminator());
     }
@@ -841,10 +886,22 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
     ResumeVal->addIncoming(EndValue, VecBody);
 
     // Fix the scalar body counter (PHI node).
-    unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH);
+    unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
     OrigPhi->setIncomingValue(BlockIdx, ResumeVal);
   }
 
+  // If we are generating a new induction variable then we also need to
+  // generate the code that calculates the exit value. This value is not
+  // simply the end of the counter because we may skip the vectorized body
+  // in case of a runtime check.
+  if (!OldInduction){
+    assert(!ResumeIndex && "Unexpected resume value found");
+    ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
+                                  MiddleBlock->getTerminator());
+    ResumeIndex->addIncoming(StartIdx, BypassBlock);
+    ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
+  }
+
   // Make sure that we found the index where scalar loop needs to continue.
   assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
          "Invalid resume Index");
@@ -953,43 +1010,54 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         continue;
       case Instruction::PHI:{
         PHINode* P = cast<PHINode>(Inst);
-        // Special handling for the induction var.
-        if (OldInduction == Inst)
-          continue;
-
         // Handle reduction variables:
         if (Legal->getReductionVars()->count(P)) {
           // This is phase one of vectorizing PHIs.
           Type *VecTy = VectorType::get(Inst->getType(), VF);
-          WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi");
+          WidenMap[Inst] = PHINode::Create(VecTy, 2, "vec.phi",
+                                  LoopVectorBody->getFirstInsertionPt());
           RdxPHIsToFix.push_back(P);
           continue;
         }
 
-        // Handle pointer inductions:
-        if (Legal->getInductionVars()->count(P)) {
-          Value *StartIdx = Legal->getInductionVars()->lookup(OldInduction);
-          Value *StartPtr = Legal->getInductionVars()->lookup(P);
-          // This is the normalized GEP that starts counting at zero.
-          Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
-                                                   "normalized.idx");
-          // This is the first GEP in the sequence.
-          Value *FirstGep = Builder.CreateGEP(StartPtr, NormalizedIdx,
-                                              "induc.ptr");
-          // This is the vector of results. Notice that we don't generate vector
-          // geps because scalar geps result in better code.
-          Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
-          for (unsigned int i = 0; i < VF; ++i) {
-            Value *SclrGep = Builder.CreateGEP(FirstGep, Builder.getInt32(i),
-                                               "next.gep");
-            VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
-                                                 Builder.getInt32(i),
-                                                 "insert.gep");
-          }
-
-          WidenMap[Inst] = VecVal;
+        // This PHINode must be an induction variable.
+        // Make sure that we know about it.
+        assert(Legal->getInductionVars()->count(P) &&
+               "Not an induction variable");
+
+        if (P->getType()->isIntegerTy()) {
+          assert(P == OldInduction && "Unexpected PHI");
+          WidenMap[Inst] = getBroadcastInstrs(Induction);
           continue;
         }
+
+        // Handle pointer inductions:
+        assert(P->getType()->isPointerTy() && "Unexpected type.");
+        Value *StartIdx = OldInduction ?
+          Legal->getInductionVars()->lookup(OldInduction) :
+          ConstantInt::get(Induction->getType(), 0);
+
+        // This is the pointer value coming into the loop.
+        Value *StartPtr = Legal->getInductionVars()->lookup(P);
+
+        // This is the normalized GEP that starts counting at zero.
+        Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
+                                                 "normalized.idx");
+
+        // This is the vector of results. Notice that we don't generate vector
+        // geps because scalar geps result in better code.
+        Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
+        for (unsigned int i = 0; i < VF; ++i) {
+          Constant *Idx = ConstantInt::get(Induction->getType(), i);
+          Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
+          Value *SclrGep = Builder.CreateGEP(StartPtr, GlobalIdx, "next.gep");
+          VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
+                                               Builder.getInt32(i),
+                                               "insert.gep");
+        }
+
+        WidenMap[Inst] = VecVal;
+        continue;
       }
       case Instruction::Add:
       case Instruction::FAdd:
@@ -1076,21 +1144,27 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
 
         // This store does not use GEPs.
-        if (!Legal->isConsecutiveGep(Gep)) {
+        if (!Legal->isConsecutivePtr(Ptr)) {
           scalarizeInstruction(Inst);
           break;
         }
 
-        // The last index does not have to be the induction. It can be
-        // consecutive and be a function of the index. For example A[I+1];
-        unsigned NumOperands = Gep->getNumOperands();
-        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1));
-        LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
-
-        // Create the new GEP with the new induction variable.
-        GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
-        Gep2->setOperand(NumOperands - 1, LastIndex);
-        Ptr = Builder.Insert(Gep2);
+        if (Gep) {
+          // The last index does not have to be the induction. It can be
+          // consecutive and be a function of the index. For example A[I+1];
+          unsigned NumOperands = Gep->getNumOperands();
+          Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1));
+          LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
+
+          // Create the new GEP with the new induction variable.
+          GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+          Gep2->setOperand(NumOperands - 1, LastIndex);
+          Ptr = Builder.Insert(Gep2);
+        } else {
+          // Use the induction element ptr.
+          assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
+          Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
+        }
         Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
         Value *Val = getVectorValue(SI->getValueOperand());
         Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
@@ -1104,23 +1178,31 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         unsigned Alignment = LI->getAlignment();
         GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
 
-        // If we don't have a gep, or that the pointer is loop invariant,
+        // If the pointer is loop invariant or if it is non consecutive,
         // scalarize the load.
-        if (!Gep || Legal->isUniform(Gep) || !Legal->isConsecutiveGep(Gep)) {
+        bool Con = Legal->isConsecutivePtr(Ptr);
+        if (Legal->isUniform(Ptr) || !Con) {
           scalarizeInstruction(Inst);
           break;
         }
 
-        // The last index does not have to be the induction. It can be
-        // consecutive and be a function of the index. For example A[I+1];
-        unsigned NumOperands = Gep->getNumOperands();
-        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
-        LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
+        if (Gep) {
+          // The last index does not have to be the induction. It can be
+          // consecutive and be a function of the index. For example A[I+1];
+          unsigned NumOperands = Gep->getNumOperands();
+          Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
+          LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
+
+          // Create the new GEP with the new induction variable.
+          GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+          Gep2->setOperand(NumOperands - 1, LastIndex);
+          Ptr = Builder.Insert(Gep2);
+        } else {
+          // Use the induction element ptr.
+          assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
+          Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
+        }
 
-        // Create the new GEP with the new induction variable.
-        GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
-        Gep2->setOperand(NumOperands - 1, LastIndex);
-        Ptr = Builder.Insert(Gep2);
         Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
         LI = Builder.CreateLoad(Ptr);
         LI->setAlignment(Alignment);
@@ -1301,7 +1383,7 @@ bool LoopVectorizationLegality::canVectorize() {
   if (!TheLoop->getLoopPreheader()) {
     assert(false && "No preheader!!");
     DEBUG(dbgs() << "LV: Loop not normalized." << "\n");
-    return  false;
+    return false;
   }
 
   // We can only vectorize single basic block loops.
@@ -1347,6 +1429,7 @@ bool LoopVectorizationLegality::canVectorize() {
 }
 
 bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
+
   BasicBlock *PreHeader = TheLoop->getLoopPreheader();
 
   // Scan the instructions in the block and look for hazards.
@@ -1440,8 +1523,8 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
   } // next instr.
 
   if (!Induction) {
-      DEBUG(dbgs() << "LV: Did not find an induction var.\n");
-      return false;
+    DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
+    assert(getInductionVars()->size() && "No induction variables");
   }
 
   // Don't vectorize if the memory dependencies do not allow vectorization.
@@ -1458,15 +1541,10 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
   while (Worklist.size()) {
     Instruction *I = dyn_cast<Instruction>(Worklist.back());
     Worklist.pop_back();
-    // Look at instructions inside this block.
-    if (!I) continue;
-    if (I->getParent() != &BB) continue;
 
-    // Stop when reaching PHI nodes.
-    if (isa<PHINode>(I)) {
-      assert(I == Induction && "Found a uniform PHI that is not the induction");
-      break;
-    }
+    // Look at instructions inside this block. Stop when reaching PHI nodes.
+    if (!I || I->getParent() != &BB || isa<PHINode>(I))
+      continue;
 
     // This is a known uniform.
     Uniforms.insert(I);
@@ -1569,7 +1647,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
     // If the address of i is unknown (for example A[B[i]]) then we may
     // read a few words, modify, and write a few words, and some of the
     // words may be written to the same address.
-    if (Seen.insert(Ptr) || !isConsecutiveGep(Ptr))
+    if (Seen.insert(Ptr) || !isConsecutivePtr(Ptr))
       Reads.push_back(Ptr);
   }
 
@@ -1585,7 +1663,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
   bool RT = true;
   for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I)
     if (hasComputableBounds(*I)) {
-      PtrRtCheck.Pointers.push_back(*I);
+      PtrRtCheck.insert_pointer(SE, TheLoop, *I);
       DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
     } else {
       RT = false;
@@ -1593,7 +1671,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
     }
   for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I)
     if (hasComputableBounds(*I)) {
-      PtrRtCheck.Pointers.push_back(*I);
+      PtrRtCheck.insert_pointer(SE, TheLoop, *I);
       DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
     } else {
       RT = false;
@@ -1603,7 +1681,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
   // Check that we did not collect too many pointers or found a
   // unsizeable pointer.
   if (!RT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) {
-    PtrRtCheck.Pointers.clear();
+    PtrRtCheck.reset();
     RT = false;
   }
 
@@ -1658,8 +1736,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
 
   // It is safe to vectorize and we don't need any runtime checks.
   DEBUG(dbgs() << "LV: We don't need a runtime memory check.\n");
-  PtrRtCheck.Pointers.clear();
-  PtrRtCheck.Need = false;
+  PtrRtCheck.reset();
   return true;
 }
 
@@ -1917,7 +1994,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
                               SI->getAlignment(), SI->getPointerAddressSpace());
 
       // Scalarized stores.
-      if (!Legal->isConsecutiveGep(SI->getPointerOperand())) {
+      if (!Legal->isConsecutivePtr(SI->getPointerOperand())) {
         unsigned Cost = 0;
         unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
                                               ValTy);
@@ -1944,7 +2021,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
                                      LI->getPointerAddressSpace());
 
       // Scalarized loads.
-      if (!Legal->isConsecutiveGep(LI->getPointerOperand())) {
+      if (!Legal->isConsecutivePtr(LI->getPointerOperand())) {
         unsigned Cost = 0;
         unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy);
         // The cost of inserting the loaded value into the result vector.
-- 
cgit v1.1


From d7e0b7cdc59d04bec6cf5273cde3d2a16b13800e Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sun, 25 Nov 2012 09:09:26 +0000
Subject: The induction-pointer work is inspired by a research paper. This
 commit adds a reference.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168559 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index f906432..84516eb 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -25,6 +25,7 @@
 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
 //    of vectorization. It decides on the optimal vector width, which
 //    can be one, if vectorization is not profitable.
+//
 //===----------------------------------------------------------------------===//
 //
 // The reduction-variable vectorization is based on the paper:
@@ -36,6 +37,9 @@
 // Other ideas/concepts are from:
 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
 //
+//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
+//  Vectorizing Compilers.
+//
 //===----------------------------------------------------------------------===//
 #define LV_NAME "loop-vectorize"
 #define DEBUG_TYPE LV_NAME
-- 
cgit v1.1


From a8e2b2b68fed9883bd41335f57f1193ffcc22ed2 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sun, 25 Nov 2012 09:13:57 +0000
Subject: Rename method. No functionality change.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168560 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 84516eb..8ed4caf 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -268,7 +268,7 @@ public:
     }
 
     /// Insert a pointer and calculate the start and end SCEVs.
-    void insert_pointer(ScalarEvolution *SE, Loop *Lp, Value *Ptr) {
+    void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr) {
       const SCEV *Sc = SE->getSCEV(Ptr);
       const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
       assert(AR && "Invalid addrec expression");
@@ -1667,7 +1667,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
   bool RT = true;
   for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I)
     if (hasComputableBounds(*I)) {
-      PtrRtCheck.insert_pointer(SE, TheLoop, *I);
+      PtrRtCheck.insert(SE, TheLoop, *I);
       DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
     } else {
       RT = false;
@@ -1675,7 +1675,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
     }
   for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I)
     if (hasComputableBounds(*I)) {
-      PtrRtCheck.insert_pointer(SE, TheLoop, *I);
+      PtrRtCheck.insert(SE, TheLoop, *I);
       DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
     } else {
       RT = false;
-- 
cgit v1.1


From 8c6b73666bdd08f15b31c00bd2fd663b632a1d65 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sun, 25 Nov 2012 16:27:16 +0000
Subject: Refactor the ptr runtime check generation code. No functionality
 change.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168568 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 116 ++++++++++++++++-------------
 1 file changed, 66 insertions(+), 50 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8ed4caf..2ca5fea 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -128,6 +128,10 @@ public:
  }
 
 private:
+  /// Add code that checks at runtime if the accessed arrays overlap.
+  /// Returns the comparator value or NULL if no check is needed.
+  Value*  addRuntimeCheck(LoopVectorizationLegality *Legal,
+                          Instruction *Loc);
   /// Create an empty loop, based on the loop ranges of the old loop.
   void createEmptyLoop(LoopVectorizationLegality *Legal);
   /// Copy and widen the instructions from the old loop.
@@ -671,6 +675,67 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
     WidenMap[Instr] = VecResults;
 }
 
+Value*
+SingleBlockLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
+                                           Instruction *Loc) {
+  LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
+    Legal->getRuntimePointerCheck();
+
+  if (!PtrRtCheck->Need)
+    return NULL;
+
+  Value *MemoryRuntimeCheck = 0;
+  unsigned NumPointers = PtrRtCheck->Pointers.size();
+  SmallVector<Value* , 2> Starts;
+  SmallVector<Value* , 2> Ends;
+
+  SCEVExpander Exp(*SE, "induction");
+
+  // Use this type for pointer arithmetic.
+  Type* PtrArithTy = PtrRtCheck->Pointers[0]->getType();
+
+  for (unsigned i=0; i < NumPointers; ++i) {
+    Value *Ptr = PtrRtCheck->Pointers[i];
+    const SCEV *Sc = SE->getSCEV(Ptr);
+
+    if (SE->isLoopInvariant(Sc, OrigLoop)) {
+      DEBUG(dbgs() << "LV1: Adding RT check for a loop invariant ptr:" <<
+            *Ptr <<"\n");
+      Starts.push_back(Ptr);
+      Ends.push_back(Ptr);
+    } else {
+      DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");
+
+      Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i],
+                                       PtrArithTy, Loc);
+      Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
+      Starts.push_back(Start);
+      Ends.push_back(End);
+    }
+  }
+
+  for (unsigned i = 0; i < NumPointers; ++i) {
+    for (unsigned j = i+1; j < NumPointers; ++j) {
+      Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
+                                    Starts[i], Ends[j], "bound0", Loc);
+      Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
+                                    Starts[j], Ends[i], "bound1", Loc);
+      Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1,
+                                                 "found.conflict", Loc);
+      if (MemoryRuntimeCheck) {
+        MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or,
+                                                    MemoryRuntimeCheck,
+                                                    IsConflict,
+                                                    "conflict.rdx", Loc);
+      } else {
+        MemoryRuntimeCheck = IsConflict;
+      }
+    }
+  }
+
+  return MemoryRuntimeCheck;
+}
+
 void
 SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   /*
@@ -791,56 +856,7 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
                                StartIdx,
                                "cmp.zero", Loc);
 
-  LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
-    Legal->getRuntimePointerCheck();
-  Value *MemoryRuntimeCheck = 0;
-  if (PtrRtCheck->Need) {
-    unsigned NumPointers = PtrRtCheck->Pointers.size();
-    SmallVector<Value* , 2> Starts;
-    SmallVector<Value* , 2> Ends;
-
-    // Use this type for pointer arithmetic.
-    Type* PtrArithTy = PtrRtCheck->Pointers[0]->getType();
-
-    for (unsigned i=0; i < NumPointers; ++i) {
-      Value *Ptr = PtrRtCheck->Pointers[i];
-      const SCEV *Sc = SE->getSCEV(Ptr);
-
-      if (SE->isLoopInvariant(Sc, OrigLoop)) {
-        DEBUG(dbgs() << "LV1: Adding RT check for a loop invariant ptr:" <<
-              *Ptr <<"\n");
-        Starts.push_back(Ptr);
-        Ends.push_back(Ptr);
-      } else {
-        DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");
-
-        Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i],
-                                         PtrArithTy, Loc);
-        Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
-        Starts.push_back(Start);
-        Ends.push_back(End);
-      }
-    }
-
-    for (unsigned i = 0; i < NumPointers; ++i) {
-      for (unsigned j = i+1; j < NumPointers; ++j) {
-        Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
-                                      Starts[i], Ends[j], "bound0", Loc);
-        Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
-                                      Starts[j], Ends[i], "bound1", Loc);
-        Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1,
-                                                    "found.conflict", Loc);
-        if (MemoryRuntimeCheck) {
-          MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or,
-                                                      MemoryRuntimeCheck,
-                                                      IsConflict,
-                                                      "conflict.rdx", Loc);
-        } else {
-          MemoryRuntimeCheck = IsConflict;
-        }
-      }
-    }
-  }// end of need-runtime-check code.
+  Value *MemoryRuntimeCheck = addRuntimeCheck(Legal, Loc);  
 
   // If we are using memory runtime checks, include them in.
   if (MemoryRuntimeCheck) {
-- 
cgit v1.1


From 00e8074ee24449e5d0e50b626290b499b8bd78b1 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sun, 25 Nov 2012 16:39:01 +0000
Subject: Fix the document style.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168569 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2ca5fea..9cba136 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -118,10 +118,10 @@ public:
 
   // Perform the actual loop widening (vectorization).
   void vectorize(LoopVectorizationLegality *Legal) {
-    ///Create a new empty loop. Unlink the old loop and connect the new one.
+    // Create a new empty loop. Unlink the old loop and connect the new one.
     createEmptyLoop(Legal);
-    /// Widen each instruction in the old loop to a new one in the new loop.
-    /// Use the Legality module to find the induction and reduction variables.
+    // Widen each instruction in the old loop to a new one in the new loop.
+    // Use the Legality module to find the induction and reduction variables.
     vectorizeLoop(Legal);
     // Register the new loop and update the analysis passes.
     updateAnalysis();
-- 
cgit v1.1


From 6bfc3481bd8995906af4c15131feeae665a197c6 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sun, 25 Nov 2012 16:48:08 +0000
Subject: Move the max vector width to a constant parameter. No functionality
 change.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168570 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 9cba136..e79d526 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -86,6 +86,9 @@ const unsigned TinyTripCountThreshold = 16;
 /// number of pointers. Notice that the check is quadratic!
 const unsigned RuntimeMemoryCheckThreshold = 2;
 
+/// This is the highest vector width that we try to generate.
+const unsigned MaxVectorSize = 8;
+
 namespace {
 
 // Forward declarations.
@@ -130,7 +133,7 @@ public:
 private:
   /// Add code that checks at runtime if the accessed arrays overlap.
   /// Returns the comperator value or NULL if no check is needed.
-  Value*  addRuntimeCheck(LoopVectorizationLegality *Legal,
+  Value* addRuntimeCheck(LoopVectorizationLegality *Legal,
                           Instruction *Loc);
   /// Create an empty loop, based on the loop ranges of the old loop.
   void createEmptyLoop(LoopVectorizationLegality *Legal);
@@ -400,7 +403,7 @@ public:
   /// Returns the most profitable vectorization factor for the loop that is
   /// smaller or equal to the VF argument. This method checks every power
   /// of two up to VF.
-  unsigned findBestVectorizationFactor(unsigned VF = 8);
+  unsigned findBestVectorizationFactor(unsigned VF = MaxVectorSize);
 
 private:
   /// Returns the expected execution cost. The unit of the cost does
@@ -856,13 +859,12 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
                                StartIdx,
                                "cmp.zero", Loc);
 
-  Value *MemoryRuntimeCheck = addRuntimeCheck(Legal, Loc);  
+  Value *MemoryRuntimeCheck = addRuntimeCheck(Legal, Loc);
 
   // If we are using memory runtime checks, include them in.
-  if (MemoryRuntimeCheck) {
+  if (MemoryRuntimeCheck)
     Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck,
                                  "CntOrMem", Loc);
-  }
 
   BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc);
   // Remove the old terminator.
-- 
cgit v1.1


From 22689b6034dbde1c08e39b162fd59e1fd5d35b96 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Mon, 26 Nov 2012 19:51:46 +0000
Subject: Move the code that uses SCEVs prior to creating the new loops.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168601 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 67 ++++++++++++++++--------------
 1 file changed, 35 insertions(+), 32 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index e79d526..55733f7 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -113,10 +113,10 @@ class SingleBlockLoopVectorizer {
 public:
   /// Ctor.
   SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li,
-                            DominatorTree *dt, DataLayout *dl,
+                            DominatorTree *Dt, DataLayout *Dl,
                             LPPassManager *Lpm,
                             unsigned VecWidth):
-  OrigLoop(Orig), SE(Se), LI(Li), DT(dt), DL(dl), LPM(Lpm), VF(VecWidth),
+  OrigLoop(Orig), SE(Se), LI(Li), DT(Dt), DL(Dl), LPM(Lpm), VF(VecWidth),
   Builder(Se->getContext()), Induction(0), OldInduction(0) { }
 
   // Perform the actual loop widening (vectorization).
@@ -133,8 +133,8 @@ public:
 private:
   /// Add code that checks at runtime if the accessed arrays overlap.
   /// Returns the comperator value or NULL if no check is needed.
-  Value* addRuntimeCheck(LoopVectorizationLegality *Legal,
-                          Instruction *Loc);
+  Value *addRuntimeCheck(LoopVectorizationLegality *Legal,
+                         Instruction *Loc);
   /// Create an empty loop, based on the loop ranges of the old loop.
   void createEmptyLoop(LoopVectorizationLegality *Legal);
   /// Copy and widen the instructions from the old loop.
@@ -179,7 +179,7 @@ private:
   LoopInfo *LI;
   // Dominator Tree.
   DominatorTree *DT;
-  // Data Layout;
+  // Data Layout.
   DataLayout *DL;
   // Loop Pass Manager;
   LPPassManager *LPM;
@@ -725,14 +725,14 @@ SingleBlockLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
                                     Starts[j], Ends[i], "bound1", Loc);
       Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1,
                                                  "found.conflict", Loc);
-      if (MemoryRuntimeCheck) {
+      if (MemoryRuntimeCheck)
         MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or,
                                                     MemoryRuntimeCheck,
                                                     IsConflict,
                                                     "conflict.rdx", Loc);
-      } else {
+      else
         MemoryRuntimeCheck = IsConflict;
-      }
+
     }
   }
 
@@ -770,6 +770,11 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
    ...
    */
 
+  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
+  BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
+  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
+  assert(ExitBlock && "Must have an exit block");
+
   // Some loops have a single integer induction variable, while other loops
   // don't. One example is c++ iterators that often have multiple pointer
   // induction variables. In the code below we also support a case where we
@@ -786,10 +791,13 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   ExitCount = SE->getAddExpr(ExitCount,
                              SE->getConstant(ExitCount->getType(), 1));
 
-  // This is the original scalar-loop preheader.
-  BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
-  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
-  assert(ExitBlock && "Must have an exit block");
+  // Expand the trip count and place the new instructions in the preheader.
+  // Notice that the pre-header does not change, only the loop body.
+  SCEVExpander Exp(*SE, "induction");
+
+  // Count holds the overall loop count (N).
+  Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
+                                   BypassBlock->getTerminator());
 
   // The loop index does not have to start at Zero. Find the original start
   // value from the induction PHI node. If we don't have an induction variable
@@ -801,18 +809,23 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop");
   assert(BypassBlock && "Invalid loop structure");
 
+  // Generate the code that checks in runtime if arrays overlap.
+  Value *MemoryRuntimeCheck = addRuntimeCheck(Legal,
+                                              BypassBlock->getTerminator());
+
+  // Split the single block loop into the two loop structure described above.
   BasicBlock *VectorPH =
       BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
-  BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(),
-                                                 "vector.body");
-
-  BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(),
-                                                  "middle.block");
+  BasicBlock *VecBody =
+    VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
+  BasicBlock *MiddleBlock =
+    VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
   BasicBlock *ScalarPH =
-    MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(),
-                                 "scalar.preheader");
-  // Find the induction variable.
-  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
+    MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
+
+  // This is the location in which we add all of the logic for bypassing
+  // the new vector loop.
+  Instruction *Loc = BypassBlock->getTerminator();
 
   // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
   // inside the loop.
@@ -822,14 +835,6 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   Induction = Builder.CreatePHI(IdxTy, 2, "index");
   Constant *Step = ConstantInt::get(IdxTy, VF);
 
-  // Expand the trip count and place the new instructions in the preheader.
-  // Notice that the pre-header does not change, only the loop body.
-  SCEVExpander Exp(*SE, "induction");
-  Instruction *Loc = BypassBlock->getTerminator();
-
-  // Count holds the overall loop count (N).
-  Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), Loc);
-
   // We may need to extend the index in case there is a type mismatch.
   // We know that the count starts at zero and does not overflow.
   if (Count->getType() != IdxTy) {
@@ -859,8 +864,6 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
                                StartIdx,
                                "cmp.zero", Loc);
 
-  Value *MemoryRuntimeCheck = addRuntimeCheck(Legal, Loc);
-
   // If we are using memory runtime checks, include them in.
   if (MemoryRuntimeCheck)
     Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck,
@@ -1053,7 +1056,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
           continue;
         }
 
-        // Handle pointer inductions:
+        // Handle pointer inductions.
         assert(P->getType()->isPointerTy() && "Unexpected type.");
         Value *StartIdx = OldInduction ?
           Legal->getInductionVars()->lookup(OldInduction) :
-- 
cgit v1.1


From dddaad624e388529f1f933939000d6dbc07c107d Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Thu, 29 Nov 2012 19:25:41 +0000
Subject: When broadcasting invariant scalars into vectors, place the broadcast
 code in the preheader.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168927 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 40 ++++++++++++++++++++++--------
 1 file changed, 29 insertions(+), 11 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 55733f7..35e2d05 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -516,6 +516,17 @@ Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
   LLVMContext &C = V->getContext();
   Type *VTy = VectorType::get(V->getType(), VF);
   Type *I32 = IntegerType::getInt32Ty(C);
+
+  // Save the current insertion location.
+  Instruction *Loc = Builder.GetInsertPoint();
+
+  // We need to place the broadcast of invariant variables outside the loop.
+  bool Invariant = (OrigLoop->isLoopInvariant(V) && V != Induction);
+
+  // Place the code for broadcasting invariant variables in the new preheader.
+  if (Invariant)
+    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+
   Constant *Zero = ConstantInt::get(I32, 0);
   Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF));
   Value *UndefVal = UndefValue::get(VTy);
@@ -524,10 +535,11 @@ Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
   // Broadcast the scalar into all locations in the vector.
   Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros,
                                              "broadcast");
-  // We are accessing the induction variable. Make sure to promote the
-  // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes.
-  if (V == Induction)
-    return getConsecutiveVector(Shuf);
+
+  // Restore the builder insertion point.
+  if (Invariant)
+    Builder.SetInsertPoint(Loc);
+
   return Shuf;
 }
 
@@ -571,7 +583,7 @@ bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
     if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
       return false;
 
-  // We can emit wide load/stores only of the last index is the induction
+  // We can emit wide load/stores only if the last index is the induction
   // variable.
   const SCEV *Last = SE->getSCEV(LastIndex);
   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
@@ -591,6 +603,7 @@ bool LoopVectorizationLegality::isUniform(Value *V) {
 }
 
 Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
+  assert(V != Induction && "The new induction variable should not be used.");
   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
   // If we saved a vectorized copy of V, use it.
   Value *&MapEntry = WidenMap[V];
@@ -619,7 +632,7 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
 
     // If we are accessing the old induction variable, use the new one.
     if (SrcOp == OldInduction) {
-      Params.push_back(getVectorValue(Induction));
+      Params.push_back(getVectorValue(SrcOp));
       continue;
     }
 
@@ -697,7 +710,7 @@ SingleBlockLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
   // Use this type for pointer arithmetic.
   Type* PtrArithTy = PtrRtCheck->Pointers[0]->getType();
 
-  for (unsigned i=0; i < NumPointers; ++i) {
+  for (unsigned i = 0; i < NumPointers; ++i) {
     Value *Ptr = PtrRtCheck->Pointers[i];
     const SCEV *Sc = SE->getSCEV(Ptr);
 
@@ -1016,7 +1029,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
 
   // In order to support reduction variables we need to be able to vectorize
   // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
-  // steages. First, we create a new vector PHI node with no incoming edges.
+  // stages. First, we create a new vector PHI node with no incoming edges.
   // We use this value when we vectorize all of the instructions that use the
   // PHI. Next, after all of the instructions in the block are complete we
   // add the new incoming edges to the PHI. At this point all of the
@@ -1052,7 +1065,12 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
 
         if (P->getType()->isIntegerTy()) {
           assert(P == OldInduction && "Unexpected PHI");
-          WidenMap[Inst] = getBroadcastInstrs(Induction);
+          Value *Broadcasted = getBroadcastInstrs(Induction);
+          // After broadcasting the induction variable we need to make the
+          // vector consecutive by adding 0, 1, 2 ...
+          Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted);
+           
+          WidenMap[OldInduction] = ConsecutiveInduction;
           continue;
         }
 
@@ -1387,7 +1405,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
 }
 
 void SingleBlockLoopVectorizer::updateAnalysis() {
-  // The original basic block.
+  // Forget the original basic block.
   SE->forgetLoop(OrigLoop);
 
   // Update the dominator tree information.
@@ -1575,7 +1593,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
     Uniforms.insert(I);
 
     // Insert all operands.
-    for (int i=0, Op = I->getNumOperands(); i < Op; ++i) {
+    for (int i = 0, Op = I->getNumOperands(); i < Op; ++i) {
       Worklist.push_back(I->getOperand(i));
     }
   }
-- 
cgit v1.1


From f735a7f88dcb87cca4622ce45effa8134759b7d8 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Fri, 30 Nov 2012 17:27:53 +0000
Subject: Remove the use of LPPassManager. We can remove LPM because we don't
 need to run any additional loop passes on the new vector loop.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169016 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 35e2d05..d55b7bd 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -114,9 +114,8 @@ public:
   /// Ctor.
   SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li,
                             DominatorTree *Dt, DataLayout *Dl,
-                            LPPassManager *Lpm,
                             unsigned VecWidth):
-  OrigLoop(Orig), SE(Se), LI(Li), DT(Dt), DL(Dl), LPM(Lpm), VF(VecWidth),
+  OrigLoop(Orig), SE(Se), LI(Li), DT(Dt), DL(Dl), VF(VecWidth),
   Builder(Se->getContext()), Induction(0), OldInduction(0) { }
 
   // Perform the actual loop widening (vectorization).
@@ -181,8 +180,6 @@ private:
   DominatorTree *DT;
   // Data Layout.
   DataLayout *DL;
-  // Loop Pass Manager;
-  LPPassManager *LPM;
   // The vectorization factor to use.
   unsigned VF;
 
@@ -491,7 +488,7 @@ struct LoopVectorize : public LoopPass {
           "\n");
 
     // If we decided that it is *legal* to vectorizer the loop then do it.
-    SingleBlockLoopVectorizer LB(L, SE, LI, DT, DL, &LPM, VF);
+    SingleBlockLoopVectorizer LB(L, SE, LI, DT, DL, VF);
     LB.vectorize(&LVL);
 
     DEBUG(verifyFunction(*L->getHeader()->getParent()));
@@ -969,19 +966,22 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   // Get ready to start creating new instructions into the vectorized body.
   Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
 
-  // Register the new loop.
+  // Create and register the new vector loop.
   Loop* Lp = new Loop();
-  LPM->insertLoop(Lp, OrigLoop->getParentLoop());
-
-  Lp->addBasicBlockToLoop(VecBody, LI->getBase());
-
   Loop *ParentLoop = OrigLoop->getParentLoop();
+
+  // Insert the new loop into the loop nest and register the new basic blocks.
   if (ParentLoop) {
+    ParentLoop->addChildLoop(Lp);
     ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
     ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
     ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
+  } else {
+    LI->addTopLevelLoop(Lp);
   }
 
+  Lp->addBasicBlockToLoop(VecBody, LI->getBase());
+
   // Save the state.
   LoopVectorPreHeader = VectorPH;
   LoopScalarPreHeader = ScalarPH;
-- 
cgit v1.1


From d6964741f52d2553517209319fa214afe8b3812b Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Fri, 30 Nov 2012 22:37:11 +0000
Subject: minor cleanups

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169048 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index d55b7bd..8be31dc 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -712,7 +712,7 @@ SingleBlockLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
     const SCEV *Sc = SE->getSCEV(Ptr);
 
     if (SE->isLoopInvariant(Sc, OrigLoop)) {
-      DEBUG(dbgs() << "LV1: Adding RT check for a loop invariant ptr:" <<
+      DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" <<
             *Ptr <<"\n");
       Starts.push_back(Ptr);
       Ends.push_back(Ptr);
@@ -1423,11 +1423,7 @@ void SingleBlockLoopVectorizer::updateAnalysis() {
 }
 
 bool LoopVectorizationLegality::canVectorize() {
-  if (!TheLoop->getLoopPreheader()) {
-    assert(false && "No preheader!!");
-    DEBUG(dbgs() << "LV: Loop not normalized." << "\n");
-    return false;
-  }
+  assert(TheLoop->getLoopPreheader() && "No preheader!!");
 
   // We can only vectorize single basic block loops.
   unsigned NumBlocks = TheLoop->getNumBlocks();
@@ -2008,9 +2004,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     case Instruction::AShr:
     case Instruction::And:
     case Instruction::Or:
-    case Instruction::Xor: {
+    case Instruction::Xor:
       return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy);
-    }
     case Instruction::Select: {
       SelectInst *SI = cast<SelectInst>(I);
       const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
-- 
cgit v1.1


From d04a8d4b33ff316ca4cf961e06c9e312eff8e64f Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Mon, 3 Dec 2012 16:50:05 +0000
Subject: Use the new script to sort the includes of every file under lib.

Sooooo many of these had incorrect or strange main module includes.
I have manually inspected all of these, and fixed the main module
include to be the nearest plausible thing I could find. If you own or
care about any of these source files, I encourage you to take some time
and check that these edits were sensible. I can't have broken anything
(I strictly added headers, and reordered them, never removed), but they
may not be the headers you'd really like to identify as containing the
API being implemented.

Many forward declarations and missing includes were added to header
files to allow them to parse cleanly when included first. The main
module rule does in fact have its merits. =]

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169131 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 37 +++++++++++++++---------------
 1 file changed, 19 insertions(+), 18 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8be31dc..33b76ba 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -43,35 +43,36 @@
 //===----------------------------------------------------------------------===//
 #define LV_NAME "loop-vectorize"
 #define DEBUG_TYPE LV_NAME
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Instructions.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Pass.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Value.h"
-#include "llvm/Function.h"
-#include "llvm/Analysis/Verifier.h"
-#include "llvm/Module.h"
-#include "llvm/Type.h"
+#include "llvm/Transforms/Vectorize.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AliasSetTracker.h"
-#include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/Dominators.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/TargetTransformInfo.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Constants.h"
+#include "llvm/DataLayout.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/DataLayout.h"
+#include "llvm/TargetTransformInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Type.h"
+#include "llvm/Value.h"
 #include <algorithm>
 using namespace llvm;
 
-- 
cgit v1.1


From dd8b1015c8e3dd8f7f9d6d5d63804d2e5ab89c20 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Mon, 3 Dec 2012 21:06:35 +0000
Subject: Add initial support for IF-conversion. This patch implements the
 first 1/3, which is the legality of the if-conversion transformation. The
 next step is to implement the cost-model for the if-converted code as well as
 the vectorization itself.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169152 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 387 +++++++++++++++++++----------
 1 file changed, 253 insertions(+), 134 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 33b76ba..8e2538f 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -80,6 +80,10 @@ static cl::opt<unsigned>
 VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
           cl::desc("Set the default vectorization width. Zero is autoselect."));
 
+static cl::opt<bool>
+EnableIfConversion("enable-if-conversion", cl::init(false), cl::Hidden,
+                   cl::desc("Enable if-conversion during vectorization."));
+
 /// We don't vectorize loops with a known constant trip count below this number.
 const unsigned TinyTripCountThreshold = 16;
 
@@ -219,16 +223,17 @@ private:
 /// * Memory checks - The code in canVectorizeMemory checks if vectorization
 ///   will change the order of memory accesses in a way that will change the
 ///   correctness of the program.
-/// * Scalars checks - The code in canVectorizeBlock checks for a number
-///   of different conditions, such as the availability of a single induction
-///   variable, that all types are supported and vectorize-able, etc.
-/// This code reflects the capabilities of SingleBlockLoopVectorizer.
+/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
+/// checks for a number of different conditions, such as the availability of a
+/// single induction variable, that all types are supported and vectorize-able,
+/// etc. This code reflects the capabilities of SingleBlockLoopVectorizer.
 /// This class is also used by SingleBlockLoopVectorizer for identifying
 /// induction variable and the different reduction variables.
 class LoopVectorizationLegality {
 public:
-  LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl):
-  TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { }
+  LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl,
+                              DominatorTree *Dt):
+  TheLoop(Lp), SE(Se), DL(Dl), DT(Dt), Induction(0) { }
 
   /// This represents the kinds of reductions that we support.
   enum ReductionKind {
@@ -277,7 +282,7 @@ public:
       const SCEV *Sc = SE->getSCEV(Ptr);
       const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
       assert(AR && "Invalid addrec expression");
-      const SCEV *Ex = SE->getExitCount(Lp, Lp->getHeader());
+      const SCEV *Ex = SE->getExitCount(Lp, Lp->getLoopLatch());
       const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
       Pointers.push_back(Ptr);
       Starts.push_back(AR->getStart());
@@ -334,13 +339,28 @@ private:
   /// Check if a single basic block loop is vectorizable.
   /// At this point we know that this is a loop with a constant trip count
   /// and we only need to check individual instructions.
-  bool canVectorizeBlock(BasicBlock &BB);
+  bool canVectorizeInstrs(BasicBlock &BB);
 
   /// When we vectorize loops we may change the order in which
   /// we read and write from memory. This method checks if it is
   /// legal to vectorize the code, considering only memory constrains.
   /// Returns true if BB is vectorizable
-  bool canVectorizeMemory(BasicBlock &BB);
+  bool canVectorizeMemory();
+
+  /// Return true if we can vectorize this loop using the IF-conversion
+  /// transformation.
+  bool canVectorizeWithIfConvert();
+
+  /// Collect the variables that need to stay uniform after vectorization.
+  void collectLoopUniforms();
+
+  /// Return true if the block BB needs to be predicated in order for the loop
+  /// to be vectorized.
+  bool blockNeedsPredication(BasicBlock *BB);
+
+  /// return true if all of the instructions in the block can be speculatively
+  /// executed.
+  bool blockCanBePredicated(BasicBlock *BB);
 
   /// Returns True, if 'Phi' is the kind of reduction variable for type
   /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
@@ -359,6 +379,8 @@ private:
   ScalarEvolution *SE;
   /// DataLayout analysis.
   DataLayout *DL;
+  // Dominators.
+  DominatorTree *DT;
 
   //  ---  vectorization state --- //
 
@@ -458,7 +480,7 @@ struct LoopVectorize : public LoopPass {
           L->getHeader()->getParent()->getName() << "\"\n");
 
     // Check if it is legal to vectorize the loop.
-    LoopVectorizationLegality LVL(L, SE, DL);
+    LoopVectorizationLegality LVL(L, SE, DL, DT);
     if (!LVL.canVectorize()) {
       DEBUG(dbgs() << "LV: Not vectorizing.\n");
       return false;
@@ -1423,41 +1445,91 @@ void SingleBlockLoopVectorizer::updateAnalysis() {
   DEBUG(DT->verifyAnalysis());
 }
 
+
+bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
+  if (!EnableIfConversion)
+    return false;
+
+  assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
+  std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector();
+
+  // Check that every block in the loop can be if-converted.
+  for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) {
+    BasicBlock *BB = LoopBlocks[i];
+
+    // We must have at most two predecessors because we need to convert
+    // all PHIs to selects.
+    unsigned Preds = std::distance(pred_begin(BB), pred_end(BB));
+    if (Preds > 2)
+      return false;
+
+    // We must be able to predicate all blocks that needs to be predicated.
+    if (blockNeedsPredication(BB) && !blockCanBePredicated(BB))
+      return false;
+  }
+
+  // We can if-convert this loop.
+  return true;
+}
+
 bool LoopVectorizationLegality::canVectorize() {
   assert(TheLoop->getLoopPreheader() && "No preheader!!");
 
-  // We can only vectorize single basic block loops.
+  // We can only vectorize innermost loops.
+  if (TheLoop->getSubLoopsVector().size())
+    return false;
+
+  // We must have a single backedge.
+  if (TheLoop->getNumBackEdges() != 1)
+    return false;
+
+  // We must have a single exiting block.
+  if (!TheLoop->getExitingBlock())
+    return false;
+
   unsigned NumBlocks = TheLoop->getNumBlocks();
-  if (NumBlocks != 1) {
-    DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n");
+
+  // Check if we can if-convert non single-bb loops.
+  if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
+    DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
     return false;
   }
 
   // We need to have a loop header.
-  BasicBlock *BB = TheLoop->getHeader();
-  DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n");
+  BasicBlock *Header = TheLoop->getHeader();
+  BasicBlock *Latch = TheLoop->getLoopLatch();
+  DEBUG(dbgs() << "LV: Found a loop: " << Header->getName() << "\n");
 
   // ScalarEvolution needs to be able to find the exit count.
-  const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
+  const SCEV *ExitCount = SE->getExitCount(TheLoop, Latch);
   if (ExitCount == SE->getCouldNotCompute()) {
     DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
     return false;
   }
 
   // Do not loop-vectorize loops with a tiny trip count.
-  unsigned TC = SE->getSmallConstantTripCount(TheLoop, BB);
+  unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch);
   if (TC > 0u && TC < TinyTripCountThreshold) {
     DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
           "This loop is not worth vectorizing.\n");
     return false;
   }
 
+  // Check if we can vectorize the instructions and CFG in this loop.
+  if (!canVectorizeInstrs(*Header)) {
+    DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
+    return false;
+  }
+
   // Go over each instruction and look at memory deps.
-  if (!canVectorizeBlock(*BB)) {
-    DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
+  if (!canVectorizeMemory()) {
+    DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
     return false;
   }
 
+  // Collect all of the variables that remain  uniform after vectorization.
+  collectLoopUniforms();
+
   DEBUG(dbgs() << "LV: We can vectorize this loop" <<
         (PtrRtCheck.Need ? " (with a runtime bound check)" : "")
         <<"!\n");
@@ -1468,122 +1540,138 @@ bool LoopVectorizationLegality::canVectorize() {
   return true;
 }
 
-bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
-
+bool LoopVectorizationLegality::canVectorizeInstrs(BasicBlock &BB) {
   BasicBlock *PreHeader = TheLoop->getLoopPreheader();
+  BasicBlock *Header = TheLoop->getHeader();
 
-  // Scan the instructions in the block and look for hazards.
-  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
-    Instruction *I = it;
+  // For each block in the loop
+  for (Loop::block_iterator bb = TheLoop->block_begin(),
+       be = TheLoop->block_end(); bb != be; ++bb) {
 
-    if (PHINode *Phi = dyn_cast<PHINode>(I)) {
-      // This should not happen because the loop should be normalized.
-      if (Phi->getNumIncomingValues() != 2) {
-        DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
-        return false;
-      }
+    // Scan the instructions in the block and look for hazards.
+    for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
+      Instruction *I = it;
 
-      // This is the value coming from the preheader.
-      Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
+      if (PHINode *Phi = dyn_cast<PHINode>(I)) {
+        // This should not happen because the loop should be normalized.
+        if (Phi->getNumIncomingValues() != 2) {
+          DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
+          return false;
+        }
 
-      // We only look at integer and pointer phi nodes.
-      if (Phi->getType()->isPointerTy() && isInductionVariable(Phi)) {
-        DEBUG(dbgs() << "LV: Found a pointer induction variable.\n");
-        Inductions[Phi] = StartValue;
-        continue;
-      } else if (!Phi->getType()->isIntegerTy()) {
-        DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
-        return false;
-      }
+        // If this PHINode is not in the header block, then we know that we
+        // can convert it to select during if-conversion.
+        if (*bb != Header) {
+          continue;
+        }
 
-      // Handle integer PHIs:
-      if (isInductionVariable(Phi)) {
-        if (Induction) {
-          DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
+        // This is the value coming from the preheader.
+        Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
+
+        // We only look at integer and pointer phi nodes.
+        if (Phi->getType()->isPointerTy() && isInductionVariable(Phi)) {
+          DEBUG(dbgs() << "LV: Found a pointer induction variable.\n");
+          Inductions[Phi] = StartValue;
+          continue;
+        } else if (!Phi->getType()->isIntegerTy()) {
+          DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
           return false;
         }
-        DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n");
-        Induction = Phi;
-        Inductions[Phi] = StartValue;
-        continue;
-      }
-      if (AddReductionVar(Phi, IntegerAdd)) {
-        DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
-        continue;
-      }
-      if (AddReductionVar(Phi, IntegerMult)) {
-        DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
-        continue;
-      }
-      if (AddReductionVar(Phi, IntegerOr)) {
-        DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
-        continue;
-      }
-      if (AddReductionVar(Phi, IntegerAnd)) {
-        DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
-        continue;
-      }
-      if (AddReductionVar(Phi, IntegerXor)) {
-        DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
-        continue;
-      }
 
-      DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
-      return false;
-    }// end of PHI handling
+        // Handle integer PHIs:
+        if (isInductionVariable(Phi)) {
+          if (Induction) {
+            DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
+            return false;
+          }
+          DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n");
+          Induction = Phi;
+          Inductions[Phi] = StartValue;
+          continue;
+        }
+        if (AddReductionVar(Phi, IntegerAdd)) {
+          DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
+          continue;
+        }
+        if (AddReductionVar(Phi, IntegerMult)) {
+          DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
+          continue;
+        }
+        if (AddReductionVar(Phi, IntegerOr)) {
+          DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
+          continue;
+        }
+        if (AddReductionVar(Phi, IntegerAnd)) {
+          DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
+          continue;
+        }
+        if (AddReductionVar(Phi, IntegerXor)) {
+          DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
+          continue;
+        }
 
-    // We still don't handle functions.
-    CallInst *CI = dyn_cast<CallInst>(I);
-    if (CI) {
-      DEBUG(dbgs() << "LV: Found a call site.\n");
-      return false;
-    }
+        DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
+        return false;
+      }// end of PHI handling
 
-    // We do not re-vectorize vectors.
-    if (!VectorType::isValidElementType(I->getType()) &&
-        !I->getType()->isVoidTy()) {
-      DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
-      return false;
-    }
+      // We still don't handle functions.
+      CallInst *CI = dyn_cast<CallInst>(I);
+      if (CI) {
+        DEBUG(dbgs() << "LV: Found a call site.\n");
+        return false;
+      }
 
-    // Reduction instructions are allowed to have exit users.
-    // All other instructions must not have external users.
-    if (!AllowedExit.count(I))
-      //Check that all of the users of the loop are inside the BB.
-      for (Value::use_iterator it = I->use_begin(), e = I->use_end();
-           it != e; ++it) {
-        Instruction *U = cast<Instruction>(*it);
-        // This user may be a reduction exit value.
-        BasicBlock *Parent = U->getParent();
-        if (Parent != &BB) {
-          DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
-          return false;
+      // We do not re-vectorize vectors.
+      if (!VectorType::isValidElementType(I->getType()) &&
+          !I->getType()->isVoidTy()) {
+        DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
+        return false;
+      }
+
+      // Reduction instructions are allowed to have exit users.
+      // All other instructions must not have external users.
+      if (!AllowedExit.count(I))
+        // Check that all of the users of this instruction are inside the loop.
+        for (Value::use_iterator it = I->use_begin(), e = I->use_end();
+             it != e; ++it) {
+          Instruction *U = cast<Instruction>(*it);
+          // This user may be a reduction exit value.
+          if (!TheLoop->contains(U)) {
+            DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
+            return false;
+          }
         }
-    }
-  } // next instr.
+    } // next instr.
+
+  }
 
   if (!Induction) {
     DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
     assert(getInductionVars()->size() && "No induction variables");
   }
 
-  // Don't vectorize if the memory dependencies do not allow vectorization.
-  if (!canVectorizeMemory(BB))
-    return false;
+  return true;
+}
 
+void LoopVectorizationLegality::collectLoopUniforms() {
   // We now know that the loop is vectorizable!
   // Collect variables that will remain uniform after vectorization.
   std::vector<Value*> Worklist;
 
+  BasicBlock *Latch = TheLoop->getLoopLatch();
+
   // Start with the conditional branch and walk up the block.
-  Worklist.push_back(BB.getTerminator()->getOperand(0));
+  Worklist.push_back(Latch->getTerminator()->getOperand(0));
 
   while (Worklist.size()) {
     Instruction *I = dyn_cast<Instruction>(Worklist.back());
     Worklist.pop_back();
 
-    // Look at instructions inside this block. Stop when reaching PHI nodes.
-    if (!I || I->getParent() != &BB || isa<PHINode>(I))
+    // Look at instructions inside this loop.
+    // Stop when reaching PHI nodes.
+    // TODO: we need to prevent loops but we do need to follow PHIs inside this
+    // loop.
+    if (!I || !TheLoop->contains(I) || isa<PHINode>(I))
       continue;
 
     // This is a known uniform.
@@ -1594,11 +1682,9 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
       Worklist.push_back(I->getOperand(i));
     }
   }
-
-  return true;
 }
 
-bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
+bool LoopVectorizationLegality::canVectorizeMemory() {
   typedef SmallVector<Value*, 16> ValueVector;
   typedef SmallPtrSet<Value*, 16> ValueSet;
   // Holds the Load and Store *instructions*.
@@ -1607,35 +1693,40 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
   PtrRtCheck.Pointers.clear();
   PtrRtCheck.Need = false;
 
-  // Scan the BB and collect legal loads and stores.
-  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
-    Instruction *I = it;
-
-    // If this is a load, save it. If this instruction can read from memory
-    // but is not a load, then we quit. Notice that we don't handle function
-    // calls that read or write.
-    if (I->mayReadFromMemory()) {
-      LoadInst *Ld = dyn_cast<LoadInst>(I);
-      if (!Ld) return false;
-      if (!Ld->isSimple()) {
-        DEBUG(dbgs() << "LV: Found a non-simple load.\n");
-        return false;
+  // For each block.
+  for (Loop::block_iterator bb = TheLoop->block_begin(),
+       be = TheLoop->block_end(); bb != be; ++bb) {
+
+    // Scan the BB and collect legal loads and stores.
+    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
+         ++it) {
+
+      // If this is a load, save it. If this instruction can read from memory
+      // but is not a load, then we quit. Notice that we don't handle function
+      // calls that read or write.
+      if (it->mayReadFromMemory()) {
+        LoadInst *Ld = dyn_cast<LoadInst>(it);
+        if (!Ld) return false;
+        if (!Ld->isSimple()) {
+          DEBUG(dbgs() << "LV: Found a non-simple load.\n");
+          return false;
+        }
+        Loads.push_back(Ld);
+        continue;
       }
-      Loads.push_back(Ld);
-      continue;
-    }
 
-    // Save store instructions. Abort if other instructions write to memory.
-    if (I->mayWriteToMemory()) {
-      StoreInst *St = dyn_cast<StoreInst>(I);
-      if (!St) return false;
-      if (!St->isSimple()) {
-        DEBUG(dbgs() << "LV: Found a non-simple store.\n");
-        return false;
+      // Save 'store' instructions. Abort if other instructions write to memory.
+      if (it->mayWriteToMemory()) {
+        StoreInst *St = dyn_cast<StoreInst>(it);
+        if (!St) return false;
+        if (!St->isSimple()) {
+          DEBUG(dbgs() << "LV: Found a non-simple store.\n");
+          return false;
+        }
+        Stores.push_back(St);
       }
-      Stores.push_back(St);
-    }
-  } // next instr.
+    } // next instr.
+  } // next block.
 
   // Now we have two lists that hold the loads and the stores.
   // Next, we find the pointers that they use.
@@ -1908,6 +1999,34 @@ bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
   return (C->getValue()->equalsInt(Size));
 }
 
+bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB)  {
+  assert(TheLoop->contains(BB) && "Unknown block used");
+
+  // Blocks that do not dominate the latch may be skipped and need predication.
+  BasicBlock* Latch = TheLoop->getLoopLatch();
+  return !DT->dominates(BB, Latch);
+}
+
+bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) {
+  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+    // We don't predicate memory accesses or throwing instructions yet.
+    if (it->mayReadFromMemory() || it->mayWriteToMemory() || it->mayThrow())
+      return false;
+
+    // The instructions below can trap, so they cannot be speculated.
+    switch (it->getOpcode()) {
+      default: continue;
+      case Instruction::UDiv:
+      case Instruction::SDiv:
+      case Instruction::URem:
+      case Instruction::SRem:
+        return false;
+    }
+  }
+
+  return true;
+}
+
 bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
   const SCEV *PhiScev = SE->getSCEV(Ptr);
   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
-- 
cgit v1.1


From 56f1ce22dc80cde198afb3ed153c109b951217fa Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Mon, 3 Dec 2012 21:33:08 +0000
Subject: Now that we have a basic if-conversion infrastructure we can rename
 the "single basic block loop vectorizer" to "innermost loop vectorizer".

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169158 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 36 +++++++++++++++---------------
 1 file changed, 18 insertions(+), 18 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8e2538f..ecf19b7 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -20,7 +20,7 @@
 // 1. The main loop pass that drives the different parts.
 // 2. LoopVectorizationLegality - A unit that checks for the legality
 //    of the vectorization.
-// 3. SingleBlockLoopVectorizer - A unit that performs the actual
+// 3. InnerLoopVectorizer - A unit that performs the actual
 //    widening of instructions.
 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
 //    of vectorization. It decides on the optimal vector width, which
@@ -100,7 +100,7 @@ namespace {
 class LoopVectorizationLegality;
 class LoopVectorizationCostModel;
 
-/// SingleBlockLoopVectorizer vectorizes loops which contain only one basic
+/// InnerLoopVectorizer vectorizes loops which contain only one basic
 /// block to a specified vectorization factor (VF).
 /// This class performs the widening of scalars into vectors, or multiple
 /// scalars. This class also implements the following features:
@@ -109,15 +109,15 @@ class LoopVectorizationCostModel;
 /// * It handles the code generation for reduction variables.
 /// * Scalarization (implementation using scalars) of un-vectorizable
 ///   instructions.
-/// SingleBlockLoopVectorizer does not perform any vectorization-legality
+/// InnerLoopVectorizer does not perform any vectorization-legality
 /// checks, and relies on the caller to check for the different legality
-/// aspects. The SingleBlockLoopVectorizer relies on the
+/// aspects. The InnerLoopVectorizer relies on the
 /// LoopVectorizationLegality class to provide information about the induction
 /// and reduction variables that were found to a given vectorization factor.
-class SingleBlockLoopVectorizer {
+class InnerLoopVectorizer {
 public:
   /// Ctor.
-  SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li,
+  InnerLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li,
                             DominatorTree *Dt, DataLayout *Dl,
                             unsigned VecWidth):
   OrigLoop(Orig), SE(Se), LI(Li), DT(Dt), DL(Dl), VF(VecWidth),
@@ -226,8 +226,8 @@ private:
 /// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
 /// checks for a number of different conditions, such as the availability of a
 /// single induction variable, that all types are supported and vectorize-able,
-/// etc. This code reflects the capabilities of SingleBlockLoopVectorizer.
-/// This class is also used by SingleBlockLoopVectorizer for identifying
+/// etc. This code reflects the capabilities of InnerLoopVectorizer.
+/// This class is also used by InnerLoopVectorizer for identifying
 /// induction variable and the different reduction variables.
 class LoopVectorizationLegality {
 public:
@@ -511,7 +511,7 @@ struct LoopVectorize : public LoopPass {
           "\n");
 
     // If we decided that it is *legal* to vectorizer the loop then do it.
-    SingleBlockLoopVectorizer LB(L, SE, LI, DT, DL, VF);
+    InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF);
     LB.vectorize(&LVL);
 
     DEBUG(verifyFunction(*L->getHeader()->getParent()));
@@ -531,7 +531,7 @@ struct LoopVectorize : public LoopPass {
 
 };
 
-Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
+Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
   // Create the types.
   LLVMContext &C = V->getContext();
   Type *VTy = VectorType::get(V->getType(), VF);
@@ -563,7 +563,7 @@ Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
   return Shuf;
 }
 
-Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) {
+Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val) {
   assert(Val->getType()->isVectorTy() && "Must be a vector");
   assert(Val->getType()->getScalarType()->isIntegerTy() &&
          "Elem must be an integer");
@@ -622,7 +622,7 @@ bool LoopVectorizationLegality::isUniform(Value *V) {
   return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
 }
 
-Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
+Value *InnerLoopVectorizer::getVectorValue(Value *V) {
   assert(V != Induction && "The new induction variable should not be used.");
   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
   // If we saved a vectorized copy of V, use it.
@@ -637,11 +637,11 @@ Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
 }
 
 Constant*
-SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
+InnerLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
   return ConstantVector::getSplat(VF, ConstantInt::get(ScalarTy, Val, true));
 }
 
-void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
+void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
   // Holds vector parameters or scalars, in case of uniform vals.
   SmallVector<Value*, 8> Params;
@@ -712,7 +712,7 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
 }
 
 Value*
-SingleBlockLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
+InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
                                            Instruction *Loc) {
   LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
     Legal->getRuntimePointerCheck();
@@ -773,7 +773,7 @@ SingleBlockLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
 }
 
 void
-SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
+InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   /*
    In this function we generate a new loop. The new loop will contain
    the vectorized instructions while the old loop will continue to run the
@@ -1037,7 +1037,7 @@ getReductionIdentity(LoopVectorizationLegality::ReductionKind K) {
 }
 
 void
-SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
+InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
   //===------------------------------------------------===//
   //
   // Notice: any optimization or new instruction that go
@@ -1427,7 +1427,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
   }// end of for each redux variable.
 }
 
-void SingleBlockLoopVectorizer::updateAnalysis() {
+void InnerLoopVectorizer::updateAnalysis() {
   // Forget the original basic block.
   SE->forgetLoop(OrigLoop);
 
-- 
cgit v1.1


From fa72ee729a989ff340672034fd77832c1fd76326 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Mon, 3 Dec 2012 22:46:31 +0000
Subject: IF-conversion: teach the cost-model how to grade if-converted loops.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169171 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 41 +++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 15 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index ecf19b7..efbf0fd 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -321,6 +321,10 @@ public:
   /// Returns the induction variables found in the loop.
   InductionList *getInductionVars() { return &Inductions; }
 
+  /// Return true if the block BB needs to be predicated in order for the loop
+  /// to be vectorized.
+  bool blockNeedsPredication(BasicBlock *BB);
+
   /// Check if this  pointer is consecutive when vectorizing. This happens
   /// when the last index of the GEP is the induction variable, or that the
   /// pointer itself is an induction variable.
@@ -354,10 +358,6 @@ private:
   /// Collect the variables that need to stay uniform after vectorization.
   void collectLoopUniforms();
 
-  /// Return true if the block BB needs to be predicated in order for the loop
-  /// to be vectorized.
-  bool blockNeedsPredication(BasicBlock *BB);
-
   /// return true if all of the instructions in the block can be speculatively
   /// executed.
   bool blockCanBePredicated(BasicBlock *BB);
@@ -2064,19 +2064,29 @@ LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) {
 }
 
 unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
-  // We can only estimate the cost of single basic block loops.
-  assert(1 == TheLoop->getNumBlocks() && "Too many blocks in loop");
-
-  BasicBlock *BB = TheLoop->getHeader();
   unsigned Cost = 0;
 
-  // For each instruction in the old loop.
-  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
-    Instruction *Inst = it;
-    unsigned C = getInstructionCost(Inst, VF);
-    Cost += C;
-    DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF "<< VF <<
-          " For instruction: "<< *Inst << "\n");
+  // For each block.
+  for (Loop::block_iterator bb = TheLoop->block_begin(),
+       be = TheLoop->block_end(); bb != be; ++bb) {
+    unsigned BlockCost = 0;
+    BasicBlock *BB = *bb;
+    
+    // For each instruction in the old loop.
+    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+
+      unsigned C = getInstructionCost(it, VF);
+      Cost += C;
+      DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " <<
+            VF << " For instruction: "<< *it << "\n");
+    }
+
+    // TODO: if-converted blocks can have a high-nest level. We need to
+    // calculate the loop nest level and multiply the cost accordingly.
+    if (Legal->blockNeedsPredication(*bb))
+      BlockCost *= 2;
+    
+    Cost += BlockCost;
   }
 
   return Cost;
@@ -2106,6 +2116,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
       return VTTI->getCFInstrCost(I->getOpcode());
     }
     case Instruction::PHI:
+      //TODO: IF-converted IFs become selects.
       return 0;
     case Instruction::Add:
     case Instruction::FAdd:
-- 
cgit v1.1


From b8f842dce47e745b37505f156854755d8ad4c929 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Mon, 3 Dec 2012 22:57:09 +0000
Subject: minor renaming, documentation and cleanups.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169175 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 46 ++++++++++++++----------------
 1 file changed, 21 insertions(+), 25 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index efbf0fd..4dfe906 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -118,8 +118,7 @@ class InnerLoopVectorizer {
 public:
   /// Ctor.
   InnerLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li,
-                            DominatorTree *Dt, DataLayout *Dl,
-                            unsigned VecWidth):
+                      DominatorTree *Dt, DataLayout *Dl, unsigned VecWidth):
   OrigLoop(Orig), SE(Se), LI(Li), DT(Dt), DL(Dl), VF(VecWidth),
   Builder(Se->getContext()), Induction(0), OldInduction(0) { }
 
@@ -343,12 +342,12 @@ private:
   /// Check if a single basic block loop is vectorizable.
   /// At this point we know that this is a loop with a constant trip count
   /// and we only need to check individual instructions.
-  bool canVectorizeInstrs(BasicBlock &BB);
+  bool canVectorizeInstrs();
 
   /// When we vectorize loops we may change the order in which
   /// we read and write from memory. This method checks if it is
   /// legal to vectorize the code, considering only memory constrains.
-  /// Returns true if BB is vectorizable
+  /// Returns true if the loop is vectorizable
   bool canVectorizeMemory();
 
   /// Return true if we can vectorize this loop using the IF-conversion
@@ -358,7 +357,7 @@ private:
   /// Collect the variables that need to stay uniform after vectorization.
   void collectLoopUniforms();
 
-  /// return true if all of the instructions in the block can be speculatively
+  /// Return true if all of the instructions in the block can be speculatively
   /// executed.
   bool blockCanBePredicated(BasicBlock *BB);
 
@@ -1463,7 +1462,7 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
     if (Preds > 2)
       return false;
 
-    // We must be able to predicate all blocks that needs to be predicated.
+    // We must be able to predicate all blocks that need to be predicated.
     if (blockNeedsPredication(BB) && !blockCanBePredicated(BB))
       return false;
   }
@@ -1516,7 +1515,7 @@ bool LoopVectorizationLegality::canVectorize() {
   }
 
   // Check if we can vectorize the instructions and CFG in this loop.
-  if (!canVectorizeInstrs(*Header)) {
+  if (!canVectorizeInstrs()) {
     DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
     return false;
   }
@@ -1527,7 +1526,7 @@ bool LoopVectorizationLegality::canVectorize() {
     return false;
   }
 
-  // Collect all of the variables that remain  uniform after vectorization.
+  // Collect all of the variables that remain uniform after vectorization.
   collectLoopUniforms();
 
   DEBUG(dbgs() << "LV: We can vectorize this loop" <<
@@ -1540,19 +1539,19 @@ bool LoopVectorizationLegality::canVectorize() {
   return true;
 }
 
-bool LoopVectorizationLegality::canVectorizeInstrs(BasicBlock &BB) {
+bool LoopVectorizationLegality::canVectorizeInstrs() {
   BasicBlock *PreHeader = TheLoop->getLoopPreheader();
   BasicBlock *Header = TheLoop->getHeader();
 
-  // For each block in the loop
+  // For each block in the loop.
   for (Loop::block_iterator bb = TheLoop->block_begin(),
        be = TheLoop->block_end(); bb != be; ++bb) {
 
     // Scan the instructions in the block and look for hazards.
-    for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
-      Instruction *I = it;
+    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
+         ++it) {
 
-      if (PHINode *Phi = dyn_cast<PHINode>(I)) {
+      if (PHINode *Phi = dyn_cast<PHINode>(it)) {
         // This should not happen because the loop should be normalized.
         if (Phi->getNumIncomingValues() != 2) {
           DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
@@ -1561,9 +1560,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs(BasicBlock &BB) {
 
         // If this PHINode is not in the header block, then we know that we
         // can convert it to select during if-conversion.
-        if (*bb != Header) {
+        if (*bb != Header)
           continue;
-        }
 
         // This is the value coming from the preheader.
         Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
@@ -1615,26 +1613,26 @@ bool LoopVectorizationLegality::canVectorizeInstrs(BasicBlock &BB) {
       }// end of PHI handling
 
       // We still don't handle functions.
-      CallInst *CI = dyn_cast<CallInst>(I);
+      CallInst *CI = dyn_cast<CallInst>(it);
       if (CI) {
         DEBUG(dbgs() << "LV: Found a call site.\n");
         return false;
       }
 
       // We do not re-vectorize vectors.
-      if (!VectorType::isValidElementType(I->getType()) &&
-          !I->getType()->isVoidTy()) {
+      if (!VectorType::isValidElementType(it->getType()) &&
+          !it->getType()->isVoidTy()) {
         DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
         return false;
       }
 
       // Reduction instructions are allowed to have exit users.
       // All other instructions must not have external users.
-      if (!AllowedExit.count(I))
+      if (!AllowedExit.count(it))
         //Check that all of the users of the loop are inside the BB.
-        for (Value::use_iterator it = I->use_begin(), e = I->use_end();
-             it != e; ++it) {
-          Instruction *U = cast<Instruction>(*it);
+        for (Value::use_iterator I = it->use_begin(), E = it->use_end();
+             I != E; ++I) {
+          Instruction *U = cast<Instruction>(*I);
           // This user may be a reduction exit value.
           if (!TheLoop->contains(U)) {
             DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
@@ -1657,7 +1655,6 @@ void LoopVectorizationLegality::collectLoopUniforms() {
   // We now know that the loop is vectorizable!
   // Collect variables that will remain uniform after vectorization.
   std::vector<Value*> Worklist;
-
   BasicBlock *Latch = TheLoop->getLoopLatch();
 
   // Start with the conditional branch and walk up the block.
@@ -1669,8 +1666,7 @@ void LoopVectorizationLegality::collectLoopUniforms() {
 
     // Look at instructions inside this loop.
     // Stop when reaching PHI nodes.
-    // TODO: we need to prevent loops but we do need to follow PHIs inside this
-    // loop.
+    // TODO: we need to follow values all over the loop, not only in this block.
     if (!I || !TheLoop->contains(I) || isa<PHINode>(I))
       continue;
 
-- 
cgit v1.1


From 36422d887672dc9a21d9add99c675bdc9e4807dd Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi <geek4civic@gmail.com>
Date: Tue, 4 Dec 2012 00:49:28 +0000
Subject: Fix whitespace.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169194 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4dfe906..b720c97 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1091,7 +1091,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
           // After broadcasting the induction variable we need to make the
           // vector consecutive by adding 0, 1, 2 ...
           Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted);
-           
+
           WidenMap[OldInduction] = ConsecutiveInduction;
           continue;
         }
@@ -2067,7 +2067,7 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
        be = TheLoop->block_end(); bb != be; ++bb) {
     unsigned BlockCost = 0;
     BasicBlock *BB = *bb;
-    
+
     // For each instruction in the old loop.
     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
 
@@ -2081,7 +2081,7 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
     // calculate the loop nest level and multiply the cost accordingly.
     if (Legal->blockNeedsPredication(*bb))
       BlockCost *= 2;
-    
+
     Cost += BlockCost;
   }
 
@@ -2265,4 +2265,3 @@ namespace llvm {
     return new LoopVectorize();
   }
 }
-
-- 
cgit v1.1


From 915584ed136c29b8fd9304268967f3f359befbb0 Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi <geek4civic@gmail.com>
Date: Tue, 4 Dec 2012 00:49:34 +0000
Subject: LoopVectorize.cpp: Suppress a warning. [-Wunused-variable]

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169195 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index b720c97..0e33228 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1495,9 +1495,9 @@ bool LoopVectorizationLegality::canVectorize() {
   }
 
   // We need to have a loop header.
-  BasicBlock *Header = TheLoop->getHeader();
   BasicBlock *Latch = TheLoop->getLoopLatch();
-  DEBUG(dbgs() << "LV: Found a loop: " << Header->getName() << "\n");
+  DEBUG(dbgs() << "LV: Found a loop: " <<
+        TheLoop->getHeader()->getName() << "\n");
 
   // ScalarEvolution needs to be able to find the exit count.
   const SCEV *ExitCount = SE->getExitCount(TheLoop, Latch);
-- 
cgit v1.1


From 319d594e22c3db55114f233ca398f9760e3f6ed9 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Tue, 4 Dec 2012 06:15:11 +0000
Subject: Add the last part that is needed for vectorization of if-converted
 code. Added the code that actually performs the if-conversion during
 vectorization.

We can now vectorize this code:

for (int i=0; i<n; ++i) {
  unsigned k = 0;

  if (a[i] > b[i])   <------ IF inside the loop.
    k = k * 5 + 3;

  a[i] = k;          <---- K is a phi node that becomes vector-select.
}



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169217 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 413 ++++++++++++++++++-----------
 1 file changed, 251 insertions(+), 162 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0e33228..f538e08 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -50,6 +50,7 @@
 #include "llvm/Analysis/AliasSetTracker.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
@@ -134,6 +135,9 @@ public:
  }
 
 private:
+  /// A small list of PHINodes.
+  typedef SmallVector<PHINode*, 4> PhiVector;
+
   /// Add code that checks at runtime if the accessed arrays overlap.
   /// Returns the comperator value or NULL if no check is needed.
   Value *addRuntimeCheck(LoopVectorizationLegality *Legal,
@@ -142,6 +146,19 @@ private:
   void createEmptyLoop(LoopVectorizationLegality *Legal);
   /// Copy and widen the instructions from the old loop.
   void vectorizeLoop(LoopVectorizationLegality *Legal);
+
+  /// A helper function that computes the predicate of the block BB, assuming
+  /// that the header block of the loop is set to True. It returns the *entry*
+  /// mask for the block BB.
+  Value *createBlockInMask(BasicBlock *BB);
+  /// A helper function that computes the predicate of the edge between SRC
+  /// and DST.
+  Value *createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
+
+  /// A helper function to vectorize a single BB within the innermost loop.
+  void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB,
+                            PhiVector *PV);
+
   /// Insert the new loop to the loop hierarchy and pass manager
   /// and update the analysis passes.
   void updateAnalysis();
@@ -816,7 +833,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
     DL->getIntPtrType(SE->getContext());
 
   // Find the loop boundaries.
-  const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
+  const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getLoopLatch());
   assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
 
   // Get the total trip count from the count by adding 1.
@@ -838,7 +855,6 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
     OldInduction->getIncomingValueForBlock(BypassBlock):
     ConstantInt::get(IdxTy, 0);
 
-  assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop");
   assert(BypassBlock && "Invalid loop structure");
 
   // Generate the code that checks in runtime if arrays overlap.
@@ -1044,7 +1060,6 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
   // the cost-model.
   //
   //===------------------------------------------------===//
-  typedef SmallVector<PHINode*, 4> PhiVector;
   BasicBlock &BB = *OrigLoop->getHeader();
   Constant *Zero = ConstantInt::get(
     IntegerType::getInt32Ty(BB.getContext()), 0);
@@ -1059,24 +1074,220 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
   // construct the PHI.
   PhiVector RdxPHIsToFix;
 
-  // For each instruction in the old loop.
-  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
-    Instruction *Inst = it;
+  // Scan the loop in a topological order to ensure that defs are vectorized
+  // before users.
+  LoopBlocksDFS DFS(OrigLoop);
+  DFS.perform(LI);
+
+  // Vectorize all of the blocks in the original loop.
+  for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
+       be = DFS.endRPO(); bb != be; ++bb)
+    vectorizeBlockInLoop(Legal, *bb, &RdxPHIsToFix);
+
+  // At this point every instruction in the original loop is widened to
+  // a vector form. We are almost done. Now, we need to fix the PHI nodes
+  // that we vectorized. The PHI nodes are currently empty because we did
+  // not want to introduce cycles. Notice that the remaining PHI nodes
+  // that we need to fix are reduction variables.
+
+  // Create the 'reduced' values for each of the reduction vars.
+  // The reduced values are the vector values that we scalarize and combine
+  // after the loop is finished.
+  for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
+       it != e; ++it) {
+    PHINode *RdxPhi = *it;
+    PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
+    assert(RdxPhi && "Unable to recover vectorized PHI");
+
+    // Find the reduction variable descriptor.
+    assert(Legal->getReductionVars()->count(RdxPhi) &&
+           "Unable to find the reduction variable");
+    LoopVectorizationLegality::ReductionDescriptor RdxDesc =
+      (*Legal->getReductionVars())[RdxPhi];
+
+    // We need to generate a reduction vector from the incoming scalar.
+    // To do so, we need to generate the 'identity' vector and override
+    // one of the elements with the incoming scalar reduction. We need
+    // to do it in the vector-loop preheader.
+    Builder.SetInsertPoint(LoopBypassBlock->getTerminator());
+
+    // This is the vector-clone of the value that leaves the loop.
+    Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
+    Type *VecTy = VectorExit->getType();
+
+    // Find the reduction identity variable. Zero for addition, or, xor,
+    // one for multiplication, -1 for And.
+    Constant *Identity = getUniformVector(getReductionIdentity(RdxDesc.Kind),
+                                          VecTy->getScalarType());
+
+    // This vector is the Identity vector where the first element is the
+    // incoming scalar reduction.
+    Value *VectorStart = Builder.CreateInsertElement(Identity,
+                                                    RdxDesc.StartValue, Zero);
+
+    // Fix the vector-loop phi.
+    // We created the induction variable so we know that the
+    // preheader is the first entry.
+    BasicBlock *VecPreheader = Induction->getIncomingBlock(0);
+
+    // Reductions do not have to start at zero. They can start with
+    // any loop invariant values.
+    VecRdxPhi->addIncoming(VectorStart, VecPreheader);
+    unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
+    Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx));
+    VecRdxPhi->addIncoming(Val, LoopVectorBody);
+
+    // Before each round, move the insertion point right between
+    // the PHIs and the values we are going to write.
+    // This allows us to write both PHINodes and the extractelement
+    // instructions.
+    Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
+
+    // This PHINode contains the vectorized reduction variable, or
+    // the initial value vector, if we bypass the vector loop.
+    PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
+    NewPhi->addIncoming(VectorStart, LoopBypassBlock);
+    NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody);
+
+    // Extract the first scalar.
+    Value *Scalar0 =
+      Builder.CreateExtractElement(NewPhi, Builder.getInt32(0));
+    // Extract and reduce the remaining vector elements.
+    for (unsigned i=1; i < VF; ++i) {
+      Value *Scalar1 =
+        Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
+      switch (RdxDesc.Kind) {
+        case LoopVectorizationLegality::IntegerAdd:
+          Scalar0 = Builder.CreateAdd(Scalar0, Scalar1);
+          break;
+        case LoopVectorizationLegality::IntegerMult:
+          Scalar0 = Builder.CreateMul(Scalar0, Scalar1);
+          break;
+        case LoopVectorizationLegality::IntegerOr:
+          Scalar0 = Builder.CreateOr(Scalar0, Scalar1);
+          break;
+        case LoopVectorizationLegality::IntegerAnd:
+          Scalar0 = Builder.CreateAnd(Scalar0, Scalar1);
+          break;
+        case LoopVectorizationLegality::IntegerXor:
+          Scalar0 = Builder.CreateXor(Scalar0, Scalar1);
+          break;
+        default:
+          llvm_unreachable("Unknown reduction operation");
+      }
+    }
+
+    // Now, we need to fix the users of the reduction variable
+    // inside and outside of the scalar remainder loop.
+    // We know that the loop is in LCSSA form. We need to update the
+    // PHI nodes in the exit blocks.
+    for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
+         LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
+      PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
+      if (!LCSSAPhi) continue;
+
+      // All PHINodes need to have a single entry edge, or two if
+      // we already fixed them.
+      assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
+
+      // We found our reduction value exit-PHI. Update it with the
+      // incoming bypass edge.
+      if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
+        // Add an edge coming from the bypass.
+        LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock);
+        break;
+      }
+    }// end of the LCSSA phi scan.
+
+    // Fix the scalar loop reduction variable with the incoming reduction sum
+    // from the vector body and from the backedge value.
+    int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
+    int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block.
+    (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
+    (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
+  }// end of for each redux variable.
+}
+
+Value *InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
+  assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&
+         "Invalid edge");
+
+  Value *SrcMask = createBlockInMask(Src);
+
+  // The terminator has to be a branch inst!
+  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
+  assert(BI && "Unexpected terminator found");
+
+  Value *EdgeMask = SrcMask;
+  if (BI->isConditional()) {
+    EdgeMask = getVectorValue(BI->getCondition());
+    if (BI->getSuccessor(0) != Dst)
+      EdgeMask = Builder.CreateNot(EdgeMask);
+  }
+
+  return Builder.CreateAnd(EdgeMask, SrcMask);
+}
 
-    switch (Inst->getOpcode()) {
+Value *InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
+  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
+
+  // Loop incoming mask is all-one.
+  if (OrigLoop->getHeader() == BB)
+    return getVectorValue(
+      ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1));
+
+  // This is the block mask. We OR all incoming edge masks, starting from zero.
+  Value *BlockMask = getVectorValue(
+    ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0));
+
+  // For each pred:
+  for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it)
+    BlockMask = Builder.CreateOr(BlockMask, createEdgeMask(*it, BB));
+
+  return BlockMask;
+}
+
+void
+InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
+                                            BasicBlock *BB, PhiVector *PV) {
+  Constant *Zero =
+  ConstantInt::get(IntegerType::getInt32Ty(BB->getContext()), 0);
+
+  // For each instruction in the old loop.
+  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+    switch (it->getOpcode()) {
       case Instruction::Br:
         // Nothing to do for PHIs and BR, since we already took care of the
         // loop control flow instructions.
         continue;
       case Instruction::PHI:{
-        PHINode* P = cast<PHINode>(Inst);
+        PHINode* P = cast<PHINode>(it);
         // Handle reduction variables:
         if (Legal->getReductionVars()->count(P)) {
           // This is phase one of vectorizing PHIs.
-          Type *VecTy = VectorType::get(Inst->getType(), VF);
-          WidenMap[Inst] = PHINode::Create(VecTy, 2, "vec.phi",
-                                  LoopVectorBody->getFirstInsertionPt());
-          RdxPHIsToFix.push_back(P);
+          Type *VecTy = VectorType::get(it->getType(), VF);
+          WidenMap[it] =
+          PHINode::Create(VecTy, 2, "vec.phi",
+                          LoopVectorBody->getFirstInsertionPt());
+          PV->push_back(P);
+          continue;
+        }
+
+        // Check for PHI nodes that are lowered to vector selects.
+        if (P->getParent() != OrigLoop->getHeader()) {
+          // We know that all PHIs in non header blocks are converted into
+          // selects, so we don't have to worry about the insertion order and we
+          // can just use the builder.
+
+          // At this point we generate the predication tree. There may be
+          // duplications since this is a simple recursive scan, but future
+          // optimizations will clean it up.
+          Value *Cond = createBlockInMask(P->getIncomingBlock(0));
+          WidenMap[P] =
+            Builder.CreateSelect(Cond,
+                                 getVectorValue(P->getIncomingValue(0)),
+                                 getVectorValue(P->getIncomingValue(1)),
+                                 "predphi");
           continue;
         }
 
@@ -1099,8 +1310,8 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         // Handle pointer inductions.
         assert(P->getType()->isPointerTy() && "Unexpected type.");
         Value *StartIdx = OldInduction ?
-          Legal->getInductionVars()->lookup(OldInduction) :
-          ConstantInt::get(Induction->getType(), 0);
+        Legal->getInductionVars()->lookup(OldInduction) :
+        ConstantInt::get(Induction->getType(), 0);
 
         // This is the pointer value coming into the loop.
         Value *StartPtr = Legal->getInductionVars()->lookup(P);
@@ -1121,7 +1332,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
                                                "insert.gep");
         }
 
-        WidenMap[Inst] = VecVal;
+        WidenMap[it] = VecVal;
         continue;
       }
       case Instruction::Add:
@@ -1143,13 +1354,13 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
       case Instruction::Or:
       case Instruction::Xor: {
         // Just widen binops.
-        BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
-        Value *A = getVectorValue(Inst->getOperand(0));
-        Value *B = getVectorValue(Inst->getOperand(1));
+        BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it);
+        Value *A = getVectorValue(it->getOperand(0));
+        Value *B = getVectorValue(it->getOperand(1));
 
         // Use this vector value for all users of the original instruction.
         Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
-        WidenMap[Inst] = V;
+        WidenMap[it] = V;
 
         // Update the NSW, NUW and Exact flags.
         BinaryOperator *VecOp = cast<BinaryOperator>(V);
@@ -1165,7 +1376,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         // Widen selects.
         // If the selector is loop invariant we can create a select
         // instruction with a scalar condition. Otherwise, use vector-select.
-        Value *Cond = Inst->getOperand(0);
+        Value *Cond = it->getOperand(0);
         bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop);
 
         // The condition can be loop invariant  but still defined inside the
@@ -1176,29 +1387,29 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         if (InvariantCond)
           Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0));
 
-        Value *Op0 = getVectorValue(Inst->getOperand(1));
-        Value *Op1 = getVectorValue(Inst->getOperand(2));
-        WidenMap[Inst] = Builder.CreateSelect(Cond, Op0, Op1);
+        Value *Op0 = getVectorValue(it->getOperand(1));
+        Value *Op1 = getVectorValue(it->getOperand(2));
+        WidenMap[it] = Builder.CreateSelect(Cond, Op0, Op1);
         break;
       }
 
       case Instruction::ICmp:
       case Instruction::FCmp: {
         // Widen compares. Generate vector compares.
-        bool FCmp = (Inst->getOpcode() == Instruction::FCmp);
-        CmpInst *Cmp = dyn_cast<CmpInst>(Inst);
-        Value *A = getVectorValue(Inst->getOperand(0));
-        Value *B = getVectorValue(Inst->getOperand(1));
+        bool FCmp = (it->getOpcode() == Instruction::FCmp);
+        CmpInst *Cmp = dyn_cast<CmpInst>(it);
+        Value *A = getVectorValue(it->getOperand(0));
+        Value *B = getVectorValue(it->getOperand(1));
         if (FCmp)
-          WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
+          WidenMap[it] = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
         else
-          WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B);
+          WidenMap[it] = Builder.CreateICmp(Cmp->getPredicate(), A, B);
         break;
       }
 
       case Instruction::Store: {
         // Attempt to issue a wide store.
-        StoreInst *SI = dyn_cast<StoreInst>(Inst);
+        StoreInst *SI = dyn_cast<StoreInst>(it);
         Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF);
         Value *Ptr = SI->getPointerOperand();
         unsigned Alignment = SI->getAlignment();
@@ -1210,7 +1421,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
 
         // This store does not use GEPs.
         if (!Legal->isConsecutivePtr(Ptr)) {
-          scalarizeInstruction(Inst);
+          scalarizeInstruction(it);
           break;
         }
 
@@ -1237,7 +1448,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
       }
       case Instruction::Load: {
         // Attempt to issue a wide load.
-        LoadInst *LI = dyn_cast<LoadInst>(Inst);
+        LoadInst *LI = dyn_cast<LoadInst>(it);
         Type *RetTy = VectorType::get(LI->getType(), VF);
         Value *Ptr = LI->getPointerOperand();
         unsigned Alignment = LI->getAlignment();
@@ -1247,7 +1458,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         // scalarize the load.
         bool Con = Legal->isConsecutivePtr(Ptr);
         if (Legal->isUniform(Ptr) || !Con) {
-          scalarizeInstruction(Inst);
+          scalarizeInstruction(it);
           break;
         }
 
@@ -1272,7 +1483,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         LI = Builder.CreateLoad(Ptr);
         LI->setAlignment(Alignment);
         // Use this vector value for all users of the load.
-        WidenMap[Inst] = LI;
+        WidenMap[it] = LI;
         break;
       }
       case Instruction::ZExt:
@@ -1288,144 +1499,22 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
       case Instruction::FPTrunc:
       case Instruction::BitCast: {
         /// Vectorize bitcasts.
-        CastInst *CI = dyn_cast<CastInst>(Inst);
-        Value *A = getVectorValue(Inst->getOperand(0));
+        CastInst *CI = dyn_cast<CastInst>(it);
+        Value *A = getVectorValue(it->getOperand(0));
         Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
-        WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy);
+        WidenMap[it] = Builder.CreateCast(CI->getOpcode(), A, DestTy);
         break;
       }
-
+        
       default:
         /// All other instructions are unsupported. Scalarize them.
-        scalarizeInstruction(Inst);
+        scalarizeInstruction(it);
         break;
     }// end of switch.
   }// end of for_each instr.
-
-  // At this point every instruction in the original loop is widended to
-  // a vector form. We are almost done. Now, we need to fix the PHI nodes
-  // that we vectorized. The PHI nodes are currently empty because we did
-  // not want to introduce cycles. Notice that the remaining PHI nodes
-  // that we need to fix are reduction variables.
-
-  // Create the 'reduced' values for each of the induction vars.
-  // The reduced values are the vector values that we scalarize and combine
-  // after the loop is finished.
-  for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
-       it != e; ++it) {
-    PHINode *RdxPhi = *it;
-    PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
-    assert(RdxPhi && "Unable to recover vectorized PHI");
-
-    // Find the reduction variable descriptor.
-    assert(Legal->getReductionVars()->count(RdxPhi) &&
-           "Unable to find the reduction variable");
-    LoopVectorizationLegality::ReductionDescriptor RdxDesc =
-      (*Legal->getReductionVars())[RdxPhi];
-
-    // We need to generate a reduction vector from the incoming scalar.
-    // To do so, we need to generate the 'identity' vector and overide
-    // one of the elements with the incoming scalar reduction. We need
-    // to do it in the vector-loop preheader.
-    Builder.SetInsertPoint(LoopBypassBlock->getTerminator());
-
-    // This is the vector-clone of the value that leaves the loop.
-    Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
-    Type *VecTy = VectorExit->getType();
-
-    // Find the reduction identity variable. Zero for addition, or, xor,
-    // one for multiplication, -1 for And.
-    Constant *Identity = getUniformVector(getReductionIdentity(RdxDesc.Kind),
-                                          VecTy->getScalarType());
-
-    // This vector is the Identity vector where the first element is the
-    // incoming scalar reduction.
-    Value *VectorStart = Builder.CreateInsertElement(Identity,
-                                                    RdxDesc.StartValue, Zero);
-
-    // Fix the vector-loop phi.
-    // We created the induction variable so we know that the
-    // preheader is the first entry.
-    BasicBlock *VecPreheader = Induction->getIncomingBlock(0);
-
-    // Reductions do not have to start at zero. They can start with
-    // any loop invariant values.
-    VecRdxPhi->addIncoming(VectorStart, VecPreheader);
-    unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
-    Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx));
-    VecRdxPhi->addIncoming(Val, LoopVectorBody);
-
-    // Before each round, move the insertion point right between
-    // the PHIs and the values we are going to write.
-    // This allows us to write both PHINodes and the extractelement
-    // instructions.
-    Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
-
-    // This PHINode contains the vectorized reduction variable, or
-    // the initial value vector, if we bypass the vector loop.
-    PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
-    NewPhi->addIncoming(VectorStart, LoopBypassBlock);
-    NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody);
-
-    // Extract the first scalar.
-    Value *Scalar0 =
-      Builder.CreateExtractElement(NewPhi, Builder.getInt32(0));
-    // Extract and reduce the remaining vector elements.
-    for (unsigned i=1; i < VF; ++i) {
-      Value *Scalar1 =
-        Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
-      switch (RdxDesc.Kind) {
-        case LoopVectorizationLegality::IntegerAdd:
-          Scalar0 = Builder.CreateAdd(Scalar0, Scalar1);
-          break;
-        case LoopVectorizationLegality::IntegerMult:
-          Scalar0 = Builder.CreateMul(Scalar0, Scalar1);
-          break;
-        case LoopVectorizationLegality::IntegerOr:
-          Scalar0 = Builder.CreateOr(Scalar0, Scalar1);
-          break;
-        case LoopVectorizationLegality::IntegerAnd:
-          Scalar0 = Builder.CreateAnd(Scalar0, Scalar1);
-          break;
-        case LoopVectorizationLegality::IntegerXor:
-          Scalar0 = Builder.CreateXor(Scalar0, Scalar1);
-          break;
-        default:
-          llvm_unreachable("Unknown reduction operation");
-      }
-    }
-
-    // Now, we need to fix the users of the reduction variable
-    // inside and outside of the scalar remainder loop.
-    // We know that the loop is in LCSSA form. We need to update the
-    // PHI nodes in the exit blocks.
-    for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
-         LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
-      PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
-      if (!LCSSAPhi) continue;
-
-      // All PHINodes need to have a single entry edge, or two if
-      // we already fixed them.
-      assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
-
-      // We found our reduction value exit-PHI. Update it with the
-      // incoming bypass edge.
-      if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
-        // Add an edge coming from the bypass.
-        LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock);
-        break;
-      }
-    }// end of the LCSSA phi scan.
-
-    // Fix the scalar loop reduction variable with the incoming reduction sum
-    // from the vector body and from the backedge value.
-    int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
-    int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block.
-    (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
-    (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
-  }// end of for each redux variable.
 }
 
+
 void InnerLoopVectorizer::updateAnalysis() {
   // Forget the original basic block.
   SE->forgetLoop(OrigLoop);
-- 
cgit v1.1


From e6f2df961065e3b723ef0fc78441f07c5577acd2 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Tue, 4 Dec 2012 07:11:52 +0000
Subject: Give scalar if-converted blocks half the score because they are not
 always executed due to control flow (CF).

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169223 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index f538e08..1de5b30 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2159,17 +2159,17 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
 
     // For each instruction in the old loop.
     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
-
       unsigned C = getInstructionCost(it, VF);
       Cost += C;
       DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " <<
             VF << " For instruction: "<< *it << "\n");
     }
 
-    // TODO: if-converted blocks can have a high-nest level. We need to
-    // calculate the loop nest level and multiply the cost accordingly.
-    if (Legal->blockNeedsPredication(*bb))
-      BlockCost *= 2;
+    // We assume that if-converted blocks have a 50% chance of being executed.
+    // When the code is scalar then some of the blocks are avoided due to CF.
+    // When the code is vectorized we execute all code paths.
+    if (Legal->blockNeedsPredication(*bb) && VF == 1)
+      BlockCost /= 2;
 
     Cost += BlockCost;
   }
-- 
cgit v1.1


From f6088d126e9110180f7db231bb58da9230a09537 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Tue, 4 Dec 2012 18:17:33 +0000
Subject: Add support for reduction variables when IF-conversion is enabled.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169288 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 33 +++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 10 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1de5b30..3502e9e 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1133,8 +1133,8 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     // Reductions do not have to start at zero. They can start with
     // any loop invariant values.
     VecRdxPhi->addIncoming(VectorStart, VecPreheader);
-    unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
-    Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx));
+    Value *Val =
+    getVectorValue(RdxPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
     VecRdxPhi->addIncoming(Val, LoopVectorBody);
 
     // Before each round, move the insertion point right between
@@ -1201,8 +1201,11 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
 
     // Fix the scalar loop reduction variable with the incoming reduction sum
     // from the vector body and from the backedge value.
-    int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
-    int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block.
+    int IncomingEdgeBlockIdx =
+    (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch());
+    assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
+    // Pick the other block.
+    int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
     (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
     (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
   }// end of for each redux variable.
@@ -1961,11 +1964,13 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
   if (Phi->getNumIncomingValues() != 2)
     return false;
 
-  // Find the possible incoming reduction variable.
-  BasicBlock *BB = Phi->getParent();
-  int SelfEdgeIdx = Phi->getBasicBlockIndex(BB);
-  int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry.
-  Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx);
+  // Reduction variables are only found in the loop header block.
+  if (Phi->getParent() != TheLoop->getHeader())
+    return false;
+
+  // Obtain the reduction start value from the value that comes from the loop
+  // preheader.
+  Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader());
 
   // ExitInstruction is the single value which is used outside the loop.
   // We only allow for a single reduction value to be used outside the loop.
@@ -2003,9 +2008,17 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
         FoundStartPHI = true;
         continue;
       }
+
+      // We allow in-loop PHINodes which are not the original reduction PHI
+      // node. If this PHI is the only user of Iter (happens in IF w/ no ELSE
+      // structure) then don't skip this PHI.
+      if (isa<PHINode>(U) && U->getParent() != TheLoop->getHeader() &&
+          TheLoop->contains(U->getParent()) && Iter->getNumUses() > 1)
+        continue;
+
       // Check if we found the exit user.
       BasicBlock *Parent = U->getParent();
-      if (Parent != BB) {
+      if (!TheLoop->contains(Parent)) {
         // We must have a single exit instruction.
         if (ExitInstruction != 0)
           return false;
-- 
cgit v1.1


From e570dee4b03cca54bbf27a7f7a3299c5cdc3d087 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Tue, 4 Dec 2012 22:40:22 +0000
Subject: Fix a bug in vectorization of if-converted reduction variables. If
 the reduction variable is not used outside the loop then we ran into an
 endless loop. This change checks if we found the original PHI.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169324 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 34 ++++++++++++++++++------------
 1 file changed, 20 insertions(+), 14 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3502e9e..ac62b11 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1985,20 +1985,20 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
   // Also, we can't have multiple block-local users.
   Instruction *Iter = Phi;
   while (true) {
+    // If the instruction has no users then this is a broken
+    // chain and can't be a reduction variable.
+    if (Iter->use_empty())
+      return false;
+
     // Any reduction instr must be of one of the allowed kinds.
     if (!isReductionInstr(Iter, Kind))
       return false;
 
-    // Did we found a user inside this block ?
+    // Did we find a user inside this block ?
     bool FoundInBlockUser = false;
     // Did we reach the initial PHI node ?
     bool FoundStartPHI = false;
 
-    // If the instruction has no users then this is a broken
-    // chain and can't be a reduction variable.
-    if (Iter->use_empty())
-      return false;
-
     // For each of the *users* of iter.
     for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
          it != e; ++it) {
@@ -2009,21 +2009,22 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
         continue;
       }
 
-      // We allow in-loop PHINodes which are not the original reduction PHI
-      // node. If this PHI is the only user of Iter (happens in IF w/ no ELSE
-      // structure) then don't skip this PHI.
-      if (isa<PHINode>(U) && U->getParent() != TheLoop->getHeader() &&
-          TheLoop->contains(U->getParent()) && Iter->getNumUses() > 1)
-        continue;
-
       // Check if we found the exit user.
       BasicBlock *Parent = U->getParent();
       if (!TheLoop->contains(Parent)) {
-        // We must have a single exit instruction.
+        // Exit if you find multiple outside users.
         if (ExitInstruction != 0)
           return false;
         ExitInstruction = Iter;
       }
+
+      // We allow in-loop PHINodes which are not the original reduction PHI
+      // node. If this PHI is the only user of Iter (happens in IF w/ no ELSE
+      // structure) then don't skip this PHI.
+      if (isa<PHINode>(U) && U->getParent() != TheLoop->getHeader() &&
+          TheLoop->contains(U) && Iter->getNumUses() > 1)
+        continue;
+
       // We can't have multiple inside users.
       if (FoundInBlockUser)
         return false;
@@ -2043,6 +2044,11 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
      Reductions[Phi] = RD;
      return true;
    }
+
+    // If we've reached the start PHI but did not find an outside user then
+    // this is dead code. Abort.
+    if (FoundStartPHI)
+      return false;
   }
 }
 
-- 
cgit v1.1


From dc5eff5b98aba0fe0e8104dfa7429c9a759684c2 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Tue, 4 Dec 2012 22:59:52 +0000
Subject: Enable if-conversion during vectorization.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169331 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index ac62b11..20e073b 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -82,7 +82,7 @@ VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
           cl::desc("Set the default vectorization width. Zero is autoselect."));
 
 static cl::opt<bool>
-EnableIfConversion("enable-if-conversion", cl::init(false), cl::Hidden,
+EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                    cl::desc("Enable if-conversion during vectorization."));
 
 /// We don't vectorize loops with a known constant trip count below this number.
-- 
cgit v1.1


From 46c5f79789947b75ff3b0cc107f133fb0c5ffb5e Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Tue, 4 Dec 2012 23:25:24 +0000
Subject: LoopVectorizer: Increase the number of pointers that can be tested at
 runtime. If we can't prove statically that the pointers are disjoint then we
 add the runtime check.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169334 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 20e073b..166ad33 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -90,7 +90,7 @@ const unsigned TinyTripCountThreshold = 16;
 
 /// When performing a runtime memory check, do not check more than this
 /// number of pointers. Notice that the check is quadratic!
-const unsigned RuntimeMemoryCheckThreshold = 2;
+const unsigned RuntimeMemoryCheckThreshold = 4;
 
 /// This is the highest vector width that we try to generate.
 const unsigned MaxVectorSize = 8;
-- 
cgit v1.1


From b48fdbc811b0bf7dec0780b4d03169d8ee835d03 Mon Sep 17 00:00:00 2001
From: Paul Redmond <paul.redmond@intel.com>
Date: Sun, 9 Dec 2012 19:46:31 +0000
Subject: test commit.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169709 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 166ad33..66ac0b4 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -139,7 +139,7 @@ private:
   typedef SmallVector<PHINode*, 4> PhiVector;
 
   /// Add code that checks at runtime if the accessed arrays overlap.
-  /// Returns the comperator value or NULL if no check is needed.
+  /// Returns the comparator value or NULL if no check is needed.
   Value *addRuntimeCheck(LoopVectorizationLegality *Legal,
                          Instruction *Loc);
   /// Create an empty loop, based on the loop ranges of the old loop.
-- 
cgit v1.1


From 880166684e5af0f5b4bfe26870b9f7813e537354 Mon Sep 17 00:00:00 2001
From: Paul Redmond <paul.redmond@intel.com>
Date: Sun, 9 Dec 2012 20:42:17 +0000
Subject: LoopVectorize: support vectorizing intrinsic calls

- added function to VectorTargetTransformInfo to query cost of intrinsics
- vectorize trivially vectorizable intrinsic calls such as sin, cos, log, etc.

Reviewed by: Nadav


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169711 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 57 ++++++++++++++++++++++++++++--
 1 file changed, 55 insertions(+), 2 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 66ac0b4..c93c2bf 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -62,6 +62,7 @@
 #include "llvm/DerivedTypes.h"
 #include "llvm/Function.h"
 #include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
 #include "llvm/LLVMContext.h"
 #include "llvm/Module.h"
 #include "llvm/Pass.h"
@@ -1051,6 +1052,35 @@ getReductionIdentity(LoopVectorizationLegality::ReductionKind K) {
   }
 }
 
+static bool
+isTriviallyVectorizableIntrinsic(Instruction *Inst) {
+  IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst);
+  if (!II)
+    return false;
+  switch (II->getIntrinsicID()) {
+  case Intrinsic::sqrt:
+  case Intrinsic::sin:
+  case Intrinsic::cos:
+  case Intrinsic::exp:
+  case Intrinsic::exp2:
+  case Intrinsic::log:
+  case Intrinsic::log10:
+  case Intrinsic::log2:
+  case Intrinsic::fabs:
+  case Intrinsic::floor:
+  case Intrinsic::ceil:
+  case Intrinsic::trunc:
+  case Intrinsic::rint:
+  case Intrinsic::nearbyint:
+  case Intrinsic::pow:
+  case Intrinsic::fma:
+    return true;
+  default:
+    return false;
+  }
+  return false;
+}
+
 void
 InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
   //===------------------------------------------------===//
@@ -1509,8 +1539,22 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
         break;
       }
         
+      case Instruction::Call: {
+        assert(isTriviallyVectorizableIntrinsic(it));
+        Module *M = BB->getParent()->getParent();
+        IntrinsicInst *II = cast<IntrinsicInst>(it);
+        Intrinsic::ID ID = II->getIntrinsicID();
+        SmallVector<Value*, 4> Args;
+        for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) 
+          Args.push_back(getVectorValue(II->getArgOperand(i)));
+        Type *Tys[] = { VectorType::get(II->getType()->getScalarType(), VF) };
+        Function *F = Intrinsic::getDeclaration(M, ID, Tys);
+        WidenMap[it] = Builder.CreateCall(F, Args);
+        break;
+      }
+
       default:
-        /// All other instructions are unsupported. Scalarize them.
+        // All other instructions are unsupported. Scalarize them.
         scalarizeInstruction(it);
         break;
     }// end of switch.
@@ -1706,7 +1750,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
 
       // We still don't handle functions.
       CallInst *CI = dyn_cast<CallInst>(it);
-      if (CI) {
+      if (CI && !isTriviallyVectorizableIntrinsic(it)) {
         DEBUG(dbgs() << "LV: Found a call site.\n");
         return false;
       }
@@ -2326,6 +2370,15 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
       Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
       return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
     }
+    case Instruction::Call: {
+      assert(isTriviallyVectorizableIntrinsic(I));
+      IntrinsicInst *II = cast<IntrinsicInst>(I);
+      Type *RetTy = ToVectorTy(II->getType(), VF);
+      SmallVector<Type*, 4> Tys;
+      for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) 
+        Tys.push_back(ToVectorTy(II->getArgOperand(i)->getType(), VF));
+      return VTTI->getIntrinsicInstrCost(II->getIntrinsicID(), RetTy, Tys);
+    }
     default: {
       // We are scalarizing the instruction. Return the cost of the scalar
       // instruction, plus the cost of insert and extract into vector
-- 
cgit v1.1


From f0d19bd1291ee1d2ffee4bbe0aef12b814aff789 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Mon, 10 Dec 2012 19:25:06 +0000
Subject: Add support for reverse induction variables. For example:

while (i--)
 sum+=A[i];



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169752 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 281 ++++++++++++++++++++---------
 1 file changed, 191 insertions(+), 90 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index c93c2bf..593fb79 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -175,9 +175,9 @@ private:
   /// element.
   Value *getBroadcastInstrs(Value *V);
 
-  /// This is a helper function used by getBroadcastInstrs. It adds 0, 1, 2 ..
-  /// for each element in the vector. Starting from zero.
-  Value *getConsecutiveVector(Value* Val);
+  /// This function adds 0, 1, 2 ... to each vector element, starting at zero.
+  /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...).
+  Value *getConsecutiveVector(Value* Val, bool Negate = false);
 
   /// When we go over instructions in the basic block we rely on previous
   /// values within the current basic block or on loop invariant values.
@@ -252,7 +252,7 @@ public:
                               DominatorTree *Dt):
   TheLoop(Lp), SE(Se), DL(Dl), DT(Dt), Induction(0) { }
 
-  /// This represents the kinds of reductions that we support.
+  /// This enum represents the kinds of reductions that we support.
   enum ReductionKind {
     NoReduction, /// Not a reduction.
     IntegerAdd,  /// Sum of numbers.
@@ -262,6 +262,14 @@ public:
     IntegerXor   /// Bitwise or logical XOR of numbers.
   };
 
+  /// This enum represents the kinds of inductions that we support.
+  enum InductionKind {
+    NoInduction,         /// Not an induction variable.
+    IntInduction,        /// Integer induction variable. Step = 1.
+    ReverseIntInduction, /// Reverse int induction variable. Step = -1.
+    PtrInduction         /// Pointer induction variable. Step = sizeof(elem).
+  };
+
   /// This POD struct holds information about reduction variables.
   struct ReductionDescriptor {
     // Default C'tor
@@ -316,13 +324,25 @@ public:
     SmallVector<const SCEV*, 2> Ends;
   };
 
+  /// A POD for saving information about induction variables.
+  struct InductionInfo {
+    /// Ctors.
+    InductionInfo(Value *Start, InductionKind K):
+      StartValue(Start), IK(K) {};
+    InductionInfo(): StartValue(0), IK(NoInduction) {};
+    /// Start value.
+    Value *StartValue;
+    /// Induction kind.
+    InductionKind IK;
+  };
+
   /// ReductionList contains the reduction descriptors for all
   /// of the reductions that were found in the loop.
   typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
 
-  /// InductionList saves induction variables and maps them to the initial
-  /// value entring the loop.
-  typedef DenseMap<PHINode*, Value*> InductionList;
+  /// InductionList saves induction variables and maps them to the
+  /// induction descriptor.
+  typedef DenseMap<PHINode*, InductionInfo> InductionList;
 
   /// Returns true if it is legal to vectorize this loop.
   /// This does not mean that it is profitable to vectorize this
@@ -385,8 +405,9 @@ private:
   /// Returns true if the instruction I can be a reduction variable of type
   /// 'Kind'.
   bool isReductionInstr(Instruction *I, ReductionKind Kind);
-  /// Returns True, if 'Phi' is an induction variable.
-  bool isInductionVariable(PHINode *Phi);
+  /// Returns the induction kind of Phi. This function may return NoInduction
+  /// if the PHI is not an induction variable.
+  InductionKind isInductionVariable(PHINode *Phi);
   /// Return true if can compute the address bounds of Ptr within the loop.
   bool hasComputableBounds(Value *Ptr);
 
@@ -558,7 +579,9 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
   Instruction *Loc = Builder.GetInsertPoint();
 
   // We need to place the broadcast of invariant variables outside the loop.
-  bool Invariant = (OrigLoop->isLoopInvariant(V) && V != Induction);
+  Instruction *Instr = dyn_cast<Instruction>(V);
+  bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
+  bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
 
   // Place the code for broadcasting invariant variables in the new preheader.
   if (Invariant)
@@ -580,19 +603,19 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
   return Shuf;
 }
 
-Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val) {
+Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, bool Negate) {
   assert(Val->getType()->isVectorTy() && "Must be a vector");
   assert(Val->getType()->getScalarType()->isIntegerTy() &&
          "Elem must be an integer");
   // Create the types.
   Type *ITy = Val->getType()->getScalarType();
   VectorType *Ty = cast<VectorType>(Val->getType());
-  unsigned VLen = Ty->getNumElements();
+  int VLen = Ty->getNumElements();
   SmallVector<Constant*, 8> Indices;
 
   // Create a vector of consecutive numbers from zero to VF.
-  for (unsigned i = 0; i < VLen; ++i)
-    Indices.push_back(ConstantInt::get(ITy, i));
+  for (int i = 0; i < VLen; ++i)
+    Indices.push_back(ConstantInt::get(ITy, Negate ? (-i): i ));
 
   // Add the consecutive indices to the vector value.
   Constant *Cv = ConstantVector::get(Indices);
@@ -603,10 +626,13 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val) {
 bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
   assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr");
 
-  // If this pointer is an induction variable, return it.
+  // If this value is a pointer induction variable we know it is consecutive.
   PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
-  if (Phi && getInductionVars()->count(Phi))
-    return true;
+  if (Phi && Inductions.count(Phi)) {
+    InductionInfo II = Inductions[Phi];
+    if (PtrInduction == II.IK)
+      return true;
+  }
 
   GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
   if (!Gep)
@@ -730,7 +756,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
 
 Value*
 InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
-                                           Instruction *Loc) {
+                                     Instruction *Loc) {
   LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
     Legal->getRuntimePointerCheck();
 
@@ -745,7 +771,7 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
   SCEVExpander Exp(*SE, "induction");
 
   // Use this type for pointer arithmetic.
-  Type* PtrArithTy = PtrRtCheck->Pointers[0]->getType();
+  Type* PtrArithTy = Type::getInt8PtrTy(Loc->getContext(), 0);
 
   for (unsigned i = 0; i < NumPointers; ++i) {
     Value *Ptr = PtrRtCheck->Pointers[i];
@@ -759,8 +785,7 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
     } else {
       DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");
 
-      Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i],
-                                       PtrArithTy, Loc);
+      Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc);
       Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
       Starts.push_back(Start);
       Ends.push_back(End);
@@ -769,10 +794,16 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
 
   for (unsigned i = 0; i < NumPointers; ++i) {
     for (unsigned j = i+1; j < NumPointers; ++j) {
+      Instruction::CastOps Op = Instruction::BitCast;
+      Value *Start0 = CastInst::Create(Op, Starts[i], PtrArithTy, "bc", Loc);
+      Value *Start1 = CastInst::Create(Op, Starts[j], PtrArithTy, "bc", Loc);
+      Value *End0 =   CastInst::Create(Op, Ends[i],   PtrArithTy, "bc", Loc);
+      Value *End1 =   CastInst::Create(Op, Ends[j],   PtrArithTy, "bc", Loc);
+
       Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
-                                    Starts[i], Ends[j], "bound0", Loc);
+                                    Start0, End1, "bound0", Loc);
       Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
-                                    Starts[j], Ends[i], "bound1", Loc);
+                                    Start1, End0, "bound1", Loc);
       Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1,
                                                  "found.conflict", Loc);
       if (MemoryRuntimeCheck)
@@ -936,27 +967,54 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
   for (I = List->begin(), E = List->end(); I != E; ++I) {
     PHINode *OrigPhi = I->first;
+    LoopVectorizationLegality::InductionInfo II = I->second;
     PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val",
                                            MiddleBlock->getTerminator());
     Value *EndValue = 0;
-    if (OrigPhi->getType()->isIntegerTy()) {
+    switch (II.IK) {
+    case LoopVectorizationLegality::NoInduction:
+      llvm_unreachable("Unknown induction");
+    case LoopVectorizationLegality::IntInduction: {
       // Handle the integer induction counter:
+      assert(OrigPhi->getType()->isIntegerTy() && "Invalid type");
       assert(OrigPhi == OldInduction && "Unknown integer PHI");
       // We know what the end value is.
       EndValue = IdxEndRoundDown;
       // We also know which PHI node holds it.
       ResumeIndex = ResumeVal;
-    } else {
+      break;
+    }
+    case LoopVectorizationLegality::ReverseIntInduction: {
+      // Convert the CountRoundDown variable to the PHI size.
+      unsigned CRDSize = CountRoundDown->getType()->getScalarSizeInBits();
+      unsigned IISize = II.StartValue->getType()->getScalarSizeInBits();
+      Value *CRD = CountRoundDown;
+      if (CRDSize > IISize)
+        CRD = CastInst::Create(Instruction::Trunc, CountRoundDown,
+                               II.StartValue->getType(),
+                               "tr.crd", BypassBlock->getTerminator());
+      else if (CRDSize < IISize)
+        CRD = CastInst::Create(Instruction::SExt, CountRoundDown,
+                               II.StartValue->getType(),
+                               "sext.crd", BypassBlock->getTerminator());
+      // Handle reverse integer induction counter:
+      EndValue = BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end",
+                                           BypassBlock->getTerminator());
+      break;
+    }
+    case LoopVectorizationLegality::PtrInduction: {
       // For pointer induction variables, calculate the offset using
       // the end index.
-      EndValue = GetElementPtrInst::Create(I->second, CountRoundDown,
+      EndValue = GetElementPtrInst::Create(II.StartValue, CountRoundDown,
                                            "ptr.ind.end",
                                            BypassBlock->getTerminator());
+      break;
     }
+    }// end of case
 
     // The new PHI merges the original incoming value, in case of a bypass,
     // or the value at the end of the vectorized loop.
-    ResumeVal->addIncoming(I->second, BypassBlock);
+    ResumeVal->addIncoming(II.StartValue, BypassBlock);
     ResumeVal->addIncoming(EndValue, VecBody);
 
     // Fix the scalar body counter (PHI node).
@@ -1188,19 +1246,19 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
       switch (RdxDesc.Kind) {
         case LoopVectorizationLegality::IntegerAdd:
-          Scalar0 = Builder.CreateAdd(Scalar0, Scalar1);
+          Scalar0 = Builder.CreateAdd(Scalar0, Scalar1, "add.rdx");
           break;
         case LoopVectorizationLegality::IntegerMult:
-          Scalar0 = Builder.CreateMul(Scalar0, Scalar1);
+          Scalar0 = Builder.CreateMul(Scalar0, Scalar1, "mul.rdx");
           break;
         case LoopVectorizationLegality::IntegerOr:
-          Scalar0 = Builder.CreateOr(Scalar0, Scalar1);
+          Scalar0 = Builder.CreateOr(Scalar0, Scalar1, "or.rdx");
           break;
         case LoopVectorizationLegality::IntegerAnd:
-          Scalar0 = Builder.CreateAnd(Scalar0, Scalar1);
+          Scalar0 = Builder.CreateAnd(Scalar0, Scalar1, "and.rdx");
           break;
         case LoopVectorizationLegality::IntegerXor:
-          Scalar0 = Builder.CreateXor(Scalar0, Scalar1);
+          Scalar0 = Builder.CreateXor(Scalar0, Scalar1, "xor.rdx");
           break;
         default:
           llvm_unreachable("Unknown reduction operation");
@@ -1282,7 +1340,7 @@ Value *InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
 
 void
 InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
-                                            BasicBlock *BB, PhiVector *PV) {
+                                          BasicBlock *BB, PhiVector *PV) {
   Constant *Zero =
   ConstantInt::get(IntegerType::getInt32Ty(BB->getContext()), 0);
 
@@ -1329,45 +1387,77 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
         assert(Legal->getInductionVars()->count(P) &&
                "Not an induction variable");
 
-        if (P->getType()->isIntegerTy()) {
+        LoopVectorizationLegality::InductionInfo II =
+          Legal->getInductionVars()->lookup(P);
+
+        switch (II.IK) {
+        case LoopVectorizationLegality::NoInduction:
+          llvm_unreachable("Unknown induction");
+        case LoopVectorizationLegality::IntInduction: {
           assert(P == OldInduction && "Unexpected PHI");
           Value *Broadcasted = getBroadcastInstrs(Induction);
           // After broadcasting the induction variable we need to make the
           // vector consecutive by adding 0, 1, 2 ...
           Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted);
-
           WidenMap[OldInduction] = ConsecutiveInduction;
           continue;
         }
+        case LoopVectorizationLegality::ReverseIntInduction:
+        case LoopVectorizationLegality::PtrInduction:
+          // Handle reverse integer and pointer inductions.
+          Value *StartIdx = 0;
+          // If we have a single integer induction variable then use it.
+          // Otherwise, start counting at zero.
+          if (OldInduction) {
+            LoopVectorizationLegality::InductionInfo OldII =
+              Legal->getInductionVars()->lookup(OldInduction);
+            StartIdx = OldII.StartValue;
+          } else {
+            StartIdx = ConstantInt::get(Induction->getType(), 0);
+          }
+          // This is the normalized index that starts counting at zero.
+          Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
+                                                   "normalized.idx");
+
+          // Handle the reverse integer induction variable case.
+          if (LoopVectorizationLegality::ReverseIntInduction == II.IK) {
+            IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType());
+            Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy,
+                                                   "resize.norm.idx");
+            Value *ReverseInd  = Builder.CreateSub(II.StartValue, CNI,
+                                                   "reverse.idx");
+
+            // This is a new value so do not hoist it out.
+            Value *Broadcasted = getBroadcastInstrs(ReverseInd);
+            // After broadcasting the induction variable we need to make the
+            // vector consecutive by adding 0, -1, -2, ...
+            Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted,
+                                                               true);
+            WidenMap[it] = ConsecutiveInduction;
+            continue;
+          }
 
-        // Handle pointer inductions.
-        assert(P->getType()->isPointerTy() && "Unexpected type.");
-        Value *StartIdx = OldInduction ?
-        Legal->getInductionVars()->lookup(OldInduction) :
-        ConstantInt::get(Induction->getType(), 0);
-
-        // This is the pointer value coming into the loop.
-        Value *StartPtr = Legal->getInductionVars()->lookup(P);
-
-        // This is the normalized GEP that starts counting at zero.
-        Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
-                                                 "normalized.idx");
-
-        // This is the vector of results. Notice that we don't generate vector
-        // geps because scalar geps result in better code.
-        Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
-        for (unsigned int i = 0; i < VF; ++i) {
-          Constant *Idx = ConstantInt::get(Induction->getType(), i);
-          Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
-          Value *SclrGep = Builder.CreateGEP(StartPtr, GlobalIdx, "next.gep");
-          VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
-                                               Builder.getInt32(i),
-                                               "insert.gep");
+          // Handle the pointer induction variable case.
+          assert(P->getType()->isPointerTy() && "Unexpected type.");
+
+          // This is the vector of results. Notice that we don't generate vector
+          // geps because scalar geps result in better code.
+          Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
+          for (unsigned int i = 0; i < VF; ++i) {
+            Constant *Idx = ConstantInt::get(Induction->getType(), i);
+            Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
+            Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, "next.gep");
+            VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
+                                                 Builder.getInt32(i),
+                                                 "insert.gep");
+          }
+
+          WidenMap[it] = VecVal;
+          continue;
         }
 
-        WidenMap[it] = VecVal;
-        continue;
-      }
+      }// End of PHI.
+
       case Instruction::Add:
       case Instruction::FAdd:
       case Instruction::Sub:
@@ -1561,7 +1651,6 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
   }// end of for_each instr.
 }
 
-
 void InnerLoopVectorizer::updateAnalysis() {
   // Forget the original basic block.
   SE->forgetLoop(OrigLoop);
@@ -1580,7 +1669,6 @@ void InnerLoopVectorizer::updateAnalysis() {
   DEBUG(DT->verifyAnalysis());
 }
 
-
 bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
   if (!EnableIfConversion)
     return false;
@@ -1694,35 +1782,39 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
           return false;
         }
 
+        // Check that this PHI type is allowed.
+        if (!Phi->getType()->isIntegerTy() &&
+            !Phi->getType()->isPointerTy()) {
+          DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
+          return false;
+        }
+
         // If this PHINode is not in the header block, then we know that we
-        // can convert it to select during if-conversion.
+        // can convert it to select during if-conversion. No need to check if
+        // the PHIs in this block are induction or reduction variables.
         if (*bb != Header)
           continue;
 
         // This is the value coming from the preheader.
         Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
+        // Check if this is an induction variable.
+        InductionKind IK = isInductionVariable(Phi);
+
+        if (NoInduction != IK) {
+          // Int inductions are special because we only allow one IV.
+          if (IK == IntInduction) {
+            if (Induction) {
+              DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
+              return false;
+            }
+            Induction = Phi;
+          }
 
-        // We only look at integer and pointer phi nodes.
-        if (Phi->getType()->isPointerTy() && isInductionVariable(Phi)) {
-          DEBUG(dbgs() << "LV: Found a pointer induction variable.\n");
-          Inductions[Phi] = StartValue;
+          DEBUG(dbgs() << "LV: Found an induction variable.\n");
+          Inductions[Phi] = InductionInfo(StartValue, IK);
           continue;
-        } else if (!Phi->getType()->isIntegerTy()) {
-          DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
-          return false;
         }
 
-        // Handle integer PHIs:
-        if (isInductionVariable(Phi)) {
-          if (Induction) {
-            DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
-            return false;
-          }
-          DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n");
-          Induction = Phi;
-          Inductions[Phi] = StartValue;
-          continue;
-        }
         if (AddReductionVar(Phi, IntegerAdd)) {
           DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
           continue;
@@ -2119,32 +2211,42 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I,
     }
 }
 
-bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
+LoopVectorizationLegality::InductionKind // Kind of Phi, or NoInduction.
+LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
   Type *PhiTy = Phi->getType();
   // We only handle integer and pointer inductions variables.
   if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
-    return false;
+    return NoInduction;
 
   // Check that the PHI is consecutive and starts at zero.
   const SCEV *PhiScev = SE->getSCEV(Phi);
   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
   if (!AR) {
     DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
-    return false;
+    return NoInduction;
   }
   const SCEV *Step = AR->getStepRecurrence(*SE);
 
   // Integer inductions need to have a stride of one.
-  if (PhiTy->isIntegerTy())
-    return Step->isOne();
+  if (PhiTy->isIntegerTy()) {
+    if (Step->isOne()) // Step = 1: forward integer counter.
+      return IntInduction;
+    if (Step->isAllOnesValue()) // Step = -1: reverse integer counter.
+      return ReverseIntInduction;
+    return NoInduction; // Any other integer stride is rejected.
+  }
 
   // Calculate the pointer stride and check if it is consecutive.
   const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
-  if (!C) return false;
+  if (!C)
+    return NoInduction;
 
   assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
   uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType());
-  return (C->getValue()->equalsInt(Size));
+  if (C->getValue()->equalsInt(Size)) // Stride equals the pointee alloc size.
+    return PtrInduction;
+
+  return NoInduction;
 }
 
 bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB)  {
@@ -2252,7 +2354,6 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   Type *RetTy = I->getType();
   Type *VectorTy = ToVectorTy(RetTy, VF);
 
-
   // TODO: We need to estimate the cost of intrinsic calls.
   switch (I->getOpcode()) {
     case Instruction::GetElementPtr:
-- 
cgit v1.1


From d1d92bf953d51560e979337cadcc9d7e62fdd79e Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Mon, 10 Dec 2012 21:39:02 +0000
Subject: Split the LoopVectorizer into H and CPP.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169771 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 1486 ++++++++++------------------
 1 file changed, 535 insertions(+), 951 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 593fb79..feeecec 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6,45 +6,7 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
-// and generates target-independent LLVM-IR. Legalization of the IR is done
-// in the codegen. However, the vectorizes uses (will use) the codegen
-// interfaces to generate IR that is likely to result in an optimal binary.
-//
-// The loop vectorizer combines consecutive loop iteration into a single
-// 'wide' iteration. After this transformation the index is incremented
-// by the SIMD vector width, and not by one.
-//
-// This pass has three parts:
-// 1. The main loop pass that drives the different parts.
-// 2. LoopVectorizationLegality - A unit that checks for the legality
-//    of the vectorization.
-// 3. InnerLoopVectorizer - A unit that performs the actual
-//    widening of instructions.
-// 4. LoopVectorizationCostModel - A unit that checks for the profitability
-//    of vectorization. It decides on the optimal vector width, which
-//    can be one, if vectorization is not profitable.
-//
-//===----------------------------------------------------------------------===//
-//
-// The reduction-variable vectorization is based on the paper:
-//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
-//
-// Variable uniformity checks are inspired by:
-// Karrenberg, R. and Hack, S. Whole Function Vectorization.
-//
-// Other ideas/concepts are from:
-//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
-//
-//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
-//  Vectorizing Compilers.
-//
-//===----------------------------------------------------------------------===//
-#define LV_NAME "loop-vectorize"
-#define DEBUG_TYPE LV_NAME
-#include "llvm/Transforms/Vectorize.h"
-#include "llvm/ADT/SmallVector.h"
+#include "LoopVectorize.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AliasSetTracker.h"
@@ -52,7 +14,7 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopIterator.h"
 #include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -73,423 +35,21 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Vectorize.h"
 #include "llvm/Type.h"
 #include "llvm/Value.h"
-#include <algorithm>
-using namespace llvm;
 
 static cl::opt<unsigned>
 VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
-          cl::desc("Set the default vectorization width. Zero is autoselect."));
+                    cl::desc("Sets the SIMD width. Zero is autoselect."));
 
 static cl::opt<bool>
 EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                    cl::desc("Enable if-conversion during vectorization."));
 
-/// We don't vectorize loops with a known constant trip count below this number.
-const unsigned TinyTripCountThreshold = 16;
-
-/// When performing a runtime memory check, do not check more than this
-/// number of pointers. Notice that the check is quadratic!
-const unsigned RuntimeMemoryCheckThreshold = 4;
-
-/// This is the highest vector width that we try to generate.
-const unsigned MaxVectorSize = 8;
-
 namespace {
 
-// Forward declarations.
-class LoopVectorizationLegality;
-class LoopVectorizationCostModel;
-
-/// InnerLoopVectorizer vectorizes loops which contain only one basic
-/// block to a specified vectorization factor (VF).
-/// This class performs the widening of scalars into vectors, or multiple
-/// scalars. This class also implements the following features:
-/// * It inserts an epilogue loop for handling loops that don't have iteration
-///   counts that are known to be a multiple of the vectorization factor.
-/// * It handles the code generation for reduction variables.
-/// * Scalarization (implementation using scalars) of un-vectorizable
-///   instructions.
-/// InnerLoopVectorizer does not perform any vectorization-legality
-/// checks, and relies on the caller to check for the different legality
-/// aspects. The InnerLoopVectorizer relies on the
-/// LoopVectorizationLegality class to provide information about the induction
-/// and reduction variables that were found to a given vectorization factor.
-class InnerLoopVectorizer {
-public:
-  /// Ctor.
-  InnerLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li,
-                      DominatorTree *Dt, DataLayout *Dl, unsigned VecWidth):
-  OrigLoop(Orig), SE(Se), LI(Li), DT(Dt), DL(Dl), VF(VecWidth),
-  Builder(Se->getContext()), Induction(0), OldInduction(0) { }
-
-  // Perform the actual loop widening (vectorization).
-  void vectorize(LoopVectorizationLegality *Legal) {
-    // Create a new empty loop. Unlink the old loop and connect the new one.
-    createEmptyLoop(Legal);
-    // Widen each instruction in the old loop to a new one in the new loop.
-    // Use the Legality module to find the induction and reduction variables.
-    vectorizeLoop(Legal);
-    // Register the new loop and update the analysis passes.
-    updateAnalysis();
- }
-
-private:
-  /// A small list of PHINodes.
-  typedef SmallVector<PHINode*, 4> PhiVector;
-
-  /// Add code that checks at runtime if the accessed arrays overlap.
-  /// Returns the comparator value or NULL if no check is needed.
-  Value *addRuntimeCheck(LoopVectorizationLegality *Legal,
-                         Instruction *Loc);
-  /// Create an empty loop, based on the loop ranges of the old loop.
-  void createEmptyLoop(LoopVectorizationLegality *Legal);
-  /// Copy and widen the instructions from the old loop.
-  void vectorizeLoop(LoopVectorizationLegality *Legal);
-
-  /// A helper function that computes the predicate of the block BB, assuming
-  /// that the header block of the loop is set to True. It returns the *entry*
-  /// mask for the block BB.
-  Value *createBlockInMask(BasicBlock *BB);
-  /// A helper function that computes the predicate of the edge between SRC
-  /// and DST.
-  Value *createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
-
-  /// A helper function to vectorize a single BB within the innermost loop.
-  void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB,
-                            PhiVector *PV);
-
-  /// Insert the new loop to the loop hierarchy and pass manager
-  /// and update the analysis passes.
-  void updateAnalysis();
-
-  /// This instruction is un-vectorizable. Implement it as a sequence
-  /// of scalars.
-  void scalarizeInstruction(Instruction *Instr);
-
-  /// Create a broadcast instruction. This method generates a broadcast
-  /// instruction (shuffle) for loop invariant values and for the induction
-  /// value. If this is the induction variable then we extend it to N, N+1, ...
-  /// this is needed because each iteration in the loop corresponds to a SIMD
-  /// element.
-  Value *getBroadcastInstrs(Value *V);
-
-  /// This function adds 0, 1, 2 ... to each vector element, starting at zero.
-  /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...).
-  Value *getConsecutiveVector(Value* Val, bool Negate = false);
-
-  /// When we go over instructions in the basic block we rely on previous
-  /// values within the current basic block or on loop invariant values.
-  /// When we widen (vectorize) values we place them in the map. If the values
-  /// are not within the map, they have to be loop invariant, so we simply
-  /// broadcast them into a vector.
-  Value *getVectorValue(Value *V);
-
-  /// Get a uniform vector of constant integers. We use this to get
-  /// vectors of ones and zeros for the reduction code.
-  Constant* getUniformVector(unsigned Val, Type* ScalarTy);
-
-  typedef DenseMap<Value*, Value*> ValueMap;
-
-  /// The original loop.
-  Loop *OrigLoop;
-  // Scev analysis to use.
-  ScalarEvolution *SE;
-  // Loop Info.
-  LoopInfo *LI;
-  // Dominator Tree.
-  DominatorTree *DT;
-  // Data Layout.
-  DataLayout *DL;
-  // The vectorization factor to use.
-  unsigned VF;
-
-  // The builder that we use
-  IRBuilder<> Builder;
-
-  // --- Vectorization state ---
-
-  /// The vector-loop preheader.
-  BasicBlock *LoopVectorPreHeader;
-  /// The scalar-loop preheader.
-  BasicBlock *LoopScalarPreHeader;
-  /// Middle Block between the vector and the scalar.
-  BasicBlock *LoopMiddleBlock;
-  ///The ExitBlock of the scalar loop.
-  BasicBlock *LoopExitBlock;
-  ///The vector loop body.
-  BasicBlock *LoopVectorBody;
-  ///The scalar loop body.
-  BasicBlock *LoopScalarBody;
-  ///The first bypass block.
-  BasicBlock *LoopBypassBlock;
-
-  /// The new Induction variable which was added to the new block.
-  PHINode *Induction;
-  /// The induction variable of the old basic block.
-  PHINode *OldInduction;
-  // Maps scalars to widened vectors.
-  ValueMap WidenMap;
-};
-
-/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
-/// to what vectorization factor.
-/// This class does not look at the profitability of vectorization, only the
-/// legality. This class has two main kinds of checks:
-/// * Memory checks - The code in canVectorizeMemory checks if vectorization
-///   will change the order of memory accesses in a way that will change the
-///   correctness of the program.
-/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
-/// checks for a number of different conditions, such as the availability of a
-/// single induction variable, that all types are supported and vectorize-able,
-/// etc. This code reflects the capabilities of InnerLoopVectorizer.
-/// This class is also used by InnerLoopVectorizer for identifying
-/// induction variable and the different reduction variables.
-class LoopVectorizationLegality {
-public:
-  LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl,
-                              DominatorTree *Dt):
-  TheLoop(Lp), SE(Se), DL(Dl), DT(Dt), Induction(0) { }
-
-  /// This enum represents the kinds of reductions that we support.
-  enum ReductionKind {
-    NoReduction, /// Not a reduction.
-    IntegerAdd,  /// Sum of numbers.
-    IntegerMult, /// Product of numbers.
-    IntegerOr,   /// Bitwise or logical OR of numbers.
-    IntegerAnd,  /// Bitwise or logical AND of numbers.
-    IntegerXor   /// Bitwise or logical XOR of numbers.
-  };
-
-  /// This enum represents the kinds of inductions that we support.
-  enum InductionKind {
-    NoInduction,         /// Not an induction variable.
-    IntInduction,        /// Integer induction variable. Step = 1.
-    ReverseIntInduction, /// Reverse int induction variable. Step = -1.
-    PtrInduction         /// Pointer induction variable. Step = sizeof(elem).
-  };
-
-  /// This POD struct holds information about reduction variables.
-  struct ReductionDescriptor {
-    // Default C'tor
-    ReductionDescriptor():
-    StartValue(0), LoopExitInstr(0), Kind(NoReduction) {}
-
-    // C'tor.
-    ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K):
-    StartValue(Start), LoopExitInstr(Exit), Kind(K) {}
-
-    // The starting value of the reduction.
-    // It does not have to be zero!
-    Value *StartValue;
-    // The instruction who's value is used outside the loop.
-    Instruction *LoopExitInstr;
-    // The kind of the reduction.
-    ReductionKind Kind;
-  };
-
-  // This POD struct holds information about the memory runtime legality
-  // check that a group of pointers do not overlap.
-  struct RuntimePointerCheck {
-    RuntimePointerCheck(): Need(false) {}
-
-    /// Reset the state of the pointer runtime information.
-    void reset() {
-      Need = false;
-      Pointers.clear();
-      Starts.clear();
-      Ends.clear();
-    }
-
-    /// Insert a pointer and calculate the start and end SCEVs.
-    void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr) {
-      const SCEV *Sc = SE->getSCEV(Ptr);
-      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
-      assert(AR && "Invalid addrec expression");
-      const SCEV *Ex = SE->getExitCount(Lp, Lp->getLoopLatch());
-      const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
-      Pointers.push_back(Ptr);
-      Starts.push_back(AR->getStart());
-      Ends.push_back(ScEnd);
-    }
-
-    /// This flag indicates if we need to add the runtime check.
-    bool Need;
-    /// Holds the pointers that we need to check.
-    SmallVector<Value*, 2> Pointers;
-    /// Holds the pointer value at the beginning of the loop.
-    SmallVector<const SCEV*, 2> Starts;
-    /// Holds the pointer value at the end of the loop.
-    SmallVector<const SCEV*, 2> Ends;
-  };
-
-  /// A POD for saving information about induction variables.
-  struct InductionInfo {
-    /// Ctors.
-    InductionInfo(Value *Start, InductionKind K):
-      StartValue(Start), IK(K) {};
-    InductionInfo(): StartValue(0), IK(NoInduction) {};
-    /// Start value.
-    Value *StartValue;
-    /// Induction kind.
-    InductionKind IK;
-  };
-
-  /// ReductionList contains the reduction descriptors for all
-  /// of the reductions that were found in the loop.
-  typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
-
-  /// InductionList saves induction variables and maps them to the
-  /// induction descriptor.
-  typedef DenseMap<PHINode*, InductionInfo> InductionList;
-
-  /// Returns true if it is legal to vectorize this loop.
-  /// This does not mean that it is profitable to vectorize this
-  /// loop, only that it is legal to do so.
-  bool canVectorize();
-
-  /// Returns the Induction variable.
-  PHINode *getInduction() {return Induction;}
-
-  /// Returns the reduction variables found in the loop.
-  ReductionList *getReductionVars() { return &Reductions; }
-
-  /// Returns the induction variables found in the loop.
-  InductionList *getInductionVars() { return &Inductions; }
-
-  /// Return true if the block BB needs to be predicated in order for the loop
-  /// to be vectorized.
-  bool blockNeedsPredication(BasicBlock *BB);
-
-  /// Check if this  pointer is consecutive when vectorizing. This happens
-  /// when the last index of the GEP is the induction variable, or that the
-  /// pointer itself is an induction variable.
-  /// This check allows us to vectorize A[idx] into a wide load/store.
-  bool isConsecutivePtr(Value *Ptr);
-
-  /// Returns true if the value V is uniform within the loop.
-  bool isUniform(Value *V);
-
-  /// Returns true if this instruction will remain scalar after vectorization.
-  bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);}
-
-  /// Returns the information that we collected about runtime memory check.
-  RuntimePointerCheck *getRuntimePointerCheck() {return &PtrRtCheck; }
-private:
-  /// Check if a single basic block loop is vectorizable.
-  /// At this point we know that this is a loop with a constant trip count
-  /// and we only need to check individual instructions.
-  bool canVectorizeInstrs();
-
-  /// When we vectorize loops we may change the order in which
-  /// we read and write from memory. This method checks if it is
-  /// legal to vectorize the code, considering only memory constrains.
-  /// Returns true if the loop is vectorizable
-  bool canVectorizeMemory();
-
-  /// Return true if we can vectorize this loop using the IF-conversion
-  /// transformation.
-  bool canVectorizeWithIfConvert();
-
-  /// Collect the variables that need to stay uniform after vectorization.
-  void collectLoopUniforms();
-
-  /// Return true if all of the instructions in the block can be speculatively
-  /// executed.
-  bool blockCanBePredicated(BasicBlock *BB);
-
-  /// Returns True, if 'Phi' is the kind of reduction variable for type
-  /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
-  bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
-  /// Returns true if the instruction I can be a reduction variable of type
-  /// 'Kind'.
-  bool isReductionInstr(Instruction *I, ReductionKind Kind);
-  /// Returns the induction kind of Phi. This function may return NoInduction
-  /// if the PHI is not an induction variable.
-  InductionKind isInductionVariable(PHINode *Phi);
-  /// Return true if can compute the address bounds of Ptr within the loop.
-  bool hasComputableBounds(Value *Ptr);
-
-  /// The loop that we evaluate.
-  Loop *TheLoop;
-  /// Scev analysis.
-  ScalarEvolution *SE;
-  /// DataLayout analysis.
-  DataLayout *DL;
-  // Dominators.
-  DominatorTree *DT;
-
-  //  ---  vectorization state --- //
-
-  /// Holds the integer induction variable. This is the counter of the
-  /// loop.
-  PHINode *Induction;
-  /// Holds the reduction variables.
-  ReductionList Reductions;
-  /// Holds all of the induction variables that we found in the loop.
-  /// Notice that inductions don't need to start at zero and that induction
-  /// variables can be pointers.
-  InductionList Inductions;
-
-  /// Allowed outside users. This holds the reduction
-  /// vars which can be accessed from outside the loop.
-  SmallPtrSet<Value*, 4> AllowedExit;
-  /// This set holds the variables which are known to be uniform after
-  /// vectorization.
-  SmallPtrSet<Instruction*, 4> Uniforms;
-  /// We need to check that all of the pointers in this list are disjoint
-  /// at runtime.
-  RuntimePointerCheck PtrRtCheck;
-};
-
-/// LoopVectorizationCostModel - estimates the expected speedups due to
-/// vectorization.
-/// In many cases vectorization is not profitable. This can happen because
-/// of a number of reasons. In this class we mainly attempt to predict
-/// the expected speedup/slowdowns due to the supported instruction set.
-/// We use the VectorTargetTransformInfo to query the different backends
-/// for the cost of different operations.
-class LoopVectorizationCostModel {
-public:
-  /// C'tor.
-  LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se,
-                             LoopVectorizationLegality *Leg,
-                             const VectorTargetTransformInfo *Vtti):
-  TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { }
-
-  /// Returns the most profitable vectorization factor for the loop that is
-  /// smaller or equal to the VF argument. This method checks every power
-  /// of two up to VF.
-  unsigned findBestVectorizationFactor(unsigned VF = MaxVectorSize);
-
-private:
-  /// Returns the expected execution cost. The unit of the cost does
-  /// not matter because we use the 'cost' units to compare different
-  /// vector widths. The cost that is returned is *not* normalized by
-  /// the factor width.
-  unsigned expectedCost(unsigned VF);
-
-  /// Returns the execution time cost of an instruction for a given vector
-  /// width. Vector width of one means scalar.
-  unsigned getInstructionCost(Instruction *I, unsigned VF);
-
-  /// A helper function for converting Scalar types to vector types.
-  /// If the incoming type is void, we return void. If the VF is 1, we return
-  /// the scalar type.
-  static Type* ToVectorTy(Type *Scalar, unsigned VF);
-
-  /// The loop that we evaluate.
-  Loop *TheLoop;
-  /// Scev analysis.
-  ScalarEvolution *SE;
-
-  /// Vectorization legality.
-  LoopVectorizationLegality *Legal;
-  /// Vector target information.
-  const VectorTargetTransformInfo *VTTI;
-};
-
+/// The LoopVectorize Pass.
 struct LoopVectorize : public LoopPass {
   static char ID; // Pass identification, replacement for typeid
 
@@ -569,6 +129,26 @@ struct LoopVectorize : public LoopPass {
 
 };
 
+}// namespace
+
+//===----------------------------------------------------------------------===//
+// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
+// LoopVectorizationCostModel.
+//===----------------------------------------------------------------------===//
+
+void
+LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE,
+                                                       Loop *Lp, Value *Ptr) {
+  const SCEV *Sc = SE->getSCEV(Ptr);
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
+  assert(AR && "Invalid addrec expression");
+  const SCEV *Ex = SE->getExitCount(Lp, Lp->getLoopLatch());
+  const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
+  Pointers.push_back(Ptr);
+  Starts.push_back(AR->getStart());
+  Ends.push_back(ScEnd);
+}
+
 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
   // Create the types.
   LLVMContext &C = V->getContext();
@@ -594,7 +174,7 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
   Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero);
   // Broadcast the scalar into all locations in the vector.
   Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros,
-                                             "broadcast");
+                                            "broadcast");
 
   // Restore the builder insertion point.
   if (Invariant)
@@ -758,7 +338,7 @@ Value*
 InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
                                      Instruction *Loc) {
   LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
-    Legal->getRuntimePointerCheck();
+  Legal->getRuntimePointerCheck();
 
   if (!PtrRtCheck->Need)
     return NULL;
@@ -827,26 +407,26 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
    the vectorized instructions while the old loop will continue to run the
    scalar remainder.
 
-    [ ] <-- vector loop bypass.
-  /  |
- /   v
-|   [ ]     <-- vector pre header.
-|    |
-|    v
-|   [  ] \
-|   [  ]_|   <-- vector loop.
-|    |
- \   v
+    [ ] <-- vector loop bypass.
+  /  |
+ /   v
+|   [ ]     <-- vector pre header.
+|    |
+|    v
+|   [  ] \
+|   [  ]_|   <-- vector loop.
+|    |
+ \   v
    >[ ]   <--- middle-block.
-  /  |
- /   v
-|   [ ]     <--- new preheader.
-|    |
-|    v
-|   [ ] \
-|   [ ]_|   <-- old scalar loop to handle remainder.
- \   |
-  \  v
+  /  |
+ /   v
+|   [ ]     <--- new preheader.
+|    |
+|    v
+|   [ ] \
+|   [ ]_|   <-- old scalar loop to handle remainder.
+ \   |
+  \  v
    >[ ]     <-- exit block.
    ...
    */
@@ -862,7 +442,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   // don't have a single induction variable.
   OldInduction = Legal->getInduction();
   Type *IdxTy = OldInduction ? OldInduction->getType() :
-    DL->getIntPtrType(SE->getContext());
+  DL->getIntPtrType(SE->getContext());
 
   // Find the loop boundaries.
   const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getLoopLatch());
@@ -884,8 +464,8 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   // value from the induction PHI node. If we don't have an induction variable
   // then we know that it starts at zero.
   Value *StartIdx = OldInduction ?
-    OldInduction->getIncomingValueForBlock(BypassBlock):
-    ConstantInt::get(IdxTy, 0);
+  OldInduction->getIncomingValueForBlock(BypassBlock):
+  ConstantInt::get(IdxTy, 0);
 
   assert(BypassBlock && "Invalid loop structure");
 
@@ -895,13 +475,13 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
 
   // Split the single block loop into the two loop structure described above.
   BasicBlock *VectorPH =
-      BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
+  BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
   BasicBlock *VecBody =
-    VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
+  VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
   BasicBlock *MiddleBlock =
-    VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
+  VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
   BasicBlock *ScalarPH =
-    MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
+  MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
 
   // This is the location in which we add all of the logic for bypassing
   // the new vector loop.
@@ -958,8 +538,8 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   // PHIs that are left in the scalar version of the loop.
   // The starting values of PHI nodes depend on the counter of the last
   // iteration in the vectorized loop.
-  // If we come from a bypass edge then we need to start from the original start
-  // value.
+  // If we come from a bypass edge then we need to start from the original
+  // start value.
 
   // This variable saves the new starting index for the scalar loop.
   PHINode *ResumeIndex = 0;
@@ -969,7 +549,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
     PHINode *OrigPhi = I->first;
     LoopVectorizationLegality::InductionInfo II = I->second;
     PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val",
-                                           MiddleBlock->getTerminator());
+                                         MiddleBlock->getTerminator());
     Value *EndValue = 0;
     switch (II.IK) {
     case LoopVectorizationLegality::NoInduction:
@@ -1149,8 +729,8 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
   //
   //===------------------------------------------------===//
   BasicBlock &BB = *OrigLoop->getHeader();
-  Constant *Zero = ConstantInt::get(
-    IntegerType::getInt32Ty(BB.getContext()), 0);
+  Constant *Zero =
+  ConstantInt::get(IntegerType::getInt32Ty(BB.getContext()), 0);
 
   // In order to support reduction variables we need to be able to vectorize
   // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
@@ -1191,7 +771,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     assert(Legal->getReductionVars()->count(RdxPhi) &&
            "Unable to find the reduction variable");
     LoopVectorizationLegality::ReductionDescriptor RdxDesc =
-      (*Legal->getReductionVars())[RdxPhi];
+    (*Legal->getReductionVars())[RdxPhi];
 
     // We need to generate a reduction vector from the incoming scalar.
     // To do so, we need to generate the 'identity' vector and overide
@@ -1211,7 +791,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     // This vector is the Identity vector where the first element is the
     // incoming scalar reduction.
     Value *VectorStart = Builder.CreateInsertElement(Identity,
-                                                    RdxDesc.StartValue, Zero);
+                                                     RdxDesc.StartValue, Zero);
 
     // Fix the vector-loop phi.
     // We created the induction variable so we know that the
@@ -1239,29 +819,29 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
 
     // Extract the first scalar.
     Value *Scalar0 =
-      Builder.CreateExtractElement(NewPhi, Builder.getInt32(0));
+    Builder.CreateExtractElement(NewPhi, Builder.getInt32(0));
     // Extract and reduce the remaining vector elements.
     for (unsigned i=1; i < VF; ++i) {
       Value *Scalar1 =
-        Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
+      Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
       switch (RdxDesc.Kind) {
-        case LoopVectorizationLegality::IntegerAdd:
-          Scalar0 = Builder.CreateAdd(Scalar0, Scalar1, "add.rdx");
-          break;
-        case LoopVectorizationLegality::IntegerMult:
-          Scalar0 = Builder.CreateMul(Scalar0, Scalar1, "mul.rdx");
-          break;
-        case LoopVectorizationLegality::IntegerOr:
-          Scalar0 = Builder.CreateOr(Scalar0, Scalar1, "or.rdx");
-          break;
-        case LoopVectorizationLegality::IntegerAnd:
-          Scalar0 = Builder.CreateAnd(Scalar0, Scalar1, "and.rdx");
-          break;
-        case LoopVectorizationLegality::IntegerXor:
-          Scalar0 = Builder.CreateXor(Scalar0, Scalar1, "xor.rdx");
-          break;
-        default:
-          llvm_unreachable("Unknown reduction operation");
+      case LoopVectorizationLegality::IntegerAdd:
+        Scalar0 = Builder.CreateAdd(Scalar0, Scalar1, "add.rdx");
+        break;
+      case LoopVectorizationLegality::IntegerMult:
+        Scalar0 = Builder.CreateMul(Scalar0, Scalar1, "mul.rdx");
+        break;
+      case LoopVectorizationLegality::IntegerOr:
+        Scalar0 = Builder.CreateOr(Scalar0, Scalar1, "or.rdx");
+        break;
+      case LoopVectorizationLegality::IntegerAnd:
+        Scalar0 = Builder.CreateAnd(Scalar0, Scalar1, "and.rdx");
+        break;
+      case LoopVectorizationLegality::IntegerXor:
+        Scalar0 = Builder.CreateXor(Scalar0, Scalar1, "xor.rdx");
+        break;
+      default:
+        llvm_unreachable("Unknown reduction operation");
       }
     }
 
@@ -1323,13 +903,14 @@ Value *InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
 
   // Loop incoming mask is all-one.
-  if (OrigLoop->getHeader() == BB)
-    return getVectorValue(
-      ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1));
+  if (OrigLoop->getHeader() == BB) {
+    Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
+    return getVectorValue(C);
+  }
 
   // This is the block mask. We OR all incoming edges, and with zero.
-  Value *BlockMask = getVectorValue(
-    ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0));
+  Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
+  Value *BlockMask = getVectorValue(Zero);
 
   // For each pred:
   for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it)
@@ -1347,306 +928,308 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
   // For each instruction in the old loop.
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
     switch (it->getOpcode()) {
-      case Instruction::Br:
-        // Nothing to do for PHIs and BR, since we already took care of the
-        // loop control flow instructions.
-        continue;
-      case Instruction::PHI:{
-        PHINode* P = cast<PHINode>(it);
-        // Handle reduction variables:
-        if (Legal->getReductionVars()->count(P)) {
-          // This is phase one of vectorizing PHIs.
-          Type *VecTy = VectorType::get(it->getType(), VF);
-          WidenMap[it] =
+    case Instruction::Br:
+      // Nothing to do for branches, since we already took care of the
+      // loop control flow instructions.
+      continue;
+    case Instruction::PHI:{
+      PHINode* P = cast<PHINode>(it);
+      // Handle reduction variables:
+      if (Legal->getReductionVars()->count(P)) {
+        // This is phase one of vectorizing PHIs.
+        Type *VecTy = VectorType::get(it->getType(), VF);
+        WidenMap[it] =
           PHINode::Create(VecTy, 2, "vec.phi",
                           LoopVectorBody->getFirstInsertionPt());
-          PV->push_back(P);
-          continue;
-        }
-
-        // Check for PHI nodes that are lowered to vector selects.
-        if (P->getParent() != OrigLoop->getHeader()) {
-          // We know that all PHIs in non header blocks are converted into
-          // selects, so we don't have to worry about the insertion order and we
-          // can just use the builder.
-
-          // At this point we generate the predication tree. There may be
-          // duplications since this is a simple recursive scan, but future
-          // optimizations will clean it up.
-          Value *Cond = createBlockInMask(P->getIncomingBlock(0));
-          WidenMap[P] =
-            Builder.CreateSelect(Cond,
-                                 getVectorValue(P->getIncomingValue(0)),
-                                 getVectorValue(P->getIncomingValue(1)),
-                                 "predphi");
-          continue;
-        }
-
-        // This PHINode must be an induction variable.
-        // Make sure that we know about it.
-        assert(Legal->getInductionVars()->count(P) &&
-               "Not an induction variable");
+        PV->push_back(P);
+        continue;
+      }
 
-        LoopVectorizationLegality::InductionInfo II =
-          Legal->getInductionVars()->lookup(P);
+      // Check for PHI nodes that are lowered to vector selects.
+      if (P->getParent() != OrigLoop->getHeader()) {
+        // We know that all PHIs in non header blocks are converted into
+        // selects, so we don't have to worry about the insertion order and we
+        // can just use the builder.
+
+        // At this point we generate the predication tree. There may be
+        // duplications since this is a simple recursive scan, but future
+        // optimizations will clean it up.
+        Value *Cond = createBlockInMask(P->getIncomingBlock(0));
+        WidenMap[P] =
+          Builder.CreateSelect(Cond,
+                               getVectorValue(P->getIncomingValue(0)),
+                               getVectorValue(P->getIncomingValue(1)),
+                               "predphi");
+        continue;
+      }
 
-        switch (II.IK) {
-        case LoopVectorizationLegality::NoInduction:
-          llvm_unreachable("Unknown induction");
-        case LoopVectorizationLegality::IntInduction: {
-          assert(P == OldInduction && "Unexpected PHI");
-          Value *Broadcasted = getBroadcastInstrs(Induction);
+      // This PHINode must be an induction variable.
+      // Make sure that we know about it.
+      assert(Legal->getInductionVars()->count(P) &&
+             "Not an induction variable");
+
+      LoopVectorizationLegality::InductionInfo II =
+        Legal->getInductionVars()->lookup(P);
+
+      switch (II.IK) {
+      case LoopVectorizationLegality::NoInduction:
+        llvm_unreachable("Unknown induction");
+      case LoopVectorizationLegality::IntInduction: {
+        assert(P == OldInduction && "Unexpected PHI");
+        Value *Broadcasted = getBroadcastInstrs(Induction);
+        // After broadcasting the induction variable we need to make the
+        // vector consecutive by adding 0, 1, 2 ...
+        Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted);
+        WidenMap[OldInduction] = ConsecutiveInduction;
+        continue;
+      }
+      case LoopVectorizationLegality::ReverseIntInduction:
+      case LoopVectorizationLegality::PtrInduction:
+        // Handle reverse integer and pointer inductions.
+        Value *StartIdx = 0;
+        // If we have a single integer induction variable then use it.
+        // Otherwise, start counting at zero.
+        if (OldInduction) {
+          LoopVectorizationLegality::InductionInfo OldII =
+            Legal->getInductionVars()->lookup(OldInduction);
+          StartIdx = OldII.StartValue;
+        } else {
+          StartIdx = ConstantInt::get(Induction->getType(), 0);
+        }
+        // This is the normalized index that starts counting at zero.
+        Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
+                                                 "normalized.idx");
+
+        // Handle the reverse integer induction variable case.
+        if (LoopVectorizationLegality::ReverseIntInduction == II.IK) {
+          IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType());
+          Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy,
+                                                 "resize.norm.idx");
+          Value *ReverseInd  = Builder.CreateSub(II.StartValue, CNI,
+                                                 "reverse.idx");
+
+          // This is a new value so do not hoist it out.
+          Value *Broadcasted = getBroadcastInstrs(ReverseInd);
           // After broadcasting the induction variable we need to make the
-          // vector consecutive by adding 0, 1, 2 ...
-          Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted);
-          WidenMap[OldInduction] = ConsecutiveInduction;
+          // vector consecutive by adding  ... -3, -2, -1, 0.
+          Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted,
+                                                             true);
+          WidenMap[it] = ConsecutiveInduction;
           continue;
         }
-        case LoopVectorizationLegality::ReverseIntInduction:
-        case LoopVectorizationLegality::PtrInduction:
-          // Handle reverse integer and pointer inductions.
-          Value *StartIdx = 0;
-          // If we have a single integer induction variable then use it.
-          // Otherwise, start counting at zero.
-          if (OldInduction) {
-            LoopVectorizationLegality::InductionInfo OldII =
-              Legal->getInductionVars()->lookup(OldInduction);
-            StartIdx = OldII.StartValue;
-          } else {
-            StartIdx = ConstantInt::get(Induction->getType(), 0);
-          }
-          // This is the normalized GEP that starts counting at zero.
-          Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
-                                                   "normalized.idx");
-
-          // Handle the reverse integer induction variable case.
-          if (LoopVectorizationLegality::ReverseIntInduction == II.IK) {
-            IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType());
-            Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy,
-                                                   "resize.norm.idx");
-            Value *ReverseInd  = Builder.CreateSub(II.StartValue, CNI,
-                                                   "reverse.idx");
-
-            // This is a new value so do not hoist it out.
-            Value *Broadcasted = getBroadcastInstrs(ReverseInd);
-            // After broadcasting the induction variable we need to make the
-            // vector consecutive by adding  ... -3, -2, -1, 0.
-            Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted,
-                                                               true);
-            WidenMap[it] = ConsecutiveInduction;
-            continue;
-          }
-
-          // Handle the pointer induction variable case.
-          assert(P->getType()->isPointerTy() && "Unexpected type.");
-
-          // This is the vector of results. Notice that we don't generate vector
-          // geps because scalar geps result in better code.
-          Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
-          for (unsigned int i = 0; i < VF; ++i) {
-            Constant *Idx = ConstantInt::get(Induction->getType(), i);
-            Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
-            Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, "next.gep");
-            VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
-                                                 Builder.getInt32(i),
-                                                 "insert.gep");
-          }
 
-          WidenMap[it] = VecVal;
-          continue;
+        // Handle the pointer induction variable case.
+        assert(P->getType()->isPointerTy() && "Unexpected type.");
+
+        // This is the vector of results. Notice that we don't generate
+        // vector geps because scalar geps result in better code.
+        Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
+        for (unsigned int i = 0; i < VF; ++i) {
+          Constant *Idx = ConstantInt::get(Induction->getType(), i);
+          Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx,
+                                               "gep.idx");
+          Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
+                                             "next.gep");
+          VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
+                                               Builder.getInt32(i),
+                                               "insert.gep");
         }
 
-      }// End of PHI.
-
-      case Instruction::Add:
-      case Instruction::FAdd:
-      case Instruction::Sub:
-      case Instruction::FSub:
-      case Instruction::Mul:
-      case Instruction::FMul:
-      case Instruction::UDiv:
-      case Instruction::SDiv:
-      case Instruction::FDiv:
-      case Instruction::URem:
-      case Instruction::SRem:
-      case Instruction::FRem:
-      case Instruction::Shl:
-      case Instruction::LShr:
-      case Instruction::AShr:
-      case Instruction::And:
-      case Instruction::Or:
-      case Instruction::Xor: {
-        // Just widen binops.
-        BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it);
-        Value *A = getVectorValue(it->getOperand(0));
-        Value *B = getVectorValue(it->getOperand(1));
-
-        // Use this vector value for all users of the original instruction.
-        Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
-        WidenMap[it] = V;
-
-        // Update the NSW, NUW and Exact flags.
-        BinaryOperator *VecOp = cast<BinaryOperator>(V);
-        if (isa<OverflowingBinaryOperator>(BinOp)) {
-          VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
-          VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
-        }
-        if (isa<PossiblyExactOperator>(VecOp))
-          VecOp->setIsExact(BinOp->isExact());
-        break;
-      }
-      case Instruction::Select: {
-        // Widen selects.
-        // If the selector is loop invariant we can create a select
-        // instruction with a scalar condition. Otherwise, use vector-select.
-        Value *Cond = it->getOperand(0);
-        bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop);
-
-        // The condition can be loop invariant  but still defined inside the
-        // loop. This means that we can't just use the original 'cond' value.
-        // We have to take the 'vectorized' value and pick the first lane.
-        // Instcombine will make this a no-op.
-        Cond = getVectorValue(Cond);
-        if (InvariantCond)
-          Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0));
-
-        Value *Op0 = getVectorValue(it->getOperand(1));
-        Value *Op1 = getVectorValue(it->getOperand(2));
-        WidenMap[it] = Builder.CreateSelect(Cond, Op0, Op1);
-        break;
+        WidenMap[it] = VecVal;
+        continue;
       }
 
-      case Instruction::ICmp:
-      case Instruction::FCmp: {
-        // Widen compares. Generate vector compares.
-        bool FCmp = (it->getOpcode() == Instruction::FCmp);
-        CmpInst *Cmp = dyn_cast<CmpInst>(it);
-        Value *A = getVectorValue(it->getOperand(0));
-        Value *B = getVectorValue(it->getOperand(1));
-        if (FCmp)
-          WidenMap[it] = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
-        else
-          WidenMap[it] = Builder.CreateICmp(Cmp->getPredicate(), A, B);
-        break;
+    }// End of PHI.
+
+    case Instruction::Add:
+    case Instruction::FAdd:
+    case Instruction::Sub:
+    case Instruction::FSub:
+    case Instruction::Mul:
+    case Instruction::FMul:
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::FDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor: {
+      // Just widen binops.
+      BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it);
+      Value *A = getVectorValue(it->getOperand(0));
+      Value *B = getVectorValue(it->getOperand(1));
+
+      // Use this vector value for all users of the original instruction.
+      Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
+      WidenMap[it] = V;
+
+      // Update the NSW, NUW and Exact flags.
+      BinaryOperator *VecOp = cast<BinaryOperator>(V);
+      if (isa<OverflowingBinaryOperator>(BinOp)) {
+        VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
+        VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
       }
+      if (isa<PossiblyExactOperator>(VecOp))
+        VecOp->setIsExact(BinOp->isExact());
+      break;
+    }
+    case Instruction::Select: {
+      // Widen selects.
+      // If the selector is loop invariant we can create a select
+      // instruction with a scalar condition. Otherwise, use vector-select.
+      Value *Cond = it->getOperand(0);
+      bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop);
+
+      // The condition can be loop invariant but still defined inside the
+      // loop. This means that we can't just use the original 'cond' value.
+      // We have to take the 'vectorized' value and pick the first lane.
+      // Instcombine will make this a no-op.
+      Cond = getVectorValue(Cond);
+      if (InvariantCond)
+        Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0));
+
+      Value *Op0 = getVectorValue(it->getOperand(1));
+      Value *Op1 = getVectorValue(it->getOperand(2));
+      WidenMap[it] = Builder.CreateSelect(Cond, Op0, Op1);
+      break;
+    }
 
-      case Instruction::Store: {
-        // Attempt to issue a wide store.
-        StoreInst *SI = dyn_cast<StoreInst>(it);
-        Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF);
-        Value *Ptr = SI->getPointerOperand();
-        unsigned Alignment = SI->getAlignment();
+    case Instruction::ICmp:
+    case Instruction::FCmp: {
+      // Widen compares. Generate vector compares.
+      bool FCmp = (it->getOpcode() == Instruction::FCmp);
+      CmpInst *Cmp = dyn_cast<CmpInst>(it);
+      Value *A = getVectorValue(it->getOperand(0));
+      Value *B = getVectorValue(it->getOperand(1));
+      if (FCmp)
+        WidenMap[it] = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
+      else
+        WidenMap[it] = Builder.CreateICmp(Cmp->getPredicate(), A, B);
+      break;
+    }
 
-        assert(!Legal->isUniform(Ptr) &&
-               "We do not allow storing to uniform addresses");
+    case Instruction::Store: {
+      // Attempt to issue a wide store.
+      StoreInst *SI = dyn_cast<StoreInst>(it);
+      Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF);
+      Value *Ptr = SI->getPointerOperand();
+      unsigned Alignment = SI->getAlignment();
 
-        GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+      assert(!Legal->isUniform(Ptr) &&
+             "We do not allow storing to uniform addresses");
 
-        // This store does not use GEPs.
-        if (!Legal->isConsecutivePtr(Ptr)) {
-          scalarizeInstruction(it);
-          break;
-        }
+      GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
 
-        if (Gep) {
-          // The last index does not have to be the induction. It can be
-          // consecutive and be a function of the index. For example A[I+1];
-          unsigned NumOperands = Gep->getNumOperands();
-          Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1));
-          LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
-
-          // Create the new GEP with the new induction variable.
-          GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
-          Gep2->setOperand(NumOperands - 1, LastIndex);
-          Ptr = Builder.Insert(Gep2);
-        } else {
-          // Use the induction element ptr.
-          assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
-          Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
-        }
-        Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
-        Value *Val = getVectorValue(SI->getValueOperand());
-        Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
+      // If the pointer is not consecutive, scalarize the store.
+      if (!Legal->isConsecutivePtr(Ptr)) {
+        scalarizeInstruction(it);
         break;
       }
-      case Instruction::Load: {
-        // Attempt to issue a wide load.
-        LoadInst *LI = dyn_cast<LoadInst>(it);
-        Type *RetTy = VectorType::get(LI->getType(), VF);
-        Value *Ptr = LI->getPointerOperand();
-        unsigned Alignment = LI->getAlignment();
-        GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
-
-        // If the pointer is loop invariant or if it is non consecutive,
-        // scalarize the load.
-        bool Con = Legal->isConsecutivePtr(Ptr);
-        if (Legal->isUniform(Ptr) || !Con) {
-          scalarizeInstruction(it);
-          break;
-        }
 
-        if (Gep) {
-          // The last index does not have to be the induction. It can be
-          // consecutive and be a function of the index. For example A[I+1];
-          unsigned NumOperands = Gep->getNumOperands();
-          Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
-          LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
-
-          // Create the new GEP with the new induction variable.
-          GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
-          Gep2->setOperand(NumOperands - 1, LastIndex);
-          Ptr = Builder.Insert(Gep2);
-        } else {
-          // Use the induction element ptr.
-          assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
-          Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
-        }
-
-        Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
-        LI = Builder.CreateLoad(Ptr);
-        LI->setAlignment(Alignment);
-        // Use this vector value for all users of the load.
-        WidenMap[it] = LI;
-        break;
+      if (Gep) {
+        // The last index does not have to be the induction. It can be
+        // consecutive and be a function of the index. For example A[I+1];
+        unsigned NumOperands = Gep->getNumOperands();
+        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1));
+        LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
+
+        // Create the new GEP with the new induction variable.
+        GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+        Gep2->setOperand(NumOperands - 1, LastIndex);
+        Ptr = Builder.Insert(Gep2);
+      } else {
+        // Use the induction element ptr.
+        assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
+        Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
       }
-      case Instruction::ZExt:
-      case Instruction::SExt:
-      case Instruction::FPToUI:
-      case Instruction::FPToSI:
-      case Instruction::FPExt:
-      case Instruction::PtrToInt:
-      case Instruction::IntToPtr:
-      case Instruction::SIToFP:
-      case Instruction::UIToFP:
-      case Instruction::Trunc:
-      case Instruction::FPTrunc:
-      case Instruction::BitCast: {
-        /// Vectorize bitcasts.
-        CastInst *CI = dyn_cast<CastInst>(it);
-        Value *A = getVectorValue(it->getOperand(0));
-        Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
-        WidenMap[it] = Builder.CreateCast(CI->getOpcode(), A, DestTy);
+      Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
+      Value *Val = getVectorValue(SI->getValueOperand());
+      Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
+      break;
+    }
+    case Instruction::Load: {
+      // Attempt to issue a wide load.
+      LoadInst *LI = dyn_cast<LoadInst>(it);
+      Type *RetTy = VectorType::get(LI->getType(), VF);
+      Value *Ptr = LI->getPointerOperand();
+      unsigned Alignment = LI->getAlignment();
+      GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+
+      // If the pointer is loop invariant or if it is non consecutive,
+      // scalarize the load.
+      bool Con = Legal->isConsecutivePtr(Ptr);
+      if (Legal->isUniform(Ptr) || !Con) {
+        scalarizeInstruction(it);
         break;
       }
-        
-      case Instruction::Call: {
-        assert(isTriviallyVectorizableIntrinsic(it));
-        Module *M = BB->getParent()->getParent();
-        IntrinsicInst *II = cast<IntrinsicInst>(it);
-        Intrinsic::ID ID = II->getIntrinsicID();
-        SmallVector<Value*, 4> Args;
-        for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) 
-          Args.push_back(getVectorValue(II->getArgOperand(i)));
-        Type *Tys[] = { VectorType::get(II->getType()->getScalarType(), VF) };
-        Function *F = Intrinsic::getDeclaration(M, ID, Tys);
-        WidenMap[it] = Builder.CreateCall(F, Args);
-        break;
+
+      if (Gep) {
+        // The last index does not have to be the induction. It can be
+        // consecutive and be a function of the index. For example A[I+1];
+        unsigned NumOperands = Gep->getNumOperands();
+        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
+        LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
+
+        // Create the new GEP with the new induction variable.
+        GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+        Gep2->setOperand(NumOperands - 1, LastIndex);
+        Ptr = Builder.Insert(Gep2);
+      } else {
+        // Use the induction element ptr.
+        assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
+        Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
       }
 
-      default:
-        // All other instructions are unsupported. Scalarize them.
-        scalarizeInstruction(it);
-        break;
+      Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
+      LI = Builder.CreateLoad(Ptr);
+      LI->setAlignment(Alignment);
+      // Use this vector value for all users of the load.
+      WidenMap[it] = LI;
+      break;
+    }
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::FPExt:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::SIToFP:
+    case Instruction::UIToFP:
+    case Instruction::Trunc:
+    case Instruction::FPTrunc:
+    case Instruction::BitCast: {
+      /// Vectorize casts.
+      CastInst *CI = dyn_cast<CastInst>(it);
+      Value *A = getVectorValue(it->getOperand(0));
+      Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
+      WidenMap[it] = Builder.CreateCast(CI->getOpcode(), A, DestTy);
+      break;
+    }
+
+    case Instruction::Call: {
+      assert(isTriviallyVectorizableIntrinsic(it));
+      Module *M = BB->getParent()->getParent();
+      IntrinsicInst *II = cast<IntrinsicInst>(it);
+      Intrinsic::ID ID = II->getIntrinsicID();
+      SmallVector<Value*, 4> Args;
+      for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i)
+        Args.push_back(getVectorValue(II->getArgOperand(i)));
+      Type *Tys[] = { VectorType::get(II->getType()->getScalarType(), VF) };
+      Function *F = Intrinsic::getDeclaration(M, ID, Tys);
+      WidenMap[it] = Builder.CreateCall(F, Args);
+      break;
+    }
+
+    default:
+      // All other instructions are unsupported. Scalarize them.
+      scalarizeInstruction(it);
+      break;
     }// end of switch.
   }// end of for_each instr.
 }
@@ -1958,8 +1541,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
   // Check if we see any stores. If there are no stores, then we don't
   // care if the pointers are *restrict*.
   if (!Stores.size()) {
-        DEBUG(dbgs() << "LV: Found a read-only loop!\n");
-        return true;
+    DEBUG(dbgs() << "LV: Found a read-only loop!\n");
+    return true;
   }
 
   // Holds the read and read-write *pointers* that we find.
@@ -2171,15 +1754,15 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
     // We found a reduction var if we have reached the original
     // phi node and we only have a single instruction with out-of-loop
     // users.
-   if (FoundStartPHI && ExitInstruction) {
-     // This instruction is allowed to have out-of-loop users.
-     AllowedExit.insert(ExitInstruction);
+    if (FoundStartPHI && ExitInstruction) {
+      // This instruction is allowed to have out-of-loop users.
+      AllowedExit.insert(ExitInstruction);
 
-     // Save the description of this reduction variable.
-     ReductionDescriptor RD(RdxStart, ExitInstruction, Kind);
-     Reductions[Phi] = RD;
-     return true;
-   }
+      // Save the description of this reduction variable.
+      ReductionDescriptor RD(RdxStart, ExitInstruction, Kind);
+      Reductions[Phi] = RD;
+      return true;
+    }
 
     // If we've reached the start PHI but did not find an outside user then
     // this is dead code. Abort.
@@ -2191,24 +1774,24 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
 bool
 LoopVectorizationLegality::isReductionInstr(Instruction *I,
                                             ReductionKind Kind) {
-    switch (I->getOpcode()) {
-    default:
-      return false;
-    case Instruction::PHI:
-      // possibly.
-      return true;
-    case Instruction::Add:
-    case Instruction::Sub:
-      return Kind == IntegerAdd;
-    case Instruction::Mul:
-      return Kind == IntegerMult;
-    case Instruction::And:
-      return Kind == IntegerAnd;
-    case Instruction::Or:
-      return Kind == IntegerOr;
-    case Instruction::Xor:
-      return Kind == IntegerXor;
-    }
+  switch (I->getOpcode()) {
+  default:
+    return false;
+  case Instruction::PHI:
+    // possibly.
+    return true;
+  case Instruction::Add:
+  case Instruction::Sub:
+    return Kind == IntegerAdd;
+  case Instruction::Mul:
+    return Kind == IntegerMult;
+  case Instruction::And:
+    return Kind == IntegerAnd;
+  case Instruction::Or:
+    return Kind == IntegerOr;
+  case Instruction::Xor:
+    return Kind == IntegerXor;
+  }
 }
 
 LoopVectorizationLegality::InductionKind
@@ -2265,12 +1848,12 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) {
 
     // The isntructions below can trap.
     switch (it->getOpcode()) {
-      default: continue;
-      case Instruction::UDiv:
-      case Instruction::SDiv:
-      case Instruction::URem:
-      case Instruction::SRem:
-        return false;
+    default: continue;
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+             return false;
     }
   }
 
@@ -2356,153 +1939,154 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
 
   // TODO: We need to estimate the cost of intrinsic calls.
   switch (I->getOpcode()) {
-    case Instruction::GetElementPtr:
-      // We mark this instruction as zero-cost because scalar GEPs are usually
-      // lowered to the intruction addressing mode. At the moment we don't
-      // generate vector geps.
-      return 0;
-    case Instruction::Br: {
-      return VTTI->getCFInstrCost(I->getOpcode());
-    }
-    case Instruction::PHI:
-      //TODO: IF-converted IFs become selects.
-      return 0;
-    case Instruction::Add:
-    case Instruction::FAdd:
-    case Instruction::Sub:
-    case Instruction::FSub:
-    case Instruction::Mul:
-    case Instruction::FMul:
-    case Instruction::UDiv:
-    case Instruction::SDiv:
-    case Instruction::FDiv:
-    case Instruction::URem:
-    case Instruction::SRem:
-    case Instruction::FRem:
-    case Instruction::Shl:
-    case Instruction::LShr:
-    case Instruction::AShr:
-    case Instruction::And:
-    case Instruction::Or:
-    case Instruction::Xor:
-      return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy);
-    case Instruction::Select: {
-      SelectInst *SI = cast<SelectInst>(I);
-      const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
-      bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
-      Type *CondTy = SI->getCondition()->getType();
-      if (ScalarCond)
-        CondTy = VectorType::get(CondTy, VF);
-
-      return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
-    }
-    case Instruction::ICmp:
-    case Instruction::FCmp: {
-      Type *ValTy = I->getOperand(0)->getType();
-      VectorTy = ToVectorTy(ValTy, VF);
-      return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy);
-    }
-    case Instruction::Store: {
-      StoreInst *SI = cast<StoreInst>(I);
-      Type *ValTy = SI->getValueOperand()->getType();
-      VectorTy = ToVectorTy(ValTy, VF);
-
-      if (VF == 1)
-        return VTTI->getMemoryOpCost(I->getOpcode(), ValTy,
-                              SI->getAlignment(), SI->getPointerAddressSpace());
-
-      // Scalarized stores.
-      if (!Legal->isConsecutivePtr(SI->getPointerOperand())) {
-        unsigned Cost = 0;
-        unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
-                                              ValTy);
-        // The cost of extracting from the value vector.
-        Cost += VF * (ExtCost);
-        // The cost of the scalar stores.
-        Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
-                                           ValTy->getScalarType(),
-                                           SI->getAlignment(),
-                                           SI->getPointerAddressSpace());
-        return Cost;
-      }
-
-      // Wide stores.
-      return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(),
+  case Instruction::GetElementPtr:
+    // We mark this instruction as zero-cost because scalar GEPs are usually
+    // lowered to the intruction addressing mode. At the moment we don't
+    // generate vector geps.
+    return 0;
+  case Instruction::Br: {
+    return VTTI->getCFInstrCost(I->getOpcode());
+  }
+  case Instruction::PHI:
+    //TODO: IF-converted IFs become selects.
+    return 0;
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy);
+  case Instruction::Select: {
+    SelectInst *SI = cast<SelectInst>(I);
+    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
+    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
+    Type *CondTy = SI->getCondition()->getType();
+    if (ScalarCond)
+      CondTy = VectorType::get(CondTy, VF);
+
+    return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
+  }
+  case Instruction::ICmp:
+  case Instruction::FCmp: {
+    Type *ValTy = I->getOperand(0)->getType();
+    VectorTy = ToVectorTy(ValTy, VF);
+    return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy);
+  }
+  case Instruction::Store: {
+    StoreInst *SI = cast<StoreInst>(I);
+    Type *ValTy = SI->getValueOperand()->getType();
+    VectorTy = ToVectorTy(ValTy, VF);
+
+    if (VF == 1)
+      return VTTI->getMemoryOpCost(I->getOpcode(), ValTy,
+                                   SI->getAlignment(),
                                    SI->getPointerAddressSpace());
+
+    // Scalarized stores.
+    if (!Legal->isConsecutivePtr(SI->getPointerOperand())) {
+      unsigned Cost = 0;
+      unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
+                                            ValTy);
+      // The cost of extracting from the value vector.
+      Cost += VF * (ExtCost);
+      // The cost of the scalar stores.
+      Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
+                                         ValTy->getScalarType(),
+                                         SI->getAlignment(),
+                                         SI->getPointerAddressSpace());
+      return Cost;
     }
-    case Instruction::Load: {
-      LoadInst *LI = cast<LoadInst>(I);
-
-      if (VF == 1)
-        return VTTI->getMemoryOpCost(I->getOpcode(), RetTy,
-                                     LI->getAlignment(),
-                                     LI->getPointerAddressSpace());
-
-      // Scalarized loads.
-      if (!Legal->isConsecutivePtr(LI->getPointerOperand())) {
-        unsigned Cost = 0;
-        unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy);
-        // The cost of inserting the loaded value into the result vector.
-        Cost += VF * (InCost);
-        // The cost of the scalar stores.
-        Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
-                                           RetTy->getScalarType(),
-                                           LI->getAlignment(),
-                                           LI->getPointerAddressSpace());
-        return Cost;
-      }
 
-      // Wide loads.
-      return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(),
+    // Wide stores.
+    return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(),
+                                 SI->getPointerAddressSpace());
+  }
+  case Instruction::Load: {
+    LoadInst *LI = cast<LoadInst>(I);
+
+    if (VF == 1)
+      return VTTI->getMemoryOpCost(I->getOpcode(), RetTy,
+                                   LI->getAlignment(),
                                    LI->getPointerAddressSpace());
-    }
-    case Instruction::ZExt:
-    case Instruction::SExt:
-    case Instruction::FPToUI:
-    case Instruction::FPToSI:
-    case Instruction::FPExt:
-    case Instruction::PtrToInt:
-    case Instruction::IntToPtr:
-    case Instruction::SIToFP:
-    case Instruction::UIToFP:
-    case Instruction::Trunc:
-    case Instruction::FPTrunc:
-    case Instruction::BitCast: {
-      Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
-      return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
-    }
-    case Instruction::Call: {
-      assert(isTriviallyVectorizableIntrinsic(I));
-      IntrinsicInst *II = cast<IntrinsicInst>(I);
-      Type *RetTy = ToVectorTy(II->getType(), VF);
-      SmallVector<Type*, 4> Tys;
-      for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) 
-        Tys.push_back(ToVectorTy(II->getArgOperand(i)->getType(), VF));
-      return VTTI->getIntrinsicInstrCost(II->getIntrinsicID(), RetTy, Tys);
-    }
-    default: {
-      // We are scalarizing the instruction. Return the cost of the scalar
-      // instruction, plus the cost of insert and extract into vector
-      // elements, times the vector width.
+
+    // Scalarized loads.
+    if (!Legal->isConsecutivePtr(LI->getPointerOperand())) {
       unsigned Cost = 0;
+      unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy);
+      // The cost of inserting the loaded value into the result vector.
+      Cost += VF * (InCost);
+      // The cost of the scalar stores.
+      Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
+                                         RetTy->getScalarType(),
+                                         LI->getAlignment(),
+                                         LI->getPointerAddressSpace());
+      return Cost;
+    }
 
-      bool IsVoid = RetTy->isVoidTy();
+    // Wide loads.
+    return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(),
+                                 LI->getPointerAddressSpace());
+  }
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::FPExt:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::SIToFP:
+  case Instruction::UIToFP:
+  case Instruction::Trunc:
+  case Instruction::FPTrunc:
+  case Instruction::BitCast: {
+    Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
+    return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
+  }
+  case Instruction::Call: {
+    assert(isTriviallyVectorizableIntrinsic(I));
+    IntrinsicInst *II = cast<IntrinsicInst>(I);
+    Type *RetTy = ToVectorTy(II->getType(), VF);
+    SmallVector<Type*, 4> Tys;
+    for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i)
+      Tys.push_back(ToVectorTy(II->getArgOperand(i)->getType(), VF));
+    return VTTI->getIntrinsicInstrCost(II->getIntrinsicID(), RetTy, Tys);
+  }
+  default: {
+    // We are scalarizing the instruction. Return the cost of the scalar
+    // instruction, plus the cost of insert and extract into vector
+    // elements, times the vector width.
+    unsigned Cost = 0;
 
-      unsigned InsCost = (IsVoid ? 0 :
-                          VTTI->getInstrCost(Instruction::InsertElement,
-                                             VectorTy));
+    bool IsVoid = RetTy->isVoidTy();
 
-      unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
-                                            VectorTy);
+    unsigned InsCost = (IsVoid ? 0 :
+                        VTTI->getInstrCost(Instruction::InsertElement,
+                                           VectorTy));
 
-      // The cost of inserting the results plus extracting each one of the
-      // operands.
-      Cost += VF * (InsCost + ExtCost * I->getNumOperands());
+    unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
+                                          VectorTy);
 
-      // The cost of executing VF copies of the scalar instruction.
-      Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy);
-      return Cost;
-    }
+    // The cost of inserting the results plus extracting each one of the
+    // operands.
+    Cost += VF * (InsCost + ExtCost * I->getNumOperands());
+
+    // The cost of executing VF copies of the scalar instruction.
+    Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy);
+    return Cost;
+  }
   }// end of switch.
 }
 
@@ -2512,8 +2096,6 @@ Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
   return VectorType::get(Scalar, VF);
 }
 
-} // namespace
-
 char LoopVectorize::ID = 0;
 static const char lv_name[] = "Loop Vectorization";
 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
@@ -2527,3 +2109,5 @@ namespace llvm {
     return new LoopVectorize();
   }
 }
+
+
-- 
cgit v1.1


From cfb6285fdb27fba97ddc3ebf6485110d9a157ac5 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Tue, 11 Dec 2012 04:55:10 +0000
Subject: Fix PR14565. Don't if-convert loops that have switch statements in
 them.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169813 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index feeecec..a691240 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1263,6 +1263,10 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
   for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) {
     BasicBlock *BB = LoopBlocks[i];
 
+    // We don't support switch statements inside loops.
+    if (!isa<BranchInst>(BB->getTerminator()))
+      return false;
+
     // We must have at most two predecessors because we need to convert
     // all PHIs to selects.
     unsigned Preds = std::distance(pred_begin(BB), pred_end(BB));
-- 
cgit v1.1


From 5e9efa10fc7eea582bb1ad216364fb2c467e4477 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Tue, 11 Dec 2012 18:58:10 +0000
Subject: Loop Vectorize: optimize the vectorization of trunc(induction_var).
 The truncation is now done on scalars.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169904 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index a691240..9550e8a 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1204,8 +1204,20 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
     case Instruction::Trunc:
     case Instruction::FPTrunc:
     case Instruction::BitCast: {
-      /// Vectorize bitcasts.
       CastInst *CI = dyn_cast<CastInst>(it);
+      /// Optimize the special case where the source is the induction
+      /// variable. Notice that we can only optimize the 'trunc' case
+      /// because: a. FP conversions lose precision, b. sext/zext may wrap,
+      /// c. other casts depend on pointer size.
+      if (CI->getOperand(0) == OldInduction &&
+          it->getOpcode() == Instruction::Trunc) {
+        Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
+                                               CI->getType());
+        Value *Broadcasted = getBroadcastInstrs(ScalarCast);
+        WidenMap[it] = getConsecutiveVector(Broadcasted);
+        break;
+      }
+      /// Vectorize casts.
       Value *A = getVectorValue(it->getOperand(0));
       Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
       WidenMap[it] = Builder.CreateCast(CI->getOpcode(), A, DestTy);
-- 
cgit v1.1


From 655d2c5354fcd44c329d99428c7d9196bc78dbad Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Tue, 11 Dec 2012 21:30:14 +0000
Subject: PR14574. Fix a bug in the code that calculates the mask for the
 converted PHIs in if-conversion.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169916 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 9550e8a..baa655c 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -954,7 +954,7 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
         // At this point we generate the predication tree. There may be
         // duplications since this is a simple recursive scan, but future
         // optimizations will clean it up.
-        Value *Cond = createBlockInMask(P->getIncomingBlock(0));
+        Value *Cond = createEdgeMask(P->getIncomingBlock(0), P->getParent());
         WidenMap[P] =
           Builder.CreateSelect(Cond,
                                getVectorValue(P->getIncomingValue(0)),
-- 
cgit v1.1


From 5da804150d418b8b4956b84013f8f67df553c543 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Wed, 12 Dec 2012 01:11:46 +0000
Subject: LoopVectorizer: When -Os is used, vectorize only loops that don't
 require a tail loop. There is no testcase because I don't know of a way to
 initialize the loop vectorizer pass without adding an additional hidden flag.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169950 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 82 ++++++++++++++++++++++--------
 1 file changed, 60 insertions(+), 22 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index baa655c..d796716 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -51,9 +51,12 @@ namespace {
 
 /// The LoopVectorize Pass.
 struct LoopVectorize : public LoopPass {
-  static char ID; // Pass identification, replacement for typeid
+  /// Pass identification, replacement for typeid
+  static char ID;
+  /// Optimize for size. Do not generate tail loops.
+  bool OptForSize;
 
-  LoopVectorize() : LoopPass(ID) {
+  explicit LoopVectorize(bool OptSz = false) : LoopPass(ID), OptForSize(OptSz) {
     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
   }
 
@@ -85,23 +88,17 @@ struct LoopVectorize : public LoopPass {
     }
 
     // Select the preffered vectorization factor.
-    unsigned VF = 1;
-    if (VectorizationFactor == 0) {
-      const VectorTargetTransformInfo *VTTI = 0;
-      if (TTI)
-        VTTI = TTI->getVectorTargetTransformInfo();
-      // Use the cost model.
-      LoopVectorizationCostModel CM(L, SE, &LVL, VTTI);
-      VF = CM.findBestVectorizationFactor();
-
-      if (VF == 1) {
-        DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
-        return false;
-      }
-
-    } else {
-      // Use the user command flag.
-      VF = VectorizationFactor;
+    const VectorTargetTransformInfo *VTTI = 0;
+    if (TTI)
+      VTTI = TTI->getVectorTargetTransformInfo();
+    // Use the cost model.
+    LoopVectorizationCostModel CM(L, SE, &LVL, VTTI);
+    unsigned VF = CM.selectVectorizationFactor(OptForSize,
+                                                 VectorizationFactor);
+
+    if (VF == 1) {
+      DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
+      return false;
     }
 
     DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<<
@@ -1886,7 +1883,48 @@ bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
 }
 
 unsigned
-LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) {
+LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
+                                                        unsigned UserVF) {
+  if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
+    DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n");
+    return 1;
+  }
+
+  // Find the trip count.
+  unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch());
+  DEBUG(dbgs() << "LV: Found trip count:"<<TC<<"\n");
+
+  unsigned VF = MaxVectorSize;
+
+  // If we optimize the program for size, avoid creating the tail loop.
+  if (OptForSize) {
+    // If we are unable to calculate the trip count then don't try to vectorize.
+    if (TC < 2) {
+      DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
+      return 1;
+    }
+
+    // Find the maximum SIMD width that can fit within the trip count.
+    VF = TC % MaxVectorSize;
+
+    if (VF == 0)
+      VF = MaxVectorSize;
+
+    // If the trip count that we found modulo the vectorization factor is not
+    // zero then we require a tail.
+    if (VF < 2) {
+      DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
+      return 1;
+    }
+  }
+
+  if (UserVF != 0) {
+    assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
+    DEBUG(dbgs() << "LV: Using user VF "<<UserVF<<".\n");
+
+    return UserVF;
+  }
+
   if (!VTTI) {
     DEBUG(dbgs() << "LV: No vector target information. Not vectorizing. \n");
     return 1;
@@ -2121,8 +2159,8 @@ INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
 
 namespace llvm {
-  Pass *createLoopVectorizePass() {
-    return new LoopVectorize();
+  Pass *createLoopVectorizePass(bool OptForSize = false) {
+    return new LoopVectorize(OptForSize);
   }
 }
 
-- 
cgit v1.1


From 6c645a35ccc658b20506d3137890d9003edd3527 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Wed, 12 Dec 2012 01:31:10 +0000
Subject: fix a typo.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169953 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index d796716..5c5e9af 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1859,7 +1859,7 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) {
     if (it->mayReadFromMemory() || it->mayWriteToMemory() || it->mayThrow())
       return false;
 
-    // The isntructions below can trap.
+    // The instructions below can trap.
     switch (it->getOpcode()) {
     default: continue;
     case Instruction::UDiv:
-- 
cgit v1.1


From db65ff39faee4cab994d3b7ece1a31dfd9343818 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Wed, 12 Dec 2012 01:33:47 +0000
Subject: Fix the ascii drawing that was ruined when I split the H and CPP

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169955 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5c5e9af..da073c5 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -404,27 +404,27 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
    the vectorized instructions while the old loop will continue to run the
    scalar remainder.
 
-   [ ] <-- vector loop bypass.
-   /  |
-   /   v
+       [ ] <-- vector loop bypass.
+     /  |
+    /   v
    |   [ ]     <-- vector pre header.
    |    |
    |    v
    |   [  ] \
    |   [  ]_|   <-- vector loop.
    |    |
-   \   v
-   >[ ]   <--- middle-block.
-   /  |
-   /   v
+    \   v
+      >[ ]   <--- middle-block.
+     /  |
+    /   v
    |   [ ]     <--- new preheader.
    |    |
    |    v
    |   [ ] \
    |   [ ]_|   <-- old scalar loop to handle remainder.
-   \   |
-   \  v
-   >[ ]     <-- exit block.
+    \   |
+     \  v
+      >[ ]     <-- exit block.
    ...
    */
 
-- 
cgit v1.1


From ae3b652f5cc19d83b6466d4fa70a7d1c7fb6d06c Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Wed, 12 Dec 2012 19:29:45 +0000
Subject: LoopVectorizer: Use the "optsize" attribute to decide if we are
 allowed to increase the function size.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170004 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index da073c5..749b664 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -53,10 +53,8 @@ namespace {
 struct LoopVectorize : public LoopPass {
   /// Pass identification, replacement for typeid
   static char ID;
-  /// Optimize for size. Do not generate tail loops.
-  bool OptForSize;
 
-  explicit LoopVectorize(bool OptSz = false) : LoopPass(ID), OptForSize(OptSz) {
+  explicit LoopVectorize() : LoopPass(ID) {
     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
   }
 
@@ -93,8 +91,15 @@ struct LoopVectorize : public LoopPass {
       VTTI = TTI->getVectorTargetTransformInfo();
     // Use the cost model.
     LoopVectorizationCostModel CM(L, SE, &LVL, VTTI);
+
+    // Check the function attribues to find out if this function should be
+    // optimized for size.
+    Function *F = L->getHeader()->getParent();
+    bool OptForSize =
+    F->getFnAttributes().hasAttribute(Attributes::OptimizeForSize);
+
     unsigned VF = CM.selectVectorizationFactor(OptForSize,
-                                                 VectorizationFactor);
+                                               VectorizationFactor);
 
     if (VF == 1) {
       DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
@@ -2159,8 +2164,8 @@ INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
 
 namespace llvm {
-  Pass *createLoopVectorizePass(bool OptForSize = false) {
-    return new LoopVectorize(OptForSize);
+  Pass *createLoopVectorizePass() {
+    return new LoopVectorize();
   }
 }
 
-- 
cgit v1.1


From d0b144c04c10c62ad51d0e171680d9ce95af4078 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Wed, 12 Dec 2012 19:39:36 +0000
Subject: Fix indentation.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170005 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 749b664..cdd130f 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -95,11 +95,10 @@ struct LoopVectorize : public LoopPass {
     // Check the function attribues to find out if this function should be
     // optimized for size.
     Function *F = L->getHeader()->getParent();
-    bool OptForSize =
-    F->getFnAttributes().hasAttribute(Attributes::OptimizeForSize);
+    Attributes::AttrVal SzAttr= Attributes::OptimizeForSize;
+    bool OptForSize = F->getFnAttributes().hasAttribute(SzAttr);
 
-    unsigned VF = CM.selectVectorizationFactor(OptForSize,
-                                               VectorizationFactor);
+    unsigned VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor);
 
     if (VF == 1) {
       DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
-- 
cgit v1.1


From 807dad62a0d4f1a1bbcb76fdc447634f76308252 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Thu, 13 Dec 2012 00:21:03 +0000
Subject: Teach the cost model about the optimization in r169904: Truncation of
 induction variables costs the same as scalar trunc.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170051 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index cdd130f..475bea1 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -106,8 +106,7 @@ struct LoopVectorize : public LoopPass {
     }
 
     DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<<
-          L->getHeader()->getParent()->getParent()->getModuleIdentifier()<<
-          "\n");
+          F->getParent()->getModuleIdentifier()<<"\n");
 
     // If we decided that it is *legal* to vectorizer the loop then do it.
     InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF);
@@ -1849,6 +1848,15 @@ LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
   return NoInduction;
 }
 
+bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
+  Value *In0 = const_cast<Value*>(V);
+  PHINode *PN = dyn_cast_or_null<PHINode>(In0);
+  if (!PN)
+    return false;
+
+  return Inductions.count(PN);
+}
+
 bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB)  {
   assert(TheLoop->contains(BB) && "Unknown block used");
 
@@ -2110,6 +2118,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   case Instruction::Trunc:
   case Instruction::FPTrunc:
   case Instruction::BitCast: {
+    // We optimize the truncation of induction variable.
+    // The cost of these is the same as the scalar operation.
+    if (I->getOpcode() == Instruction::Trunc &&
+        Legal->isInductionVariable(I->getOperand(0)))
+         return VTTI->getCastInstrCost(I->getOpcode(), I->getType(),
+                                       I->getOperand(0)->getType());
+
     Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
     return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
   }
-- 
cgit v1.1


From 9ad73e93a51c473a90e010a489a9c7c221112030 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Thu, 13 Dec 2012 23:11:54 +0000
Subject: Enable the Loop Vectorizer by default for O2 and O3. Disable
 if-conversion by default. I plan to revert this patch later today.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170157 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 475bea1..d143f91 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -44,7 +44,7 @@ VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
                     cl::desc("Sets the SIMD width. Zero is autoselect."));
 
 static cl::opt<bool>
-EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
+EnableIfConversion("enable-if-conversion", cl::init(false), cl::Hidden,
                    cl::desc("Enable if-conversion during vectorization."));
 
 namespace {
-- 
cgit v1.1


From 0ef0e2e6d0a45cdbc792eee9d76f0a4b7cda5c8f Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <benny.kra@googlemail.com>
Date: Tue, 18 Dec 2012 18:40:20 +0000
Subject: LoopVectorize: Emit reductions as log2(vectorsize) shuffles + vector
 ops instead of scalar operations.

For example on x86 with SSE4.2 a <8 x i8> add reduction becomes
	movdqa	%xmm0, %xmm1
	movhlps	%xmm1, %xmm1            ## xmm1 = xmm1[1,1]
	paddw	%xmm0, %xmm1
	pshufd	$1, %xmm1, %xmm0        ## xmm0 = xmm1[1,0,0,0]
	paddw	%xmm1, %xmm0
	phaddw	%xmm0, %xmm0
	pextrb	$0, %xmm0, %edx

instead of
	pextrb	$2, %xmm0, %esi
	pextrb	$0, %xmm0, %edx
	addb	%sil, %dl
	pextrb	$4, %xmm0, %esi
	addb	%dl, %sil
	pextrb	$6, %xmm0, %edx
	addb	%sil, %dl
	pextrb	$8, %xmm0, %esi
	addb	%dl, %sil
	pextrb	$10, %xmm0, %edi
	pextrb	$14, %xmm0, %edx
	addb	%sil, %dil
	pextrb	$12, %xmm0, %esi
	addb	%dil, %sil
	addb	%sil, %dl

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170439 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 43 +++++++++++++++++++++---------
 1 file changed, 31 insertions(+), 12 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index d143f91..e3c76bb 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -817,34 +817,53 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     NewPhi->addIncoming(VectorStart, LoopBypassBlock);
     NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody);
 
-    // Extract the first scalar.
-    Value *Scalar0 =
-    Builder.CreateExtractElement(NewPhi, Builder.getInt32(0));
-    // Extract and reduce the remaining vector elements.
-    for (unsigned i=1; i < VF; ++i) {
-      Value *Scalar1 =
-      Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
+    // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
+    // and vector ops, reducing the set of values being computed by half each
+    // round.
+    assert(isPowerOf2_32(VF) &&
+           "Reduction emission only supported for pow2 vectors!");
+    Value *TmpVec = NewPhi;
+    SmallVector<Constant*, 32> ShuffleMask(VF, 0);
+    for (unsigned i = VF; i != 1; i >>= 1) {
+      // Move the upper half of the vector to the lower half.
+      for (unsigned j = 0; j != i/2; ++j)
+        ShuffleMask[j] = Builder.getInt32(i/2 + j);
+
+      // Fill the rest of the mask with undef.
+      std::fill(&ShuffleMask[i/2], ShuffleMask.end(),
+                UndefValue::get(Builder.getInt32Ty()));
+
+      Value *Shuf =
+        Builder.CreateShuffleVector(TmpVec,
+                                    UndefValue::get(TmpVec->getType()),
+                                    ConstantVector::get(ShuffleMask),
+                                    "rdx.shuf");
+
+      // Emit the operation on the shuffled value.
       switch (RdxDesc.Kind) {
       case LoopVectorizationLegality::IntegerAdd:
-        Scalar0 = Builder.CreateAdd(Scalar0, Scalar1, "add.rdx");
+        TmpVec = Builder.CreateAdd(TmpVec, Shuf, "add.rdx");
         break;
       case LoopVectorizationLegality::IntegerMult:
-        Scalar0 = Builder.CreateMul(Scalar0, Scalar1, "mul.rdx");
+        TmpVec = Builder.CreateMul(TmpVec, Shuf, "mul.rdx");
         break;
       case LoopVectorizationLegality::IntegerOr:
-        Scalar0 = Builder.CreateOr(Scalar0, Scalar1, "or.rdx");
+        TmpVec = Builder.CreateOr(TmpVec, Shuf, "or.rdx");
         break;
       case LoopVectorizationLegality::IntegerAnd:
-        Scalar0 = Builder.CreateAnd(Scalar0, Scalar1, "and.rdx");
+        TmpVec = Builder.CreateAnd(TmpVec, Shuf, "and.rdx");
         break;
       case LoopVectorizationLegality::IntegerXor:
-        Scalar0 = Builder.CreateXor(Scalar0, Scalar1, "xor.rdx");
+        TmpVec = Builder.CreateXor(TmpVec, Shuf, "xor.rdx");
         break;
       default:
         llvm_unreachable("Unknown reduction operation");
       }
     }
 
+    // The result is in the first element of the vector.
+    Value *Scalar0 = Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+
     // Now, we need to fix the users of the reduction variable
     // inside and outside of the scalar remainder loop.
     // We know that the loop is in LCSSA form. We need to update the
-- 
cgit v1.1


From 034b94b17006f51722886b0f2283fb6fb19aca1f Mon Sep 17 00:00:00 2001
From: Bill Wendling <isanbard@gmail.com>
Date: Wed, 19 Dec 2012 07:18:57 +0000
Subject: Rename the 'Attributes' class to 'Attribute'. It's going to represent
 a single attribute in the future.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170502 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index e3c76bb..ee544b3 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -95,7 +95,7 @@ struct LoopVectorize : public LoopPass {
     // Check the function attribues to find out if this function should be
     // optimized for size.
     Function *F = L->getHeader()->getParent();
-    Attributes::AttrVal SzAttr= Attributes::OptimizeForSize;
+    Attribute::AttrVal SzAttr= Attribute::OptimizeForSize;
     bool OptForSize = F->getFnAttributes().hasAttribute(SzAttr);
 
     unsigned VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor);
-- 
cgit v1.1


From d3071bb042303ed239e5c770fd4026bfc339fcd0 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Thu, 20 Dec 2012 02:00:02 +0000
Subject: Loop Vectorizer: Enable if-conversion.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170632 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index ee544b3..ddb6f84 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -44,7 +44,7 @@ VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
                     cl::desc("Sets the SIMD width. Zero is autoselect."));
 
 static cl::opt<bool>
-EnableIfConversion("enable-if-conversion", cl::init(false), cl::Hidden,
+EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                    cl::desc("Enable if-conversion during vectorization."));
 
 namespace {
-- 
cgit v1.1


From d5d46ace89d2b7ec1a54610925bc0f13bb394335 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Thu, 20 Dec 2012 17:42:53 +0000
Subject: Loop Vectorizer: turn-off if-conversion.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170708 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index ddb6f84..ee544b3 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -44,7 +44,7 @@ VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
                     cl::desc("Sets the SIMD width. Zero is autoselect."));
 
 static cl::opt<bool>
-EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
+EnableIfConversion("enable-if-conversion", cl::init(false), cl::Hidden,
                    cl::desc("Enable if-conversion during vectorization."));
 
 namespace {
-- 
cgit v1.1


From 8386acd7348003d13a2db3c4dd3274653d8ffe10 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Thu, 20 Dec 2012 20:24:40 +0000
Subject: LoopVectorize: Fix a bug in the scalarization of instructions. Before
 if-conversion we could tell whether a value is loop invariant by checking
 whether it was defined inside the loop's basic block. Now that loops have
 multiple blocks this check is incorrect.

This fixes External/SPEC/CINT95/099_go/099_go



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170756 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index ee544b3..827c13f 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -284,7 +284,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
 
     // If the src is an instruction that appeared earlier in the basic block
     // then it should already be vectorized.
-    if (SrcInst && SrcInst->getParent() == Instr->getParent()) {
+    if (SrcInst && OrigLoop->contains(SrcInst)) {
       assert(WidenMap.count(SrcInst) && "Source operand is unavailable");
       // The parameter is a vector value from earlier.
       Params.push_back(WidenMap[SrcInst]);
-- 
cgit v1.1


From 55306bdea5d2d53be39f3ac59fadf5220ee6b5d0 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Fri, 21 Dec 2012 00:07:35 +0000
Subject: Fix a bug in the code that checks if we can vectorize loops while
 using dynamic memory bound checks.  Before the fix we incorrectly vectorized
 this loop from the Livermore Loops benchmark, even though each iteration
 reads the value written by the previous one:

for ( k=1 ; k<n ; k++ )
  x[k] = x[k-1] + y[k];



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170811 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 43 +++++++++++++++++-------------
 1 file changed, 24 insertions(+), 19 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 827c13f..4a90d78 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1593,8 +1593,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
 
   ValueVector::iterator I, IE;
   for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
-    StoreInst *ST = dyn_cast<StoreInst>(*I);
-    assert(ST && "Bad StoreInst");
+    StoreInst *ST = cast<StoreInst>(*I);
     Value* Ptr = ST->getPointerOperand();
 
     if (isUniform(Ptr)) {
@@ -1609,8 +1608,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
   }
 
   for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
-    LoadInst *LD = dyn_cast<LoadInst>(*I);
-    assert(LD && "Bad LoadInst");
+    LoadInst *LD = cast<LoadInst>(*I);
     Value* Ptr = LD->getPointerOperand();
     // If we did *not* see this pointer before, insert it to the
     // read list. If we *did* see it before, then it is already in
@@ -1633,13 +1631,13 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
 
   // Find pointers with computable bounds. We are going to use this information
   // to place a runtime bound check.
-  bool RT = true;
+  bool CanDoRT = true;
   for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I)
     if (hasComputableBounds(*I)) {
       PtrRtCheck.insert(SE, TheLoop, *I);
       DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
     } else {
-      RT = false;
+      CanDoRT = false;
       break;
     }
   for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I)
@@ -1647,23 +1645,23 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
       PtrRtCheck.insert(SE, TheLoop, *I);
       DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
     } else {
-      RT = false;
+      CanDoRT = false;
       break;
     }
 
   // Check that we did not collect too many pointers or found a
   // unsizeable pointer.
-  if (!RT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) {
+  if (!CanDoRT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) {
     PtrRtCheck.reset();
-    RT = false;
+    CanDoRT = false;
   }
 
-  PtrRtCheck.Need = RT;
-
-  if (RT) {
+  if (CanDoRT) {
     DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
   }
 
+  bool NeedRTCheck = false;
+
   // Now that the pointers are in two lists (Reads and ReadWrites), we
   // can check that there are no conflicts between each of the writes and
   // between the writes to the reads.
@@ -1678,12 +1676,12 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
          it != e; ++it) {
       if (!isIdentifiedObject(*it)) {
         DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
-        return RT;
+        NeedRTCheck = true;
       }
       if (!WriteObjects.insert(*it)) {
         DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
               << **it <<"\n");
-        return RT;
+        return false;
       }
     }
     TempObjects.clear();
@@ -1696,20 +1694,27 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
          it != e; ++it) {
       if (!isIdentifiedObject(*it)) {
         DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
-        return RT;
+        NeedRTCheck = true;
       }
       if (WriteObjects.count(*it)) {
         DEBUG(dbgs() << "LV: Found a possible read/write reorder:"
               << **it <<"\n");
-        return RT;
+        return false;
       }
     }
     TempObjects.clear();
   }
 
-  // It is safe to vectorize and we don't need any runtime checks.
-  DEBUG(dbgs() << "LV: We don't need a runtime memory check.\n");
-  PtrRtCheck.reset();
+  PtrRtCheck.Need = NeedRTCheck;
+  if (NeedRTCheck && !CanDoRT) {
+    DEBUG(dbgs() << "LV: We can't vectorize because we can't find " <<
+          "the array bounds.\n");
+    PtrRtCheck.reset();
+    return false;
+  }
+
+  DEBUG(dbgs() << "LV: We "<< (NeedRTCheck ? "" : "don't") <<
+        " need a runtime memory check.\n");
   return true;
 }
 
-- 
cgit v1.1


From ebf395d39ee81d1635783688dbc49ff86c24e7e6 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Fri, 21 Dec 2012 04:47:54 +0000
Subject: Enable if-conversion.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170841 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4a90d78..f533235 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -44,7 +44,7 @@ VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
                     cl::desc("Sets the SIMD width. Zero is autoselect."));
 
 static cl::opt<bool>
-EnableIfConversion("enable-if-conversion", cl::init(false), cl::Hidden,
+EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                    cl::desc("Enable if-conversion during vectorization."));
 
 namespace {
-- 
cgit v1.1


From 38b06020dbd804f01ee3802779a52c05cffdf87d Mon Sep 17 00:00:00 2001
From: Roman Divacky <rdivacky@freebsd.org>
Date: Fri, 21 Dec 2012 17:06:44 +0000
Subject: Remove duplicate includes.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170902 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 1 -
 1 file changed, 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index f533235..6f8c65a 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -15,7 +15,6 @@
 #include "llvm/Analysis/LoopIterator.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/Verifier.h"
-- 
cgit v1.1


From 629fb82419d9bfff6ae475363bcce66192dfcc8e Mon Sep 17 00:00:00 2001
From: Bill Wendling <isanbard@gmail.com>
Date: Sat, 22 Dec 2012 00:37:52 +0000
Subject: Change 'AttrVal' to 'AttrKind' to better reflect that it's a kind of
 attribute instead of the value of the attribute.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170972 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6f8c65a..f5ff79c 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -94,7 +94,7 @@ struct LoopVectorize : public LoopPass {
     // Check the function attribues to find out if this function should be
     // optimized for size.
     Function *F = L->getHeader()->getParent();
-    Attribute::AttrVal SzAttr= Attribute::OptimizeForSize;
+    Attribute::AttrKind SzAttr= Attribute::OptimizeForSize;
     bool OptForSize = F->getFnAttributes().hasAttribute(SzAttr);
 
     unsigned VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor);
-- 
cgit v1.1


From d54fed27865dcbc69932e1e6c372bb5a932e662a Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sun, 23 Dec 2012 07:23:55 +0000
Subject: Loop Vectorizer: Update the cost model of scatter/gather operations
 and make them more expensive.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170995 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 42 +++++++++++++++++++++---------
 1 file changed, 29 insertions(+), 13 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index f5ff79c..5b1db0b 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2080,17 +2080,23 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     VectorTy = ToVectorTy(ValTy, VF);
 
     if (VF == 1)
-      return VTTI->getMemoryOpCost(I->getOpcode(), ValTy,
+      return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy,
                                    SI->getAlignment(),
                                    SI->getPointerAddressSpace());
 
     // Scalarized stores.
     if (!Legal->isConsecutivePtr(SI->getPointerOperand())) {
       unsigned Cost = 0;
-      unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
-                                            ValTy);
-      // The cost of extracting from the value vector.
-      Cost += VF * (ExtCost);
+
+      // The cost of extracting from the value vector and pointer vector.
+      Type *PtrTy = ToVectorTy(I->getOperand(0)->getType(), VF);
+      for (unsigned i = 0; i < VF; ++i) {
+        Cost += VTTI->getVectorInstrCost(Instruction::ExtractElement,
+                                         VectorTy, i);
+        Cost += VTTI->getVectorInstrCost(Instruction::ExtractElement,
+                                         PtrTy, i);
+      }
+
       // The cost of the scalar stores.
       Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
                                          ValTy->getScalarType(),
@@ -2107,16 +2113,25 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     LoadInst *LI = cast<LoadInst>(I);
 
     if (VF == 1)
-      return VTTI->getMemoryOpCost(I->getOpcode(), RetTy,
+      return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy,
                                    LI->getAlignment(),
                                    LI->getPointerAddressSpace());
 
     // Scalarized loads.
     if (!Legal->isConsecutivePtr(LI->getPointerOperand())) {
       unsigned Cost = 0;
-      unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy);
-      // The cost of inserting the loaded value into the result vector.
-      Cost += VF * (InCost);
+      Type *PtrTy = ToVectorTy(I->getOperand(0)->getType(), VF);
+
+      // The cost of extracting from the pointer vector.
+      for (unsigned i = 0; i < VF; ++i)
+        Cost += VTTI->getVectorInstrCost(Instruction::ExtractElement,
+                                         PtrTy, i);
+
+      // The cost of inserting data to the result vector.
+      for (unsigned i = 0; i < VF; ++i)
+        Cost += VTTI->getVectorInstrCost(Instruction::InsertElement,
+                                         VectorTy, i);
+
       // The cost of the scalar stores.
       Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
                                          RetTy->getScalarType(),
@@ -2169,18 +2184,19 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     bool IsVoid = RetTy->isVoidTy();
 
     unsigned InsCost = (IsVoid ? 0 :
-                        VTTI->getInstrCost(Instruction::InsertElement,
+                        VTTI->getVectorInstrCost(Instruction::InsertElement,
                                            VectorTy));
 
-    unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
+    unsigned ExtCost = VTTI->getVectorInstrCost(Instruction::ExtractElement,
                                           VectorTy);
 
     // The cost of inserting the results plus extracting each one of the
     // operands.
     Cost += VF * (InsCost + ExtCost * I->getNumOperands());
 
-    // The cost of executing VF copies of the scalar instruction.
-    Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy);
+    // The cost of executing VF copies of the scalar instruction. This opcode
+    // is unknown. Assume that it is the same as 'mul'.
+    Cost += VF * VTTI->getArithmeticInstrCost(Instruction::Mul, VectorTy);
     return Cost;
   }
   }// end of switch.
-- 
cgit v1.1


From 417872ed08128e8885b4c7fbaeb2d735c150ea57 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <benny.kra@googlemail.com>
Date: Sun, 23 Dec 2012 13:19:18 +0000
Subject: LoopVectorize: For scalars and void types there is no need to compute
 vector insert/extract costs.

Fixes an assert during the build of oggenc in the test suite.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171000 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5b1db0b..ddb7f26 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2181,18 +2181,16 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     // elements, times the vector width.
     unsigned Cost = 0;
 
-    bool IsVoid = RetTy->isVoidTy();
-
-    unsigned InsCost = (IsVoid ? 0 :
-                        VTTI->getVectorInstrCost(Instruction::InsertElement,
-                                           VectorTy));
-
-    unsigned ExtCost = VTTI->getVectorInstrCost(Instruction::ExtractElement,
-                                          VectorTy);
-
-    // The cost of inserting the results plus extracting each one of the
-    // operands.
-    Cost += VF * (InsCost + ExtCost * I->getNumOperands());
+    if (RetTy->isVoidTy() || VF != 1) {
+      unsigned InsCost = VTTI->getVectorInstrCost(Instruction::InsertElement,
+                                                  VectorTy);
+      unsigned ExtCost = VTTI->getVectorInstrCost(Instruction::ExtractElement,
+                                                  VectorTy);
+
+      // The cost of inserting the results plus extracting each one of the
+      // operands.
+      Cost += VF * (InsCost + ExtCost * I->getNumOperands());
+    }
 
     // The cost of executing VF copies of the scalar instruction. This opcode
     // is unknown. Assume that it is the same as 'mul'.
-- 
cgit v1.1


From a1acf55738a9d6c0376c7d36b27897817685e157 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <benny.kra@googlemail.com>
Date: Sun, 23 Dec 2012 13:21:41 +0000
Subject: LoopVectorize: Fix accidentally inverted condition.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171001 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index ddb7f26..1d78fac 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2181,7 +2181,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     // elements, times the vector width.
     unsigned Cost = 0;
 
-    if (RetTy->isVoidTy() || VF != 1) {
+    if (!RetTy->isVoidTy() && VF != 1) {
       unsigned InsCost = VTTI->getVectorInstrCost(Instruction::InsertElement,
                                                   VectorTy);
       unsigned ExtCost = VTTI->getVectorInstrCost(Instruction::ExtractElement,
-- 
cgit v1.1


From 470ea9b72f87f2ce4bb09fc6f9829211a090652a Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Mon, 24 Dec 2012 01:22:06 +0000
Subject: LoopVectorizer: Fix an endless loop in the code that looks for
 reductions. The bug was in the code that detects PHIs in an if-then-else
 block sequence.

PR14701.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171008 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1d78fac..20bcf86 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1737,10 +1737,9 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
   Instruction *ExitInstruction = 0;
 
   // Iter is our iterator. We start with the PHI node and scan for all of the
-  // users of this instruction. All users must be instructions which can be
+  // users of this instruction. All users must be instructions that can be
   // used as reduction variables (such as ADD). We may have a single
-  // out-of-block user. They cycle must end with the original PHI.
-  // Also, we can't have multiple block-local users.
+  // out-of-block user. The cycle must end with the original PHI.
   Instruction *Iter = Phi;
   while (true) {
     // If the instruction has no users then this is a broken
@@ -1752,9 +1751,9 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
     if (!isReductionInstr(Iter, Kind))
       return false;
 
-    // Did we find a user inside this block ?
+    // Did we find a user inside this loop already ?
     bool FoundInBlockUser = false;
-    // Did we reach the initial PHI node ?
+    // Did we reach the initial PHI node already ?
     bool FoundStartPHI = false;
 
     // For each of the *users* of iter.
@@ -1779,8 +1778,10 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
       // We allow in-loop PHINodes which are not the original reduction PHI
       // node. If this PHI is the only user of Iter (happens in IF w/ no ELSE
       // structure) then don't skip this PHI.
-      if (isa<PHINode>(U) && U->getParent() != TheLoop->getHeader() &&
-          TheLoop->contains(U) && Iter->getNumUses() > 1)
+      if (isa<PHINode>(Iter) && isa<PHINode>(U) &&
+          U->getParent() != TheLoop->getHeader() &&
+          TheLoop->contains(U) &&
+          Iter->getNumUses() > 1)
         continue;
 
       // We can't have multiple inside users.
-- 
cgit v1.1


From 9e5329d77e590f757dbd8384f418e44df9dbf91a Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Mon, 24 Dec 2012 09:14:18 +0000
Subject: LoopVectorizer: When checking for vectorizable types, also check the
 StoreInst operands.

PR14705.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171023 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 20bcf86..d571903 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1464,13 +1464,20 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         return false;
       }
 
-      // We do not re-vectorize vectors.
+      // Check that the instruction return type is vectorizable.
       if (!VectorType::isValidElementType(it->getType()) &&
           !it->getType()->isVoidTy()) {
         DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
         return false;
       }
 
+      // Check that the stored type is vectorizable.
+      if (StoreInst *ST = dyn_cast<StoreInst>(it)) {
+        Type *T = ST->getValueOperand()->getType();
+        if (!VectorType::isValidElementType(T))
+          return false;
+      }
+
       // Reduction instructions are allowed to have exit users.
       // All other instructions must not have external users.
       if (!AllowedExit.count(it))
-- 
cgit v1.1


From 1d59f5fa53cac23b6debc1d7214451c65b0399a7 Mon Sep 17 00:00:00 2001
From: Hal Finkel <hfinkel@anl.gov>
Date: Tue, 25 Dec 2012 23:21:29 +0000
Subject: LoopVectorize: Enable vectorization of the fmuladd intrinsic

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171076 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 1 +
 1 file changed, 1 insertion(+)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index d571903..b8b934a 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -711,6 +711,7 @@ isTriviallyVectorizableIntrinsic(Instruction *Inst) {
   case Intrinsic::nearbyint:
   case Intrinsic::pow:
   case Intrinsic::fma:
+  case Intrinsic::fmuladd:
     return true;
   default:
     return false;
-- 
cgit v1.1


From 13eb1e7817be11ea84be6571dce827a77bc9640b Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Wed, 26 Dec 2012 19:08:17 +0000
Subject: LoopVectorizer: Optimize the vectorization of consecutive memory
 access when the iteration step is -1

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171114 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 85 ++++++++++++++++++++++--------
 1 file changed, 63 insertions(+), 22 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index b8b934a..d64295c 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -202,7 +202,7 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, bool Negate) {
   return Builder.CreateAdd(Val, Cv, "induction");
 }
 
-bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
+int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
   assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr");
 
   // If this value is a pointer induction variable we know it is consecutive.
@@ -210,12 +210,12 @@ bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
   if (Phi && Inductions.count(Phi)) {
     InductionInfo II = Inductions[Phi];
     if (PtrInduction == II.IK)
-      return true;
+      return 1;
   }
 
   GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
   if (!Gep)
-    return false;
+    return 0;
 
   unsigned NumOperands = Gep->getNumOperands();
   Value *LastIndex = Gep->getOperand(NumOperands - 1);
@@ -223,7 +223,7 @@ bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
   // Check that all of the gep indices are uniform except for the last.
   for (unsigned i = 0; i < NumOperands - 1; ++i)
     if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
-      return false;
+      return 0;
 
   // We can emit wide load/stores only if the last index is the induction
   // variable.
@@ -234,10 +234,12 @@ bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
     // The memory is consecutive because the last index is consecutive
     // and all other indices are loop invariant.
     if (Step->isOne())
-      return true;
+      return 1;
+    if (Step->isAllOnesValue())
+      return -1;
   }
 
-  return false;
+  return 0;
 }
 
 bool LoopVectorizationLegality::isUniform(Value *V) {
@@ -263,6 +265,17 @@ InnerLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
   return ConstantVector::getSplat(VF, ConstantInt::get(ScalarTy, Val, true));
 }
 
+Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
+  assert(Vec->getType()->isVectorTy() && "Invalid type");
+  SmallVector<Constant*, 8> ShuffleMask;
+  for (unsigned i = 0; i < VF; ++i)
+    ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
+
+  return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
+                                     ConstantVector::get(ShuffleMask),
+                                     "reverse");
+}
+
 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
   // Holds vector parameters or scalars, in case of uniform vals.
@@ -941,8 +954,7 @@ Value *InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
 void
 InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
                                           BasicBlock *BB, PhiVector *PV) {
-  Constant *Zero =
-  ConstantInt::get(IntegerType::getInt32Ty(BB->getContext()), 0);
+  Constant *Zero = Builder.getInt32(0);
 
   // For each instruction in the old loop.
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
@@ -1142,14 +1154,15 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
       assert(!Legal->isUniform(Ptr) &&
              "We do not allow storing to uniform addresses");
 
-      GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
 
-      // This store does not use GEPs.
-      if (!Legal->isConsecutivePtr(Ptr)) {
+      int Stride = Legal->isConsecutivePtr(Ptr);
+      bool Reverse = Stride < 0;
+      if (Stride == 0) {
         scalarizeInstruction(it);
         break;
       }
 
+      GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
       if (Gep) {
         // The last index does not have to be the induction. It can be
         // consecutive and be a function of the index. For example A[I+1];
@@ -1166,8 +1179,16 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
         assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
         Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
       }
+
+      // If the address is consecutive but reversed, then the
+      // wide store needs to start at the last vector element.
+      if (Reverse)
+        Ptr = Builder.CreateGEP(Ptr, Builder.getInt32(1 - VF));
+
       Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
       Value *Val = getVectorValue(SI->getValueOperand());
+      if (Reverse)
+        Val = reverseVector(Val);
       Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
       break;
     }
@@ -1177,16 +1198,17 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
       Type *RetTy = VectorType::get(LI->getType(), VF);
       Value *Ptr = LI->getPointerOperand();
       unsigned Alignment = LI->getAlignment();
-      GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
 
       // If the pointer is loop invariant or if it is non consecutive,
       // scalarize the load.
-      bool Con = Legal->isConsecutivePtr(Ptr);
-      if (Legal->isUniform(Ptr) || !Con) {
+      int Stride = Legal->isConsecutivePtr(Ptr);
+      bool Reverse = Stride < 0;
+      if (Legal->isUniform(Ptr) || Stride == 0) {
         scalarizeInstruction(it);
         break;
       }
 
+      GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
       if (Gep) {
         // The last index does not have to be the induction. It can be
         // consecutive and be a function of the index. For example A[I+1];
@@ -1203,12 +1225,17 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
         assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
         Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
       }
+      // If the address is consecutive but reversed, then the
+      // wide load needs to start at the last vector element.
+      if (Reverse)
+        Ptr = Builder.CreateGEP(Ptr, Builder.getInt32(1 - VF));
 
       Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
       LI = Builder.CreateLoad(Ptr);
       LI->setAlignment(Alignment);
+
       // Use this vector value for all users of the load.
-      WidenMap[it] = LI;
+      WidenMap[it] = Reverse ? reverseVector(LI) :  LI;
       break;
     }
     case Instruction::ZExt:
@@ -1625,7 +1652,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     // If the address of i is unknown (for example A[B[i]]) then we may
     // read a few words, modify, and write a few words, and some of the
     // words may be written to the same address.
-    if (Seen.insert(Ptr) || !isConsecutivePtr(Ptr))
+    if (Seen.insert(Ptr) || 0 == isConsecutivePtr(Ptr))
       Reads.push_back(Ptr);
   }
 
@@ -2094,7 +2121,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
                                    SI->getPointerAddressSpace());
 
     // Scalarized stores.
-    if (!Legal->isConsecutivePtr(SI->getPointerOperand())) {
+    int Stride = Legal->isConsecutivePtr(SI->getPointerOperand());
+    bool Reverse = Stride < 0;
+    if (0 == Stride) {
       unsigned Cost = 0;
 
       // The cost of extracting from the value vector and pointer vector.
@@ -2115,8 +2144,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     }
 
     // Wide stores.
-    return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(),
-                                 SI->getPointerAddressSpace());
+    unsigned Cost = VTTI->getMemoryOpCost(I->getOpcode(), VectorTy,
+                                          SI->getAlignment(),
+                                          SI->getPointerAddressSpace());
+    if (Reverse)
+      Cost += VTTI->getShuffleCost(VectorTargetTransformInfo::Reverse,
+                                   VectorTy, 0);
+    return Cost;
   }
   case Instruction::Load: {
     LoadInst *LI = cast<LoadInst>(I);
@@ -2127,7 +2161,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
                                    LI->getPointerAddressSpace());
 
     // Scalarized loads.
-    if (!Legal->isConsecutivePtr(LI->getPointerOperand())) {
+    int Stride = Legal->isConsecutivePtr(LI->getPointerOperand());
+    bool Reverse = Stride < 0;
+    if (0 == Stride) {
       unsigned Cost = 0;
       Type *PtrTy = ToVectorTy(I->getOperand(0)->getType(), VF);
 
@@ -2150,8 +2186,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     }
 
     // Wide loads.
-    return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(),
-                                 LI->getPointerAddressSpace());
+    unsigned Cost = VTTI->getMemoryOpCost(I->getOpcode(), VectorTy,
+                                          LI->getAlignment(),
+                                          LI->getPointerAddressSpace());
+    if (Reverse)
+      Cost += VTTI->getShuffleCost(VectorTargetTransformInfo::Reverse,
+                                   VectorTy, 0);
+    return Cost;
   }
   case Instruction::ZExt:
   case Instruction::SExt:
-- 
cgit v1.1


From 5dd839430c1dbce6cd35dc44f68718a1fc69bfba Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Wed, 26 Dec 2012 23:30:53 +0000
Subject: If all of the write objects are identified then we can vectorize the
 loop even if the read objects are unidentified.

PR14719.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171124 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index d64295c..7fb9bba 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1704,6 +1704,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
 
   // Check that the read-writes do not conflict with other read-write
   // pointers.
+  bool AllWritesIdentified = true;
   for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) {
     GetUnderlyingObjects(*I, TempObjects, DL);
     for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
@@ -1711,6 +1712,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
       if (!isIdentifiedObject(*it)) {
         DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
         NeedRTCheck = true;
+        AllWritesIdentified = false;
       }
       if (!WriteObjects.insert(*it)) {
         DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
@@ -1726,7 +1728,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     GetUnderlyingObjects(*I, TempObjects, DL);
     for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
          it != e; ++it) {
-      if (!isIdentifiedObject(*it)) {
+      // If all of the writes are identified then we don't care if the read
+      // pointer is identified or not.
+      if (!AllWritesIdentified && !isIdentifiedObject(*it)) {
         DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
         NeedRTCheck = true;
       }
-- 
cgit v1.1


From db2367512e87dbd7b93c3250ef30c9df5e40cb43 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sun, 30 Dec 2012 07:47:00 +0000
Subject: LoopVectorizer: Fix a bug in the code that updates the loop exiting
 block. LCSSA PHIs may have undef values. The vectorizer updates values that
 are used by outside users such as PHIs. The bug happened because undefs are
 not loop values. This patch handles these PHIs.

PR14725



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171251 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7fb9bba..653c111 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -909,6 +909,19 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
     (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
   }// end of for each redux variable.
+
+  // The Loop exit block may have single value PHI nodes where the incoming
+  // value is 'undef'. While vectorizing we only handled real values that
+  // were defined inside the loop. Here we handle the 'undef case'.
+  // See PR14725.
+  for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
+       LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
+    PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
+    if (!LCSSAPhi) continue;
+    if (LCSSAPhi->getNumIncomingValues() == 1)
+      LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
+                            LoopMiddleBlock);
+  }
 }
 
 Value *InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
-- 
cgit v1.1


From 831737d329a727f53a1fb0572f7b7a8127208881 Mon Sep 17 00:00:00 2001
From: Bill Wendling <isanbard@gmail.com>
Date: Sun, 30 Dec 2012 10:32:01 +0000
Subject: Remove the Function::getFnAttributes method in favor of using the
 AttributeSet directly.

This is in preparation for removing the use of the 'Attribute' class as a
collection of attributes. That will shift to the AttributeSet class instead.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171253 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 653c111..adf9081 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -95,7 +95,8 @@ struct LoopVectorize : public LoopPass {
     // optimized for size.
     Function *F = L->getHeader()->getParent();
     Attribute::AttrKind SzAttr= Attribute::OptimizeForSize;
-    bool OptForSize = F->getFnAttributes().hasAttribute(SzAttr);
+    bool OptForSize =
+      F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, SzAttr);
 
     unsigned VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor);
 
-- 
cgit v1.1


From 6c3074958370bf25dc6e4e4b757f0c083e245dbe Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <benny.kra@googlemail.com>
Date: Tue, 1 Jan 2013 19:55:16 +0000
Subject: Add IRBuilder::CreateVectorSplat and use it to simplify code.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171349 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index adf9081..aadc134 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -150,11 +150,6 @@ LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE,
 }
 
 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
-  // Create the types.
-  LLVMContext &C = V->getContext();
-  Type *VTy = VectorType::get(V->getType(), VF);
-  Type *I32 = IntegerType::getInt32Ty(C);
-
   // Save the current insertion location.
   Instruction *Loc = Builder.GetInsertPoint();
 
@@ -167,14 +162,8 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
   if (Invariant)
     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
 
-  Constant *Zero = ConstantInt::get(I32, 0);
-  Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF));
-  Value *UndefVal = UndefValue::get(VTy);
-  // Insert the value into a new vector.
-  Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero);
   // Broadcast the scalar into all locations in the vector.
-  Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros,
-                                            "broadcast");
+  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
 
   // Restore the builder insertion point.
   if (Invariant)
-- 
cgit v1.1


From 0b8c9a80f20772c3793201ab5b251d3520b9cea3 Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Wed, 2 Jan 2013 11:36:10 +0000
Subject: Move all of the header files which are involved in modelling the LLVM
 IR into their new header subdirectory: include/llvm/IR. This matches the
 directory structure of lib, and begins to correct a long standing point of
 file layout clutter in LLVM.

There are still more header files to move here, but I wanted to handle
them in separate commits to make tracking what files make sense at each
layer easier.

The only really questionable files here are the target intrinsic
tablegen files. But that's a battle I'd rather not fight today.

I've updated both CMake and Makefile build systems (I think, and my
tests think, but I may have missed something).

I've also re-sorted the includes throughout the project. I'll be
committing updates to Clang, DragonEgg, and Polly momentarily.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171366 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index aadc134..f200452 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -18,14 +18,16 @@
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/Verifier.h"
-#include "llvm/Constants.h"
-#include "llvm/DataLayout.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Module.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -35,8 +37,6 @@
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Vectorize.h"
-#include "llvm/Type.h"
-#include "llvm/Value.h"
 
 static cl::opt<unsigned>
 VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
-- 
cgit v1.1


From 00a6bcaeb4a74b930bc2b21f8021c3358d507ecf Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Wed, 2 Jan 2013 23:54:43 +0000
Subject: Avoid vectorization when the function has the "noimplicitfloat"
 attribute.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171429 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index f200452..9b1d398 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -94,9 +94,17 @@ struct LoopVectorize : public LoopPass {
     // Check the function attribues to find out if this function should be
     // optimized for size.
     Function *F = L->getHeader()->getParent();
-    Attribute::AttrKind SzAttr= Attribute::OptimizeForSize;
-    bool OptForSize =
-      F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, SzAttr);
+    Attribute::AttrKind SzAttr = Attribute::OptimizeForSize;
+    Attribute::AttrKind FlAttr = Attribute::NoImplicitFloat;
+    unsigned FnIndex = AttributeSet::FunctionIndex;
+    bool OptForSize = F->getAttributes().hasAttribute(FnIndex, SzAttr);
+    bool NoFloat = F->getAttributes().hasAttribute(FnIndex, FlAttr);
+
+    if (NoFloat) {
+      DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
+            "attribute is used.\n");
+      return false;
+    }
 
     unsigned VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor);
 
-- 
cgit v1.1


From e4159491a7d94f87f99fb99a15c76d5d7b26851c Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Thu, 3 Jan 2013 00:52:27 +0000
Subject: LoopVectorizer: Add support for loop-unrolling during vectorization
 for increasing the ILP. At the moment this feature is disabled by default and
 this commit should not cause any functional changes.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171436 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 422 ++++++++++++++++++-----------
 1 file changed, 267 insertions(+), 155 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 9b1d398..8feea93 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -42,6 +42,11 @@ static cl::opt<unsigned>
 VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
                     cl::desc("Sets the SIMD width. Zero is autoselect."));
 
+static cl::opt<unsigned>
+VectorizationUnroll("force-vector-unroll", cl::init(1), cl::Hidden,
+                    cl::desc("Sets the vectorization unroll count. "
+                             "Zero is autoselect."));
+
 static cl::opt<bool>
 EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                    cl::desc("Enable if-conversion during vectorization."));
@@ -117,7 +122,7 @@ struct LoopVectorize : public LoopPass {
           F->getParent()->getModuleIdentifier()<<"\n");
 
     // If we decided that it is *legal* to vectorizer the loop then do it.
-    InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF);
+    InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF, VectorizationUnroll);
     LB.vectorize(&LVL);
 
     DEBUG(verifyFunction(*L->getHeader()->getParent()));
@@ -180,7 +185,8 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
   return Shuf;
 }
 
-Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, bool Negate) {
+Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, unsigned StartIdx,
+                                                 bool Negate) {
   assert(Val->getType()->isVectorTy() && "Must be a vector");
   assert(Val->getType()->getScalarType()->isIntegerTy() &&
          "Elem must be an integer");
@@ -191,8 +197,10 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, bool Negate) {
   SmallVector<Constant*, 8> Indices;
 
   // Create a vector of consecutive numbers from zero to VF.
-  for (int i = 0; i < VLen; ++i)
-    Indices.push_back(ConstantInt::get(ITy, Negate ? (-i): i ));
+  for (int i = 0; i < VLen; ++i) {
+    int Idx = Negate ? (-i): i;
+    Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx));
+  }
 
   // Add the consecutive indices to the vector value.
   Constant *Cv = ConstantVector::get(Indices);
@@ -244,18 +252,20 @@ bool LoopVectorizationLegality::isUniform(Value *V) {
   return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
 }
 
-Value *InnerLoopVectorizer::getVectorValue(Value *V) {
+InnerLoopVectorizer::VectorParts&
+InnerLoopVectorizer::getVectorValue(Value *V) {
   assert(V != Induction && "The new induction variable should not be used.");
   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
-  // If we saved a vectorized copy of V, use it.
-  Value *&MapEntry = WidenMap[V];
-  if (MapEntry)
-    return MapEntry;
 
-  // Broadcast V and save the value for future uses.
+  // If we have this scalar in the map, return it.
+  if (WidenMap.has(V))
+    return WidenMap.get(V);
+
+  // If this scalar is unknown, assume that it is a constant or that it is
+  // loop invariant. Broadcast V and save the value for future uses.
   Value *B = getBroadcastInstrs(V);
-  MapEntry = B;
-  return B;
+  WidenMap.splat(V, B);
+  return WidenMap.get(V);
 }
 
 Constant*
@@ -277,7 +287,7 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
   // Holds vector parameters or scalars, in case of uniform vals.
-  SmallVector<Value*, 8> Params;
+  SmallVector<VectorParts, 4> Params;
 
   // Find all of the vectorized parameters.
   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
@@ -295,12 +305,14 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
     // If the src is an instruction that appeared earlier in the basic block
     // then it should already be vectorized.
     if (SrcInst && OrigLoop->contains(SrcInst)) {
-      assert(WidenMap.count(SrcInst) && "Source operand is unavailable");
+      assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
       // The parameter is a vector value from earlier.
-      Params.push_back(WidenMap[SrcInst]);
+      Params.push_back(WidenMap.get(SrcInst));
     } else {
       // The parameter is a scalar from outside the loop. Maybe even a constant.
-      Params.push_back(SrcOp);
+      VectorParts Scalars;
+      Scalars.append(UF, SrcOp);
+      Params.push_back(Scalars);
     }
   }
 
@@ -309,39 +321,38 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
 
   // Does this instruction return a value ?
   bool IsVoidRetTy = Instr->getType()->isVoidTy();
-  Value *VecResults = 0;
 
-  // If we have a return value, create an empty vector. We place the scalarized
-  // instructions in this vector.
-  if (!IsVoidRetTy)
-    VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF));
+  Value *UndefVec = IsVoidRetTy ? 0 :
+    UndefValue::get(VectorType::get(Instr->getType(), VF));
+  // Create a new entry in the WidenMap and initialize it to Undef or Null.
+  VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
 
   // For each scalar that we create:
-  for (unsigned i = 0; i < VF; ++i) {
-    Instruction *Cloned = Instr->clone();
-    if (!IsVoidRetTy)
-      Cloned->setName(Instr->getName() + ".cloned");
-    // Replace the operands of the cloned instrucions with extracted scalars.
-    for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
-      Value *Op = Params[op];
-      // Param is a vector. Need to extract the right lane.
-      if (Op->getType()->isVectorTy())
-        Op = Builder.CreateExtractElement(Op, Builder.getInt32(i));
-      Cloned->setOperand(op, Op);
-    }
+  for (unsigned Width = 0; Width < VF; ++Width) {
+    // For each vector unroll 'part':
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      Instruction *Cloned = Instr->clone();
+      if (!IsVoidRetTy)
+        Cloned->setName(Instr->getName() + ".cloned");
+      // Replace the operands of the cloned instructions with extracted scalars.
+      for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+        Value *Op = Params[op][Part];
+        // Param is a vector. Need to extract the right lane.
+        if (Op->getType()->isVectorTy())
+          Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width));
+        Cloned->setOperand(op, Op);
+      }
 
-    // Place the cloned scalar in the new loop.
-    Builder.Insert(Cloned);
+      // Place the cloned scalar in the new loop.
+      Builder.Insert(Cloned);
 
-    // If the original scalar returns a value we need to place it in a vector
-    // so that future users will be able to use it.
-    if (!IsVoidRetTy)
-      VecResults = Builder.CreateInsertElement(VecResults, Cloned,
-                                               Builder.getInt32(i));
+      // If the original scalar returns a value we need to place it in a vector
+      // so that future users will be able to use it.
+      if (!IsVoidRetTy)
+        VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
+                                                       Builder.getInt32(Width));
+    }
   }
-
-  if (!IsVoidRetTy)
-    WidenMap[Instr] = VecResults;
 }
 
 Value*
@@ -503,7 +514,9 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
 
   // Generate the induction variable.
   Induction = Builder.CreatePHI(IdxTy, 2, "index");
-  Constant *Step = ConstantInt::get(IdxTy, VF);
+  // The loop step is equal to the vectorization factor (num of SIMD elements)
+  // times the unroll factor (num of SIMD instructions).
+  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
 
   // We may need to extend the index in case there is a type mismatch.
   // We know that the count starts at zero and does not overflow.
@@ -521,8 +534,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
 
   // Now we need to generate the expression for N - (N % VF), which is
   // the part that the vectorized body will execute.
-  Constant *CIVF = ConstantInt::get(IdxTy, VF);
-  Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc);
+  Value *R = BinaryOperator::CreateURem(Count, Step, "n.mod.vf", Loc);
   Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc);
   Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx,
                                                      "end.idx.rnd.down", Loc);
@@ -775,7 +787,6 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
   for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
        it != e; ++it) {
     PHINode *RdxPhi = *it;
-    PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
     assert(RdxPhi && "Unable to recover vectorized PHI");
 
     // Find the reduction variable descriptor.
@@ -791,8 +802,8 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     Builder.SetInsertPoint(LoopBypassBlock->getTerminator());
 
     // This is the vector-clone of the value that leaves the loop.
-    Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
-    Type *VecTy = VectorExit->getType();
+    VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
+    Type *VecTy = VectorExit[0]->getType();
 
     // Find the reduction identity variable. Zero for addition, or, xor,
     // one for multiplication, -1 for And.
@@ -811,10 +822,17 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
 
     // Reductions do not have to start at zero. They can start with
     // any loop invariant values.
-    VecRdxPhi->addIncoming(VectorStart, VecPreheader);
-    Value *Val =
-    getVectorValue(RdxPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
-    VecRdxPhi->addIncoming(Val, LoopVectorBody);
+    VectorParts &VecRdxPhi = WidenMap.get(RdxPhi);
+    BasicBlock *Latch = OrigLoop->getLoopLatch();
+    Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch);
+    VectorParts &Val = getVectorValue(LoopVal);
+    for (unsigned part = 0; part < UF; ++part) {
+      // Make sure to add the reduction start value only to the
+      // first unroll part.
+      Value *StartVal = (part == 0) ? VectorStart : Identity;
+      cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader);
+      cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part], LoopVectorBody);
+    }
 
     // Before each round, move the insertion point right between
     // the PHIs and the values we are going to write.
@@ -822,18 +840,54 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     // instructions.
     Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
 
-    // This PHINode contains the vectorized reduction variable, or
-    // the initial value vector, if we bypass the vector loop.
-    PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
-    NewPhi->addIncoming(VectorStart, LoopBypassBlock);
-    NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody);
+    VectorParts RdxParts;
+    for (unsigned part = 0; part < UF; ++part) {
+      // This PHINode contains the vectorized reduction variable, or
+      // the initial value vector, if we bypass the vector loop.
+      VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr);
+      PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
+      Value *StartVal = (part == 0) ? VectorStart : Identity;
+      NewPhi->addIncoming(StartVal, LoopBypassBlock);
+      NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody);
+      RdxParts.push_back(NewPhi);
+    }
+
+    // Reduce all of the unrolled parts into a single vector.
+    Value *ReducedPartRdx = RdxParts[0];
+    for (unsigned part = 1; part < UF; ++part) {
+      switch (RdxDesc.Kind) {
+      case LoopVectorizationLegality::IntegerAdd:
+        ReducedPartRdx = 
+          Builder.CreateAdd(RdxParts[part], ReducedPartRdx, "add.rdx");
+        break;
+      case LoopVectorizationLegality::IntegerMult:
+        ReducedPartRdx =
+          Builder.CreateMul(RdxParts[part], ReducedPartRdx, "mul.rdx");
+        break;
+      case LoopVectorizationLegality::IntegerOr:
+        ReducedPartRdx =
+          Builder.CreateOr(RdxParts[part], ReducedPartRdx, "or.rdx");
+        break;
+      case LoopVectorizationLegality::IntegerAnd:
+        ReducedPartRdx =
+          Builder.CreateAnd(RdxParts[part], ReducedPartRdx, "and.rdx");
+        break;
+      case LoopVectorizationLegality::IntegerXor:
+        ReducedPartRdx =
+          Builder.CreateXor(RdxParts[part], ReducedPartRdx, "xor.rdx");
+        break;
+      default:
+        llvm_unreachable("Unknown reduction operation");
+      }
+    }
+    
 
     // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
     // and vector ops, reducing the set of values being computed by half each
     // round.
     assert(isPowerOf2_32(VF) &&
            "Reduction emission only supported for pow2 vectors!");
-    Value *TmpVec = NewPhi;
+    Value *TmpVec = ReducedPartRdx;
     SmallVector<Constant*, 32> ShuffleMask(VF, 0);
     for (unsigned i = VF; i != 1; i >>= 1) {
       // Move the upper half of the vector to the lower half.
@@ -922,27 +976,34 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
   }
 }
 
-Value *InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
+InnerLoopVectorizer::VectorParts
+InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
   assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&
          "Invalid edge");
 
-  Value *SrcMask = createBlockInMask(Src);
+  VectorParts SrcMask = createBlockInMask(Src);
 
   // The terminator has to be a branch inst!
   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
   assert(BI && "Unexpected terminator found");
 
-  Value *EdgeMask = SrcMask;
   if (BI->isConditional()) {
-    EdgeMask = getVectorValue(BI->getCondition());
+    VectorParts EdgeMask = getVectorValue(BI->getCondition());
+
     if (BI->getSuccessor(0) != Dst)
-      EdgeMask = Builder.CreateNot(EdgeMask);
+      for (unsigned part = 0; part < UF; ++part)
+        EdgeMask[part] = Builder.CreateNot(EdgeMask[part]);
+
+    for (unsigned part = 0; part < UF; ++part)
+      EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]);
+    return EdgeMask;
   }
 
-  return Builder.CreateAnd(EdgeMask, SrcMask);
+  return SrcMask;
 }
 
-Value *InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
+InnerLoopVectorizer::VectorParts
+InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
 
   // Loop incoming mask is all-one.
@@ -953,11 +1014,14 @@ Value *InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
 
   // This is the block mask. We OR all incoming edges, and with zero.
   Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
-  Value *BlockMask = getVectorValue(Zero);
+  VectorParts BlockMask = getVectorValue(Zero);
 
   // For each pred:
-  for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it)
-    BlockMask = Builder.CreateOr(BlockMask, createEdgeMask(*it, BB));
+  for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) {
+    VectorParts EM = createEdgeMask(*it, BB);
+    for (unsigned part = 0; part < UF; ++part)
+      BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]);
+  }
 
   return BlockMask;
 }
@@ -969,6 +1033,7 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
 
   // For each instruction in the old loop.
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+    VectorParts &Entry = WidenMap.get(it);
     switch (it->getOpcode()) {
     case Instruction::Br:
       // Nothing to do for PHIs and BR, since we already took care of the
@@ -978,11 +1043,12 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
       PHINode* P = cast<PHINode>(it);
       // Handle reduction variables:
       if (Legal->getReductionVars()->count(P)) {
-        // This is phase one of vectorizing PHIs.
-        Type *VecTy = VectorType::get(it->getType(), VF);
-        WidenMap[it] =
-          PHINode::Create(VecTy, 2, "vec.phi",
-                          LoopVectorBody->getFirstInsertionPt());
+        for (unsigned part = 0; part < UF; ++part) {
+          // This is phase one of vectorizing PHIs.
+          Type *VecTy = VectorType::get(it->getType(), VF);
+          Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
+                                        LoopVectorBody-> getFirstInsertionPt());
+        }
         PV->push_back(P);
         continue;
       }
@@ -996,12 +1062,15 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
         // At this point we generate the predication tree. There may be
         // duplications since this is a simple recursive scan, but future
         // optimizations will clean it up.
-        Value *Cond = createEdgeMask(P->getIncomingBlock(0), P->getParent());
-        WidenMap[P] =
-          Builder.CreateSelect(Cond,
-                               getVectorValue(P->getIncomingValue(0)),
-                               getVectorValue(P->getIncomingValue(1)),
-                               "predphi");
+        VectorParts Cond = createEdgeMask(P->getIncomingBlock(0),
+                                               P->getParent());
+        
+        for (unsigned part = 0; part < UF; ++part) {
+        VectorParts &In0 = getVectorValue(P->getIncomingValue(0));
+        VectorParts &In1 = getVectorValue(P->getIncomingValue(1));
+          Entry[part] = Builder.CreateSelect(Cond[part], In0[part], In1[part],
+                                             "predphi");
+        }
         continue;
       }
 
@@ -1021,8 +1090,8 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
         Value *Broadcasted = getBroadcastInstrs(Induction);
         // After broadcasting the induction variable we need to make the
         // vector consecutive by adding 0, 1, 2 ...
-        Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted);
-        WidenMap[OldInduction] = ConsecutiveInduction;
+        for (unsigned part = 0; part < UF; ++part)
+          Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false);
         continue;
       }
       case LoopVectorizationLegality::ReverseIntInduction:
@@ -1054,9 +1123,8 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
           Value *Broadcasted = getBroadcastInstrs(ReverseInd);
           // After broadcasting the induction variable we need to make the
           // vector consecutive by adding  ... -3, -2, -1, 0.
-          Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted,
-                                                             true);
-          WidenMap[it] = ConsecutiveInduction;
+          for (unsigned part = 0; part < UF; ++part)
+            Entry[part] = getConsecutiveVector(Broadcasted, -VF * part, true);
           continue;
         }
 
@@ -1065,19 +1133,21 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
 
         // This is the vector of results. Notice that we don't generate
         // vector geps because scalar geps result in better code.
-        Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
-        for (unsigned int i = 0; i < VF; ++i) {
-          Constant *Idx = ConstantInt::get(Induction->getType(), i);
-          Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx,
-                                               "gep.idx");
-          Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
-                                             "next.gep");
-          VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
-                                               Builder.getInt32(i),
-                                               "insert.gep");
+        for (unsigned part = 0; part < UF; ++part) {
+          Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
+          for (unsigned int i = 0; i < VF; ++i) {
+            Constant *Idx = ConstantInt::get(Induction->getType(),
+                                             i + part * VF);
+            Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx,
+                                                 "gep.idx");
+            Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
+                                               "next.gep");
+            VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
+                                                 Builder.getInt32(i),
+                                                 "insert.gep");
+          }
+          Entry[part] = VecVal;
         }
-
-        WidenMap[it] = VecVal;
         continue;
       }
 
@@ -1103,41 +1173,48 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
     case Instruction::Xor: {
       // Just widen binops.
       BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it);
-      Value *A = getVectorValue(it->getOperand(0));
-      Value *B = getVectorValue(it->getOperand(1));
+      VectorParts &A = getVectorValue(it->getOperand(0));
+      VectorParts &B = getVectorValue(it->getOperand(1));
 
       // Use this vector value for all users of the original instruction.
-      Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
-      WidenMap[it] = V;
-
-      // Update the NSW, NUW and Exact flags.
-      BinaryOperator *VecOp = cast<BinaryOperator>(V);
-      if (isa<OverflowingBinaryOperator>(BinOp)) {
-        VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
-        VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
+      for (unsigned Part = 0; Part < UF; ++Part) {
+        Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
+
+        // Update the NSW, NUW and Exact flags.
+        BinaryOperator *VecOp = cast<BinaryOperator>(V);
+        if (isa<OverflowingBinaryOperator>(BinOp)) {
+          VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
+          VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
+        }
+        if (isa<PossiblyExactOperator>(VecOp))
+          VecOp->setIsExact(BinOp->isExact());
+
+        Entry[Part] = V;
       }
-      if (isa<PossiblyExactOperator>(VecOp))
-        VecOp->setIsExact(BinOp->isExact());
       break;
     }
     case Instruction::Select: {
       // Widen selects.
       // If the selector is loop invariant we can create a select
       // instruction with a scalar condition. Otherwise, use vector-select.
-      Value *Cond = it->getOperand(0);
-      bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop);
+      bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)),
+                                               OrigLoop);
 
       // The condition can be loop invariant  but still defined inside the
       // loop. This means that we can't just use the original 'cond' value.
       // We have to take the 'vectorized' value and pick the first lane.
       // Instcombine will make this a no-op.
-      Cond = getVectorValue(Cond);
-      if (InvariantCond)
-        Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0));
-
-      Value *Op0 = getVectorValue(it->getOperand(1));
-      Value *Op1 = getVectorValue(it->getOperand(2));
-      WidenMap[it] = Builder.CreateSelect(Cond, Op0, Op1);
+      VectorParts &Cond = getVectorValue(it->getOperand(0));
+      VectorParts &Op0  = getVectorValue(it->getOperand(1));
+      VectorParts &Op1  = getVectorValue(it->getOperand(2));
+      Value *ScalarCond = Builder.CreateExtractElement(Cond[0],
+                                                       Builder.getInt32(0));
+      for (unsigned Part = 0; Part < UF; ++Part) {
+        Entry[Part] = Builder.CreateSelect(
+          InvariantCond ? ScalarCond : Cond[Part],
+          Op0[Part],
+          Op1[Part]);
+      }
       break;
     }
 
@@ -1146,12 +1223,16 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
       // Widen compares. Generate vector compares.
       bool FCmp = (it->getOpcode() == Instruction::FCmp);
       CmpInst *Cmp = dyn_cast<CmpInst>(it);
-      Value *A = getVectorValue(it->getOperand(0));
-      Value *B = getVectorValue(it->getOperand(1));
-      if (FCmp)
-        WidenMap[it] = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
-      else
-        WidenMap[it] = Builder.CreateICmp(Cmp->getPredicate(), A, B);
+      VectorParts &A = getVectorValue(it->getOperand(0));
+      VectorParts &B = getVectorValue(it->getOperand(1));
+      for (unsigned Part = 0; Part < UF; ++Part) {
+        Value *C = 0;
+        if (FCmp)
+          C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
+        else
+          C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
+        Entry[Part] = C;
+      }
       break;
     }
 
@@ -1173,12 +1254,17 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
         break;
       }
 
+      // Handle consecutive stores.
+
       GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
       if (Gep) {
         // The last index does not have to be the induction. It can be
         // consecutive and be a function of the index. For example A[I+1];
         unsigned NumOperands = Gep->getNumOperands();
-        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1));
+
+        Value *LastGepOperand = Gep->getOperand(NumOperands - 1);
+        VectorParts &GEPParts = getVectorValue(LastGepOperand);
+        Value *LastIndex = GEPParts[0];
         LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
 
         // Create the new GEP with the new induction variable.
@@ -1188,19 +1274,28 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
       } else {
         // Use the induction element ptr.
         assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
-        Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
+        VectorParts &PtrVal = getVectorValue(Ptr);
+        Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
       }
 
-      // If the address is consecutive but reversed, then the
-      // wide load needs to start at the last vector element.
-      if (Reverse)
-        Ptr = Builder.CreateGEP(Ptr, Builder.getInt32(1 - VF));
+      VectorParts &StoredVal = getVectorValue(SI->getValueOperand());
+      for (unsigned Part = 0; Part < UF; ++Part) {
+        // Calculate the pointer for the specific unroll-part.
+        Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
+
+        if (Reverse) {
+          // If we store to reverse consecutive memory locations then we need
+          // to reverse the order of elements in the stored value.
+          StoredVal[Part] = reverseVector(StoredVal[Part]);
+          // If the address is consecutive but reversed, then the
+          // wide store needs to start at the last vector element.
+          PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
+          PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
+        }
 
-      Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
-      Value *Val = getVectorValue(SI->getValueOperand());
-      if (Reverse)
-        Val = reverseVector(Val);
-      Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
+        Value *VecPtr = Builder.CreateBitCast(PartPtr, StTy->getPointerTo());
+        Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment);
+      }
       break;
     }
     case Instruction::Load: {
@@ -1224,7 +1319,10 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
         // The last index does not have to be the induction. It can be
         // consecutive and be a function of the index. For example A[I+1];
         unsigned NumOperands = Gep->getNumOperands();
-        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
+
+        Value *LastGepOperand = Gep->getOperand(NumOperands - 1);
+        VectorParts &GEPParts = getVectorValue(LastGepOperand);
+        Value *LastIndex = GEPParts[0];
         LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
 
         // Create the new GEP with the new induction variable.
@@ -1234,19 +1332,26 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
       } else {
         // Use the induction element ptr.
         assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
-        Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
+        VectorParts &PtrVal = getVectorValue(Ptr);
+        Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
       }
-      // If the address is consecutive but reversed, then the
-      // wide load needs to start at the last vector element.
-      if (Reverse)
-        Ptr = Builder.CreateGEP(Ptr, Builder.getInt32(1 - VF));
 
-      Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
-      LI = Builder.CreateLoad(Ptr);
-      LI->setAlignment(Alignment);
+      for (unsigned Part = 0; Part < UF; ++Part) {
+        // Calculate the pointer for the specific unroll-part.
+        Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
 
-      // Use this vector value for all users of the load.
-      WidenMap[it] = Reverse ? reverseVector(LI) :  LI;
+        if (Reverse) {
+          // If the address is consecutive but reversed, then the
+          // wide load needs to start at the last vector element.
+          PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
+          PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
+        }
+
+        Value *VecPtr = Builder.CreateBitCast(PartPtr, RetTy->getPointerTo());
+        Value *LI = Builder.CreateLoad(VecPtr, "wide.load");
+        cast<LoadInst>(LI)->setAlignment(Alignment);
+        Entry[Part] = Reverse ? reverseVector(LI) :  LI;
+      }
       break;
     }
     case Instruction::ZExt:
@@ -1271,13 +1376,16 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
         Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
                                                CI->getType());
         Value *Broadcasted = getBroadcastInstrs(ScalarCast);
-        WidenMap[it] = getConsecutiveVector(Broadcasted);
+        for (unsigned Part = 0; Part < UF; ++Part)
+          Entry[Part] = getConsecutiveVector(Broadcasted, VF * Part, false);
         break;
       }
       /// Vectorize casts.
-      Value *A = getVectorValue(it->getOperand(0));
       Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
-      WidenMap[it] = Builder.CreateCast(CI->getOpcode(), A, DestTy);
+
+      VectorParts &A = getVectorValue(it->getOperand(0));
+      for (unsigned Part = 0; Part < UF; ++Part)
+        Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
       break;
     }
 
@@ -1286,12 +1394,16 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
       Module *M = BB->getParent()->getParent();
       IntrinsicInst *II = cast<IntrinsicInst>(it);
       Intrinsic::ID ID = II->getIntrinsicID();
-      SmallVector<Value*, 4> Args;
-      for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i)
-        Args.push_back(getVectorValue(II->getArgOperand(i)));
-      Type *Tys[] = { VectorType::get(II->getType()->getScalarType(), VF) };
-      Function *F = Intrinsic::getDeclaration(M, ID, Tys);
-      WidenMap[it] = Builder.CreateCall(F, Args);
+      for (unsigned Part = 0; Part < UF; ++Part) {
+        SmallVector<Value*, 4> Args;
+        for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) {
+          VectorParts &Arg = getVectorValue(II->getArgOperand(i));
+          Args.push_back(Arg[Part]);
+        }
+        Type *Tys[] = { VectorType::get(II->getType()->getScalarType(), VF) };
+        Function *F = Intrinsic::getDeclaration(M, ID, Tys);
+        Entry[Part] = Builder.CreateCall(F, Args);
+      }
       break;
     }
 
-- 
cgit v1.1


From e503319874f57ab4a0354521b03a71cf8e07b866 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Fri, 4 Jan 2013 17:48:25 +0000
Subject: LoopVectorizer:

1. Add code to estimate register pressure.
2. Add code to select the unroll factor based on register pressure.
3. Add bits to TargetTransformInfo to provide the number of registers.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171469 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 166 ++++++++++++++++++++++++++++-
 1 file changed, 162 insertions(+), 4 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8feea93..0f84fe0 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7,6 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 #include "LoopVectorize.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AliasSetTracker.h"
@@ -43,7 +44,7 @@ VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
                     cl::desc("Sets the SIMD width. Zero is autoselect."));
 
 static cl::opt<unsigned>
-VectorizationUnroll("force-vector-unroll", cl::init(1), cl::Hidden,
+VectorizationUnroll("force-vector-unroll", cl::init(0), cl::Hidden,
                     cl::desc("Sets the vectorization unroll count. "
                              "Zero is autoselect."));
 
@@ -94,7 +95,7 @@ struct LoopVectorize : public LoopPass {
     if (TTI)
       VTTI = TTI->getVectorTargetTransformInfo();
     // Use the cost model.
-    LoopVectorizationCostModel CM(L, SE, &LVL, VTTI);
+    LoopVectorizationCostModel CM(L, SE, LI, &LVL, VTTI);
 
     // Check the function attribues to find out if this function should be
     // optimized for size.
@@ -112,6 +113,7 @@ struct LoopVectorize : public LoopPass {
     }
 
     unsigned VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor);
+    unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll);
 
     if (VF == 1) {
       DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
@@ -120,9 +122,10 @@ struct LoopVectorize : public LoopPass {
 
     DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<<
           F->getParent()->getModuleIdentifier()<<"\n");
+    DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n");
 
     // If we decided that it is *legal* to vectorizer the loop then do it.
-    InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF, VectorizationUnroll);
+    InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF, UF);
     LB.vectorize(&LVL);
 
     DEBUG(verifyFunction(*L->getHeader()->getParent()));
@@ -2082,7 +2085,7 @@ bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
 
 unsigned
 LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
-                                                        unsigned UserVF) {
+                                                      unsigned UserVF) {
   if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
     DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n");
     return 1;
@@ -2148,6 +2151,161 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
   return Width;
 }
 
+unsigned
+LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
+                                               unsigned UserUF) {
+  // Pick the unroll factor: an explicit (non-zero) UserUF always wins.
+  if (UserUF != 0)
+    return UserUF;
+
+  // When we optimize for size we don't unroll.
+  if (OptForSize)
+    return 1;
+
+  unsigned TargetVectorRegisters = VTTI->getNumberOfRegisters(true);
+  DEBUG(dbgs() << "LV: The target has " << TargetVectorRegisters <<
+        " vector registers\n");
+
+  LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage();
+  // Both values are used as divisors below, so force each of them to be at
+  // least one to avoid a division by zero.
+  R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
+  R.NumInstructions = std::max(R.NumInstructions, 1U);
+
+  // Registers holding loop invariants are shared by all unrolled instances,
+  // so subtract them from the target's register count and divide the rest
+  // by the peak number of in-loop live values to estimate how many parallel
+  // instances fit without causing spills.
+  // NOTE(review): the subtraction appears to be unsigned; if LoopInvariantRegs
+  // exceeds TargetVectorRegisters it would wrap instead of going to zero.
+  unsigned UF = (TargetVectorRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers;
+
+  // We don't want to unroll the loops to the point where they do not fit into
+  // the decoded cache. Assume a budget of 32 IR instructions after unrolling.
+  UF = std::min(UF, (32 / R.NumInstructions));
+
+  // Clamp to [1, MaxUnrollSize]; the integer divisions above can yield zero.
+  if (UF > MaxUnrollSize)
+    UF = MaxUnrollSize;
+  else if (UF < 1)
+    UF = 1;
+
+  return UF;
+}
+
+LoopVectorizationCostModel::RegisterUsage
+LoopVectorizationCostModel::calculateRegisterUsage() {
+  // This function estimates the register usage by measuring the highest
+  // number of values that are alive at a single location. Obviously, this is
+  // a very rough estimation. We scan the loop blocks in reverse post-order
+  // and assign a number to each instruction. We use RPO to ensure that defs
+  // are met before their users. We assume that each instruction that has
+  // in-loop users starts an interval. We record every time that an in-loop
+  // value is used, so we know where the last use of each instruction is.
+  // Next, we transpose this data structure into a map that holds the list of
+  // intervals that *end* at a specific location. This map allows us to do a
+  // single linear scan. We scan the instructions linearly and record each
+  // time that a new interval starts, by placing it in a set. Intervals that
+  // the map lists as ending at the current index are removed from the set.
+  // The max register usage is the maximum size of the set.
+  // We also search for instructions that are defined outside the loop, but are
+  // used inside the loop. We need this number separately from the max-interval
+  // usage number because when we unroll, loop-invariant values do not take
+  // more registers.
+  LoopBlocksDFS DFS(TheLoop);
+  DFS.perform(LI);
+
+  RegisterUsage R;
+  R.NumInstructions = 0;
+
+  // Each 'key' in the map opens a new interval. The value is one past the
+  // index of the last in-loop user seen so far for the instruction that is
+  // the key (Index has already been incremented when it is recorded).
+  typedef DenseMap<Instruction*, unsigned> IntervalMap;
+  // Maps an index to the instruction that was assigned that index.
+  DenseMap<unsigned, Instruction*> IdxToInstr;
+  // Marks the end of each interval.
+  IntervalMap EndPoint;
+  // The set of instructions that have at least one user inside the loop.
+  SmallSet<Instruction*, 8> Ends;
+  // Instructions that are defined outside the loop but are used inside it.
+  // Arguments and constants are skipped by the operand scan and not recorded.
+  SmallPtrSet<Value*, 8> LoopInvariants;
+
+  unsigned Index = 0;
+  for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
+       be = DFS.endRPO(); bb != be; ++bb) {
+    R.NumInstructions += (*bb)->size();
+    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
+         ++it) {
+      Instruction *I = it;
+      IdxToInstr[Index++] = I;
+
+      // Save the end location of each USE.
+      for (unsigned i = 0; i < I->getNumOperands(); ++i) {
+        Value *U = I->getOperand(i);
+        Instruction *Instr = dyn_cast<Instruction>(U);
+
+        // Ignore non-instruction values such as arguments, constants, etc.
+        if (!Instr) continue;
+
+        // If this instruction is outside the loop then record it and continue.
+        if (!TheLoop->contains(Instr)) {
+          LoopInvariants.insert(Instr);
+          continue;
+        }
+
+        // Overwrite previous end points.
+        EndPoint[Instr] = Index;
+        Ends.insert(Instr);
+      }
+    }
+  }
+
+  // Saves the list of intervals that end with the index in 'key'.
+  typedef SmallVector<Instruction*, 2> InstrList;
+  DenseMap<unsigned, InstrList> TransposeEnds;
+
+  // Transpose the EndPoints to a list of values that end at each index.
+  for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end();
+       it != e; ++it)
+    TransposeEnds[it->second].push_back(it->first);
+
+  SmallSet<Instruction*, 8> OpenIntervals;
+  unsigned MaxUsage = 0;
+
+
+  DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
+  for (unsigned int i = 0; i < Index; ++i) {
+    Instruction *I = IdxToInstr[i];
+    // Skip instructions without in-loop users (also skips the erase below).
+    if (!Ends.count(I)) continue;
+
+    // Remove all of the instructions that end at this location.
+    InstrList &List = TransposeEnds[i];
+    for (unsigned int i=0, e = List.size(); i < e; ++i)
+      OpenIntervals.erase(List[i]);
+
+    // Count the number of live interals.
+    MaxUsage = std::max(MaxUsage, OpenIntervals.size());
+
+    DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
+          OpenIntervals.size() <<"\n");
+
+    // Add the current instruction to the list of open intervals.
+    OpenIntervals.insert(I);
+  }
+
+  unsigned Invariant = LoopInvariants.size();
+  DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << " \n");
+  DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << " \n");
+  DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << " \n");
+
+  R.LoopInvariantRegs = Invariant;
+  R.MaxLocalUsers = MaxUsage;
+  return R;
+}
+
 unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
   unsigned Cost = 0;
 
-- 
cgit v1.1


From 608467d1e1b318220307143d3af81f31cd7a18f8 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Fri, 4 Jan 2013 21:08:44 +0000
Subject: Fix a warning

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171525 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0f84fe0..4e50898 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2283,8 +2283,8 @@ LoopVectorizationCostModel::calculateRegisterUsage() {
 
     // Remove all of the instructions that end at this location.
     InstrList &List = TransposeEnds[i];
-    for (unsigned int i=0, e = List.size(); i < e; ++i)
-      OpenIntervals.erase(List[i]);
+    for (unsigned int j=0, e = List.size(); j < e; ++j)
+      OpenIntervals.erase(List[j]);
 
     // Count the number of live interals.
     MaxUsage = std::max(MaxUsage, OpenIntervals.size());
-- 
cgit v1.1


From 5767d919569f7b4be51c0a13bff24f722625d9f3 Mon Sep 17 00:00:00 2001
From: Paul Redmond <paul.redmond@intel.com>
Date: Fri, 4 Jan 2013 22:10:16 +0000
Subject: Do not vectorize loops with subtraction reductions

Since subtraction does not commute, the loop vectorizer incorrectly vectorizes
reductions such as x = A[i] - x.

Disabling for now.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171537 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 1 -
 1 file changed, 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4e50898..5e2d797 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1986,7 +1986,6 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I,
     // possibly.
     return true;
   case Instruction::Add:
-  case Instruction::Sub:
     return Kind == IntegerAdd;
   case Instruction::Mul:
     return Kind == IntegerMult;
-- 
cgit v1.1


From d5b92c389133c5d587e4094af553ec345ed40045 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sat, 5 Jan 2013 01:15:47 +0000
Subject: LoopVectorize: Non-commutative operators can be used as reduction
 variables as long as the reduction chain is used in the LHS.

PR14803.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171583 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5e2d797..af2e846 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1912,10 +1912,6 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
     if (Iter->use_empty())
       return false;
 
-    // Any reduction instr must be of one of the allowed kinds.
-    if (!isReductionInstr(Iter, Kind))
-      return false;
-
     // Did we find a user inside this loop already ?
     bool FoundInBlockUser = false;
     // Did we reach the initial PHI node already ?
@@ -1953,6 +1949,16 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
       if (FoundInBlockUser)
         return false;
       FoundInBlockUser = true;
+
+      // Any reduction instr must be of one of the allowed kinds.
+      if (!isReductionInstr(U, Kind))
+        return false;
+
+      // Reductions of instructions such as Div, and Sub is only
+      // possible if the LHS is the reduction variable.
+      if (!U->isCommutative() && U->getOperand(0) != Iter)
+        return false;
+
       Iter = U;
     }
 
@@ -1985,8 +1991,11 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I,
   case Instruction::PHI:
     // possibly.
     return true;
+  case Instruction::Sub:
   case Instruction::Add:
     return Kind == IntegerAdd;
+  case Instruction::SDiv:
+  case Instruction::UDiv:
   case Instruction::Mul:
     return Kind == IntegerMult;
   case Instruction::And:
-- 
cgit v1.1


From be73c7b903647221fbcaae302d31e90f53583040 Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Sat, 5 Jan 2013 10:16:02 +0000
Subject: Switch the loop vectorizer from VTTI to just use TTI directly.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171620 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 94 ++++++++++++++----------------
 1 file changed, 45 insertions(+), 49 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index af2e846..1f46b91 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -90,12 +90,8 @@ struct LoopVectorize : public LoopPass {
       return false;
     }
 
-    // Select the preffered vectorization factor.
-    const VectorTargetTransformInfo *VTTI = 0;
-    if (TTI)
-      VTTI = TTI->getVectorTargetTransformInfo();
     // Use the cost model.
-    LoopVectorizationCostModel CM(L, SE, LI, &LVL, VTTI);
+    LoopVectorizationCostModel CM(L, SE, LI, &LVL, TTI);
 
     // Check the function attribues to find out if this function should be
     // optimized for size.
@@ -2134,7 +2130,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
     return UserVF;
   }
 
-  if (!VTTI) {
+  if (!TTI) {
     DEBUG(dbgs() << "LV: No vector target information. Not vectorizing. \n");
     return 1;
   }
@@ -2170,7 +2166,7 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
   if (OptForSize)
     return 1;
 
-  unsigned TargetVectorRegisters = VTTI->getNumberOfRegisters(true);
+  unsigned TargetVectorRegisters = TTI->getNumberOfRegisters(true);
   DEBUG(dbgs() << "LV: The target has " << TargetVectorRegisters <<
         " vector registers\n");
 
@@ -2345,7 +2341,7 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
 
 unsigned
 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
-  assert(VTTI && "Invalid vector target transformation info");
+  assert(TTI && "Invalid vector target transformation info");
 
   // If we know that this instruction will remain uniform, check the cost of
   // the scalar version.
@@ -2363,7 +2359,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     // generate vector geps.
     return 0;
   case Instruction::Br: {
-    return VTTI->getCFInstrCost(I->getOpcode());
+    return TTI->getCFInstrCost(I->getOpcode());
   }
   case Instruction::PHI:
     //TODO: IF-converted IFs become selects.
@@ -2386,7 +2382,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   case Instruction::And:
   case Instruction::Or:
   case Instruction::Xor:
-    return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy);
+    return TTI->getArithmeticInstrCost(I->getOpcode(), VectorTy);
   case Instruction::Select: {
     SelectInst *SI = cast<SelectInst>(I);
     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
@@ -2395,13 +2391,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     if (ScalarCond)
       CondTy = VectorType::get(CondTy, VF);
 
-    return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
+    return TTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
   }
   case Instruction::ICmp:
   case Instruction::FCmp: {
     Type *ValTy = I->getOperand(0)->getType();
     VectorTy = ToVectorTy(ValTy, VF);
-    return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy);
+    return TTI->getCmpSelInstrCost(I->getOpcode(), VectorTy);
   }
   case Instruction::Store: {
     StoreInst *SI = cast<StoreInst>(I);
@@ -2409,7 +2405,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     VectorTy = ToVectorTy(ValTy, VF);
 
     if (VF == 1)
-      return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy,
+      return TTI->getMemoryOpCost(I->getOpcode(), VectorTy,
                                    SI->getAlignment(),
                                    SI->getPointerAddressSpace());
 
@@ -2422,36 +2418,36 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
       // The cost of extracting from the value vector and pointer vector.
       Type *PtrTy = ToVectorTy(I->getOperand(0)->getType(), VF);
       for (unsigned i = 0; i < VF; ++i) {
-        Cost += VTTI->getVectorInstrCost(Instruction::ExtractElement,
-                                         VectorTy, i);
-        Cost += VTTI->getVectorInstrCost(Instruction::ExtractElement,
-                                         PtrTy, i);
+        Cost += TTI->getVectorInstrCost(Instruction::ExtractElement,
+                                        VectorTy, i);
+        Cost += TTI->getVectorInstrCost(Instruction::ExtractElement,
+                                        PtrTy, i);
       }
 
       // The cost of the scalar stores.
-      Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
-                                         ValTy->getScalarType(),
+      Cost += VF * TTI->getMemoryOpCost(I->getOpcode(),
+                                        ValTy->getScalarType(),
                                          SI->getAlignment(),
                                          SI->getPointerAddressSpace());
       return Cost;
     }
 
     // Wide stores.
-    unsigned Cost = VTTI->getMemoryOpCost(I->getOpcode(), VectorTy,
-                                          SI->getAlignment(),
-                                          SI->getPointerAddressSpace());
+    unsigned Cost = TTI->getMemoryOpCost(I->getOpcode(), VectorTy,
+                                         SI->getAlignment(),
+                                         SI->getPointerAddressSpace());
     if (Reverse)
-      Cost += VTTI->getShuffleCost(VectorTargetTransformInfo::Reverse,
-                                   VectorTy, 0);
+      Cost += TTI->getShuffleCost(TargetTransformInfo::Reverse,
+                                  VectorTy, 0);
     return Cost;
   }
   case Instruction::Load: {
     LoadInst *LI = cast<LoadInst>(I);
 
     if (VF == 1)
-      return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy,
-                                   LI->getAlignment(),
-                                   LI->getPointerAddressSpace());
+      return TTI->getMemoryOpCost(I->getOpcode(), VectorTy,
+                                  LI->getAlignment(),
+                                  LI->getPointerAddressSpace());
 
     // Scalarized loads.
     int Stride = Legal->isConsecutivePtr(LI->getPointerOperand());
@@ -2462,29 +2458,29 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
 
       // The cost of extracting from the pointer vector.
       for (unsigned i = 0; i < VF; ++i)
-        Cost += VTTI->getVectorInstrCost(Instruction::ExtractElement,
-                                         PtrTy, i);
+        Cost += TTI->getVectorInstrCost(Instruction::ExtractElement,
+                                        PtrTy, i);
 
       // The cost of inserting data to the result vector.
       for (unsigned i = 0; i < VF; ++i)
-        Cost += VTTI->getVectorInstrCost(Instruction::InsertElement,
-                                         VectorTy, i);
+        Cost += TTI->getVectorInstrCost(Instruction::InsertElement,
+                                        VectorTy, i);
 
       // The cost of the scalar stores.
-      Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
-                                         RetTy->getScalarType(),
-                                         LI->getAlignment(),
-                                         LI->getPointerAddressSpace());
+      Cost += VF * TTI->getMemoryOpCost(I->getOpcode(),
+                                        RetTy->getScalarType(),
+                                        LI->getAlignment(),
+                                        LI->getPointerAddressSpace());
       return Cost;
     }
 
     // Wide loads.
-    unsigned Cost = VTTI->getMemoryOpCost(I->getOpcode(), VectorTy,
-                                          LI->getAlignment(),
-                                          LI->getPointerAddressSpace());
+    unsigned Cost = TTI->getMemoryOpCost(I->getOpcode(), VectorTy,
+                                         LI->getAlignment(),
+                                         LI->getPointerAddressSpace());
     if (Reverse)
-      Cost += VTTI->getShuffleCost(VectorTargetTransformInfo::Reverse,
-                                   VectorTy, 0);
+      Cost += TTI->getShuffleCost(TargetTransformInfo::Reverse,
+                                  VectorTy, 0);
     return Cost;
   }
   case Instruction::ZExt:
@@ -2503,11 +2499,11 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     // The cost of these is the same as the scalar operation.
     if (I->getOpcode() == Instruction::Trunc &&
         Legal->isInductionVariable(I->getOperand(0)))
-         return VTTI->getCastInstrCost(I->getOpcode(), I->getType(),
-                                       I->getOperand(0)->getType());
+         return TTI->getCastInstrCost(I->getOpcode(), I->getType(),
+                                      I->getOperand(0)->getType());
 
     Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
-    return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
+    return TTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
   }
   case Instruction::Call: {
     assert(isTriviallyVectorizableIntrinsic(I));
@@ -2516,7 +2512,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     SmallVector<Type*, 4> Tys;
     for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i)
       Tys.push_back(ToVectorTy(II->getArgOperand(i)->getType(), VF));
-    return VTTI->getIntrinsicInstrCost(II->getIntrinsicID(), RetTy, Tys);
+    return TTI->getIntrinsicInstrCost(II->getIntrinsicID(), RetTy, Tys);
   }
   default: {
     // We are scalarizing the instruction. Return the cost of the scalar
@@ -2525,10 +2521,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     unsigned Cost = 0;
 
     if (!RetTy->isVoidTy() && VF != 1) {
-      unsigned InsCost = VTTI->getVectorInstrCost(Instruction::InsertElement,
-                                                  VectorTy);
-      unsigned ExtCost = VTTI->getVectorInstrCost(Instruction::ExtractElement,
-                                                  VectorTy);
+      unsigned InsCost = TTI->getVectorInstrCost(Instruction::InsertElement,
+                                                 VectorTy);
+      unsigned ExtCost = TTI->getVectorInstrCost(Instruction::ExtractElement,
+                                                 VectorTy);
 
       // The cost of inserting the results plus extracting each one of the
       // operands.
@@ -2537,7 +2533,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
 
     // The cost of executing VF copies of the scalar instruction. This opcode
     // is unknown. Assume that it is the same as 'mul'.
-    Cost += VF * VTTI->getArithmeticInstrCost(Instruction::Mul, VectorTy);
+    Cost += VF * TTI->getArithmeticInstrCost(Instruction::Mul, VectorTy);
     return Cost;
   }
   }// end of switch.
-- 
cgit v1.1


From be04929f7fd76a921540e9901f24563e51dc1219 Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Mon, 7 Jan 2013 03:08:10 +0000
Subject: Move TargetTransformInfo to live under the Analysis library. This no
 longer would violate any dependency layering and it is in fact an analysis.
 =]

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171686 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1f46b91..6a26bef 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -17,6 +17,7 @@
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/Verifier.h"
 #include "llvm/IR/Constants.h"
@@ -33,7 +34,6 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/TargetTransformInfo.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
-- 
cgit v1.1


From bb00800ff46e7a2a628d0a6741a7f0422c74c198 Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Mon, 7 Jan 2013 03:20:02 +0000
Subject: Fix the enumerator names for ShuffleKind to match the coding
 standards, and make its comments doxygen comments.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171688 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6a26bef..17d9eb1 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2437,7 +2437,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
                                          SI->getAlignment(),
                                          SI->getPointerAddressSpace());
     if (Reverse)
-      Cost += TTI->getShuffleCost(TargetTransformInfo::Reverse,
+      Cost += TTI->getShuffleCost(TargetTransformInfo::SK_Reverse,
                                   VectorTy, 0);
     return Cost;
   }
@@ -2479,7 +2479,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
                                          LI->getAlignment(),
                                          LI->getPointerAddressSpace());
     if (Reverse)
-      Cost += TTI->getShuffleCost(TargetTransformInfo::Reverse,
+      Cost += TTI->getShuffleCost(TargetTransformInfo::SK_Reverse,
                                   VectorTy, 0);
     return Cost;
   }
-- 
cgit v1.1


From f3252b12e02b1fcf01abf0a79b761c53de5985d0 Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Mon, 7 Jan 2013 10:44:06 +0000
Subject: Merge the unused header file for LoopVectorizer into the source file.
 This makes the loop vectorizer match the pattern followed by roughly all
 other passes. =]

Notably, this header file was broken in several regards: it contained
a using namespace directive, global #define's that aren't globally
appropriate, and global constants defined directly in the header file.

As a side benefit, lots of the types in this file become internal, which
will cause the optimizer to chew on this pass more effectively.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171723 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 522 ++++++++++++++++++++++++++++-
 1 file changed, 519 insertions(+), 3 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 17d9eb1..d51114e 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6,8 +6,51 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-#include "LoopVectorize.h"
+//
+// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
+// and generates target-independent LLVM-IR. Legalization of the IR is done
+// in the codegen. However, the vectorizes uses (will use) the codegen
+// interfaces to generate IR that is likely to result in an optimal binary.
+//
+// The loop vectorizer combines consecutive loop iteration into a single
+// 'wide' iteration. After this transformation the index is incremented
+// by the SIMD vector width, and not by one.
+//
+// This pass has three parts:
+// 1. The main loop pass that drives the different parts.
+// 2. LoopVectorizationLegality - A unit that checks for the legality
+//    of the vectorization.
+// 3. InnerLoopVectorizer - A unit that performs the actual
+//    widening of instructions.
+// 4. LoopVectorizationCostModel - A unit that checks for the profitability
+//    of vectorization. It decides on the optimal vector width, which
+//    can be one, if vectorization is not profitable.
+//
+//===----------------------------------------------------------------------===//
+//
+// The reduction-variable vectorization is based on the paper:
+//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
+//
+// Variable uniformity checks are inspired by:
+// Karrenberg, R. and Hack, S. Whole Function Vectorization.
+//
+// Other ideas/concepts are from:
+//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
+//
+//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
+//  Vectorizing Compilers.
+//
+//===----------------------------------------------------------------------===//
+
+#define LV_NAME "loop-vectorize"
+#define DEBUG_TYPE LV_NAME
+
+#include "llvm/Transforms/Vectorize.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AliasSetTracker.h"
@@ -15,6 +58,7 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopIterator.h"
 #include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -24,6 +68,7 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
@@ -37,7 +82,10 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Vectorize.h"
+#include <algorithm>
+#include <map>
+
+using namespace llvm;
 
 static cl::opt<unsigned>
 VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
@@ -52,8 +100,476 @@ static cl::opt<bool>
 EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                    cl::desc("Enable if-conversion during vectorization."));
 
+/// We don't vectorize loops with a known constant trip count below this number.
+static const unsigned TinyTripCountThreshold = 16;
+
+/// When performing a runtime memory check, do not check more than this
+/// number of pointers. Notice that the check is quadratic!
+static const unsigned RuntimeMemoryCheckThreshold = 4;
+
+/// This is the highest vector width that we try to generate.
+static const unsigned MaxVectorSize = 8;
+
+/// This is the highest Unroll Factor.
+static const unsigned MaxUnrollSize = 4;
+
 namespace {
 
+// Forward declarations.
+class LoopVectorizationLegality;
+class LoopVectorizationCostModel;
+
+/// InnerLoopVectorizer vectorizes loops which contain only one basic
+/// block to a specified vectorization factor (VF).
+/// This class performs the widening of scalars into vectors, or multiple
+/// scalars. This class also implements the following features:
+/// * It inserts an epilogue loop for handling loops that don't have iteration
+///   counts that are known to be a multiple of the vectorization factor.
+/// * It handles the code generation for reduction variables.
+/// * Scalarization (implementation using scalars) of un-vectorizable
+///   instructions.
+/// InnerLoopVectorizer does not perform any vectorization-legality
+/// checks, and relies on the caller to check for the different legality
+/// aspects. The InnerLoopVectorizer relies on the
+/// LoopVectorizationLegality class to provide information about the induction
+/// and reduction variables that were found to a given vectorization factor.
+class InnerLoopVectorizer {
+public:
+  InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
+                      DominatorTree *DT, DataLayout *DL, unsigned VecWidth,
+                      unsigned UnrollFactor)
+      : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), VF(VecWidth),
+        UF(UnrollFactor), Builder(SE->getContext()), Induction(0),
+        OldInduction(0), WidenMap(UnrollFactor) {}
+
+  // Perform the actual loop widening (vectorization).
+  void vectorize(LoopVectorizationLegality *Legal) {
+    // Create a new empty loop. Unlink the old loop and connect the new one.
+    createEmptyLoop(Legal);
+    // Widen each instruction in the old loop to a new one in the new loop.
+    // Use the Legality module to find the induction and reduction variables.
+    vectorizeLoop(Legal);
+    // Register the new loop and update the analysis passes.
+    updateAnalysis();
+  }
+
+private:
+  /// A small list of PHINodes.
+  typedef SmallVector<PHINode*, 4> PhiVector;
+  /// When we unroll loops we have multiple vector values for each scalar.
+  /// This data structure holds the unrolled and vectorized values that
+  /// originated from one scalar instruction.
+  typedef SmallVector<Value*, 2> VectorParts;
+
+  /// Add code that checks at runtime if the accessed arrays overlap.
+  /// Returns the comparator value or NULL if no check is needed.
+  Value *addRuntimeCheck(LoopVectorizationLegality *Legal,
+                         Instruction *Loc);
+  /// Create an empty loop, based on the loop ranges of the old loop.
+  void createEmptyLoop(LoopVectorizationLegality *Legal);
+  /// Copy and widen the instructions from the old loop.
+  void vectorizeLoop(LoopVectorizationLegality *Legal);
+
+  /// A helper function that computes the predicate of the block BB, assuming
+  /// that the header block of the loop is set to True. It returns the *entry*
+  /// mask for the block BB.
+  VectorParts createBlockInMask(BasicBlock *BB);
+  /// A helper function that computes the predicate of the edge between SRC
+  /// and DST.
+  VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
+
+  /// A helper function to vectorize a single BB within the innermost loop.
+  void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB,
+                            PhiVector *PV);
+
+  /// Insert the new loop to the loop hierarchy and pass manager
+  /// and update the analysis passes.
+  void updateAnalysis();
+
+  /// This instruction is un-vectorizable. Implement it as a sequence
+  /// of scalars.
+  void scalarizeInstruction(Instruction *Instr);
+
+  /// Create a broadcast instruction. This method generates a broadcast
+  /// instruction (shuffle) for loop invariant values and for the induction
+  /// value. If this is the induction variable then we extend it to N, N+1, ...
+  /// this is needed because each iteration in the loop corresponds to a SIMD
+  /// element.
+  Value *getBroadcastInstrs(Value *V);
+
+  /// This function adds 0, 1, 2 ... to each vector element, starting at zero.
+  /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...).
+  /// The sequence starts at StartIndex.
+  Value *getConsecutiveVector(Value* Val, unsigned StartIdx, bool Negate);
+
+  /// When we go over instructions in the basic block we rely on previous
+  /// values within the current basic block or on loop invariant values.
+  /// When we widen (vectorize) values we place them in the map. If the values
+  /// are not within the map, they have to be loop invariant, so we simply
+  /// broadcast them into a vector.
+  VectorParts &getVectorValue(Value *V);
+
+  /// Get a uniform vector of constant integers. We use this to get
+  /// vectors of ones and zeros for the reduction code.
+  Constant* getUniformVector(unsigned Val, Type* ScalarTy);
+
+  /// Generate a shuffle sequence that will reverse the vector Vec.
+  Value *reverseVector(Value *Vec);
+
+  /// This is a helper class that holds the vectorizer state. It maps scalar
+  /// instructions to vector instructions. When the code is 'unrolled' then
+  /// then a single scalar value is mapped to multiple vector parts. The parts
+  /// are stored in the VectorPart type.
+  struct ValueMap {
+    /// C'tor.  UnrollFactor controls the number of vectors ('parts') that
+    /// are mapped.
+    ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}
+
+    /// \return True if 'Key' is saved in the Value Map.
+    bool has(Value *Key) { return MapStoreage.count(Key); }
+
+    /// Initializes a new entry in the map. Sets all of the vector parts to the
+    /// save value in 'Val'.
+    /// \return A reference to a vector with splat values.
+    VectorParts &splat(Value *Key, Value *Val) {
+      MapStoreage[Key].clear();
+      MapStoreage[Key].append(UF, Val);
+      return MapStoreage[Key];
+    }
+
+    ///\return A reference to the value that is stored at 'Key'.
+    VectorParts &get(Value *Key) {
+      if (!has(Key))
+        MapStoreage[Key].resize(UF);
+      return MapStoreage[Key];
+    }
+
+    /// The unroll factor. Each entry in the map stores this number of vector
+    /// elements.
+    unsigned UF;
+
+    /// Map storage. We use std::map and not DenseMap because insertions to a
+    /// dense map invalidates its iterators.
+    std::map<Value*, VectorParts> MapStoreage;
+  };
+
+  /// The original loop.
+  Loop *OrigLoop;
+  /// Scev analysis to use.
+  ScalarEvolution *SE;
+  /// Loop Info.
+  LoopInfo *LI;
+  /// Dominator Tree.
+  DominatorTree *DT;
+  /// Data Layout.
+  DataLayout *DL;
+  /// The vectorization SIMD factor to use. Each vector will have this many
+  /// vector elements.
+  unsigned VF;
+  /// The vectorization unroll factor to use. Each scalar is vectorized to this
+  /// many different vector instructions.
+  unsigned UF;
+
+  /// The builder that we use
+  IRBuilder<> Builder;
+
+  // --- Vectorization state ---
+
+  /// The vector-loop preheader.
+  BasicBlock *LoopVectorPreHeader;
+  /// The scalar-loop preheader.
+  BasicBlock *LoopScalarPreHeader;
+  /// Middle Block between the vector and the scalar.
+  BasicBlock *LoopMiddleBlock;
+  ///The ExitBlock of the scalar loop.
+  BasicBlock *LoopExitBlock;
+  ///The vector loop body.
+  BasicBlock *LoopVectorBody;
+  ///The scalar loop body.
+  BasicBlock *LoopScalarBody;
+  ///The first bypass block.
+  BasicBlock *LoopBypassBlock;
+
+  /// The new Induction variable which was added to the new block.
+  PHINode *Induction;
+  /// The induction variable of the old basic block.
+  PHINode *OldInduction;
+  /// Maps scalars to widened vectors.
+  ValueMap WidenMap;
+};
+
+/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
+/// to what vectorization factor.
+/// This class does not look at the profitability of vectorization, only the
+/// legality. This class has two main kinds of checks:
+/// * Memory checks - The code in canVectorizeMemory checks if vectorization
+///   will change the order of memory accesses in a way that will change the
+///   correctness of the program.
+/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
+/// checks for a number of different conditions, such as the availability of a
+/// single induction variable, that all types are supported and vectorize-able,
+/// etc. This code reflects the capabilities of InnerLoopVectorizer.
+/// This class is also used by InnerLoopVectorizer for identifying
+/// induction variable and the different reduction variables.
+class LoopVectorizationLegality {
+public:
+  LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL,
+                            DominatorTree *DT)
+      : TheLoop(L), SE(SE), DL(DL), DT(DT), Induction(0) {}
+
+  /// This enum represents the kinds of reductions that we support.
+  enum ReductionKind {
+    NoReduction, ///< Not a reduction.
+    IntegerAdd,  ///< Sum of numbers.
+    IntegerMult, ///< Product of numbers.
+    IntegerOr,   ///< Bitwise or logical OR of numbers.
+    IntegerAnd,  ///< Bitwise or logical AND of numbers.
+    IntegerXor   ///< Bitwise or logical XOR of numbers.
+  };
+
+  /// This enum represents the kinds of inductions that we support.
+  enum InductionKind {
+    NoInduction,         ///< Not an induction variable.
+    IntInduction,        ///< Integer induction variable. Step = 1.
+    ReverseIntInduction, ///< Reverse int induction variable. Step = -1.
+    PtrInduction         ///< Pointer induction variable. Step = sizeof(elem).
+  };
+
+  /// This POD struct holds information about reduction variables.
+  struct ReductionDescriptor {
+    ReductionDescriptor() : StartValue(0), LoopExitInstr(0), Kind(NoReduction) {
+    }
+
+    ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K)
+        : StartValue(Start), LoopExitInstr(Exit), Kind(K) {}
+
+    // The starting value of the reduction.
+    // It does not have to be zero!
+    Value *StartValue;
+    // The instruction whose value is used outside the loop.
+    Instruction *LoopExitInstr;
+    // The kind of the reduction.
+    ReductionKind Kind;
+  };
+
+  // This POD struct holds information about the memory runtime legality
+  // check that a group of pointers do not overlap.
+  struct RuntimePointerCheck {
+    RuntimePointerCheck() : Need(false) {}
+
+    /// Reset the state of the pointer runtime information.
+    void reset() {
+      Need = false;
+      Pointers.clear();
+      Starts.clear();
+      Ends.clear();
+    }
+
+    /// Insert a pointer and calculate the start and end SCEVs.
+    void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr);
+
+    /// This flag indicates if we need to add the runtime check.
+    bool Need;
+    /// Holds the pointers that we need to check.
+    SmallVector<Value*, 2> Pointers;
+    /// Holds the pointer value at the beginning of the loop.
+    SmallVector<const SCEV*, 2> Starts;
+    /// Holds the pointer value at the end of the loop.
+    SmallVector<const SCEV*, 2> Ends;
+  };
+
+  /// A POD for saving information about induction variables.
+  struct InductionInfo {
+    InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {}
+    InductionInfo() : StartValue(0), IK(NoInduction) {}
+    /// Start value.
+    Value *StartValue;
+    /// Induction kind.
+    InductionKind IK;
+  };
+
+  /// ReductionList contains the reduction descriptors for all
+  /// of the reductions that were found in the loop.
+  typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
+
+  /// InductionList saves induction variables and maps them to the
+  /// induction descriptor.
+  typedef MapVector<PHINode*, InductionInfo> InductionList;
+
+  /// Returns true if it is legal to vectorize this loop.
+  /// This does not mean that it is profitable to vectorize this
+  /// loop, only that it is legal to do so.
+  bool canVectorize();
+
+  /// Returns the Induction variable.
+  PHINode *getInduction() { return Induction; }
+
+  /// Returns the reduction variables found in the loop.
+  ReductionList *getReductionVars() { return &Reductions; }
+
+  /// Returns the induction variables found in the loop.
+  InductionList *getInductionVars() { return &Inductions; }
+
+  /// Returns True if V is an induction variable in this loop.
+  bool isInductionVariable(const Value *V);
+
+  /// Return true if the block BB needs to be predicated in order for the loop
+  /// to be vectorized.
+  bool blockNeedsPredication(BasicBlock *BB);
+
+  /// Check if this pointer is consecutive when vectorizing. This happens
+  /// when the last index of the GEP is the induction variable, or that the
+  /// pointer itself is an induction variable.
+  /// This check allows us to vectorize A[idx] into a wide load/store.
+  /// Returns:
+  /// 0 - Stride is unknown or non consecutive.
+  /// 1 - Address is consecutive.
+  /// -1 - Address is consecutive, and decreasing.
+  int isConsecutivePtr(Value *Ptr);
+
+  /// Returns true if the value V is uniform within the loop.
+  bool isUniform(Value *V);
+
+  /// Returns true if this instruction will remain scalar after vectorization.
+  bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); }
+
+  /// Returns the information that we collected about runtime memory check.
+  RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; }
+private:
+  /// Check if a single basic block loop is vectorizable.
+  /// At this point we know that this is a loop with a constant trip count
+  /// and we only need to check individual instructions.
+  bool canVectorizeInstrs();
+
+  /// When we vectorize loops we may change the order in which
+  /// we read and write from memory. This method checks if it is
+  /// legal to vectorize the code, considering only memory constraints.
+  /// Returns true if the loop is vectorizable
+  bool canVectorizeMemory();
+
+  /// Return true if we can vectorize this loop using the IF-conversion
+  /// transformation.
+  bool canVectorizeWithIfConvert();
+
+  /// Collect the variables that need to stay uniform after vectorization.
+  void collectLoopUniforms();
+
+  /// Return true if all of the instructions in the block can be speculatively
+  /// executed.
+  bool blockCanBePredicated(BasicBlock *BB);
+
+  /// Returns True, if 'Phi' is the kind of reduction variable for type
+  /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
+  bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
+  /// Returns true if the instruction I can be a reduction variable of type
+  /// 'Kind'.
+  bool isReductionInstr(Instruction *I, ReductionKind Kind);
+  /// Returns the induction kind of Phi. This function may return NoInduction
+  /// if the PHI is not an induction variable.
+  InductionKind isInductionVariable(PHINode *Phi);
+  /// Return true if we can compute the address bounds of Ptr within the loop.
+  bool hasComputableBounds(Value *Ptr);
+
+  /// The loop that we evaluate.
+  Loop *TheLoop;
+  /// Scev analysis.
+  ScalarEvolution *SE;
+  /// DataLayout analysis.
+  DataLayout *DL;
+  // Dominators.
+  DominatorTree *DT;
+
+  //  ---  vectorization state --- //
+
+  /// Holds the integer induction variable. This is the counter of the
+  /// loop.
+  PHINode *Induction;
+  /// Holds the reduction variables.
+  ReductionList Reductions;
+  /// Holds all of the induction variables that we found in the loop.
+  /// Notice that inductions don't need to start at zero and that induction
+  /// variables can be pointers.
+  InductionList Inductions;
+
+  /// Allowed outside users. This holds the reduction
+  /// vars which can be accessed from outside the loop.
+  SmallPtrSet<Value*, 4> AllowedExit;
+  /// This set holds the variables which are known to be uniform after
+  /// vectorization.
+  SmallPtrSet<Instruction*, 4> Uniforms;
+  /// We need to check that all of the pointers in this list are disjoint
+  /// at runtime.
+  RuntimePointerCheck PtrRtCheck;
+};
+
+/// LoopVectorizationCostModel - estimates the expected speedups due to
+/// vectorization.
+/// In many cases vectorization is not profitable. This can happen because of
+/// a number of reasons. In this class we mainly attempt to predict the
+/// expected speedup/slowdowns due to the supported instruction set. We use the
+/// TargetTransformInfo to query the different backends for the cost of
+/// different operations.
+class LoopVectorizationCostModel {
+public:
+  LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
+                             LoopVectorizationLegality *Legal,
+                             const TargetTransformInfo *TTI)
+      : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI) {}
+
+  /// \return The most profitable vectorization factor.
+  /// This method checks every power of two up to VF. If UserVF is not ZERO
+  /// then this vectorization factor will be selected if vectorization is
+  /// possible.
+  unsigned selectVectorizationFactor(bool OptForSize, unsigned UserVF);
+
+
+  /// \return The most profitable unroll factor.
+  /// If UserUF is non-zero then this method finds the best unroll-factor
+  /// based on register pressure and other parameters.
+  unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF);
+
+  /// \brief A struct that represents some properties of the register usage
+  /// of a loop.
+  struct RegisterUsage {
+    /// Holds the number of loop invariant values that are used in the loop.
+    unsigned LoopInvariantRegs;
+    /// Holds the maximum number of concurrent live intervals in the loop.
+    unsigned MaxLocalUsers;
+    /// Holds the number of instructions in the loop.
+    unsigned NumInstructions;
+  };
+
+  /// \return  information about the register usage of the loop.
+  RegisterUsage calculateRegisterUsage();
+
+private:
+  /// Returns the expected execution cost. The unit of the cost does
+  /// not matter because we use the 'cost' units to compare different
+  /// vector widths. The cost that is returned is *not* normalized by
+  /// the factor width.
+  unsigned expectedCost(unsigned VF);
+
+  /// Returns the execution time cost of an instruction for a given vector
+  /// width. Vector width of one means scalar.
+  unsigned getInstructionCost(Instruction *I, unsigned VF);
+
+  /// A helper function for converting Scalar types to vector types.
+  /// If the incoming type is void, we return void. If the VF is 1, we return
+  /// the scalar type.
+  static Type* ToVectorTy(Type *Scalar, unsigned VF);
+
+  /// The loop that we evaluate.
+  Loop *TheLoop;
+  /// Scev analysis.
+  ScalarEvolution *SE;
+  /// Loop Info analysis.
+  LoopInfo *LI;
+  /// Vectorization legality.
+  LoopVectorizationLegality *Legal;
+  /// Vector target information.
+  const TargetTransformInfo *TTI;
+};
+
 /// The LoopVectorize Pass.
 struct LoopVectorize : public LoopPass {
   /// Pass identification, replacement for typeid
@@ -141,7 +657,7 @@ struct LoopVectorize : public LoopPass {
 
 };
 
-}// namespace
+} // end anonymous namespace
 
 //===----------------------------------------------------------------------===//
 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
-- 
cgit v1.1


From 1cbeaeb1944f6fd1ab0997201a47ea6c23e9a979 Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Mon, 7 Jan 2013 11:12:29 +0000
Subject: Simplify LoopVectorize to require target transform info and rely on
 it being present. Make a member of one of the helper classes a reference as
 part of this.

Reformatting goodness brought to you by clang-format.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171726 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 98 +++++++++++++-----------------
 1 file changed, 43 insertions(+), 55 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index d51114e..2c1af1d 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -513,7 +513,7 @@ class LoopVectorizationCostModel {
 public:
   LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
                              LoopVectorizationLegality *Legal,
-                             const TargetTransformInfo *TTI)
+                             const TargetTransformInfo &TTI)
       : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI) {}
 
   /// \return The most profitable vectorization factor.
@@ -567,7 +567,7 @@ private:
   /// Vectorization legality.
   LoopVectorizationLegality *Legal;
   /// Vector target information.
-  const TargetTransformInfo *TTI;
+  const TargetTransformInfo &TTI;
 };
 
 /// The LoopVectorize Pass.
@@ -593,7 +593,7 @@ struct LoopVectorize : public LoopPass {
     SE = &getAnalysis<ScalarEvolution>();
     DL = getAnalysisIfAvailable<DataLayout>();
     LI = &getAnalysis<LoopInfo>();
-    TTI = getAnalysisIfAvailable<TargetTransformInfo>();
+    TTI = &getAnalysis<TargetTransformInfo>();
     DT = &getAnalysis<DominatorTree>();
 
     DEBUG(dbgs() << "LV: Checking a loop in \"" <<
@@ -607,7 +607,7 @@ struct LoopVectorize : public LoopPass {
     }
 
     // Use the cost model.
-    LoopVectorizationCostModel CM(L, SE, LI, &LVL, TTI);
+    LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI);
 
     // Check the function attribues to find out if this function should be
     // optimized for size.
@@ -648,9 +648,10 @@ struct LoopVectorize : public LoopPass {
     LoopPass::getAnalysisUsage(AU);
     AU.addRequiredID(LoopSimplifyID);
     AU.addRequiredID(LCSSAID);
+    AU.addRequired<DominatorTree>();
     AU.addRequired<LoopInfo>();
     AU.addRequired<ScalarEvolution>();
-    AU.addRequired<DominatorTree>();
+    AU.addRequired<TargetTransformInfo>();
     AU.addPreserved<LoopInfo>();
     AU.addPreserved<DominatorTree>();
   }
@@ -2646,11 +2647,6 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
     return UserVF;
   }
 
-  if (!TTI) {
-    DEBUG(dbgs() << "LV: No vector target information. Not vectorizing. \n");
-    return 1;
-  }
-
   float Cost = expectedCost(1);
   unsigned Width = 1;
   DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n");
@@ -2682,7 +2678,7 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
   if (OptForSize)
     return 1;
 
-  unsigned TargetVectorRegisters = TTI->getNumberOfRegisters(true);
+  unsigned TargetVectorRegisters = TTI.getNumberOfRegisters(true);
   DEBUG(dbgs() << "LV: The target has " << TargetVectorRegisters <<
         " vector registers\n");
 
@@ -2857,8 +2853,6 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
 
 unsigned
 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
-  assert(TTI && "Invalid vector target transformation info");
-
   // If we know that this instruction will remain uniform, check the cost of
   // the scalar version.
   if (Legal->isUniformAfterVectorization(I))
@@ -2875,7 +2869,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     // generate vector geps.
     return 0;
   case Instruction::Br: {
-    return TTI->getCFInstrCost(I->getOpcode());
+    return TTI.getCFInstrCost(I->getOpcode());
   }
   case Instruction::PHI:
     //TODO: IF-converted IFs become selects.
@@ -2898,7 +2892,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   case Instruction::And:
   case Instruction::Or:
   case Instruction::Xor:
-    return TTI->getArithmeticInstrCost(I->getOpcode(), VectorTy);
+    return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy);
   case Instruction::Select: {
     SelectInst *SI = cast<SelectInst>(I);
     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
@@ -2907,13 +2901,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     if (ScalarCond)
       CondTy = VectorType::get(CondTy, VF);
 
-    return TTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
+    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
   }
   case Instruction::ICmp:
   case Instruction::FCmp: {
     Type *ValTy = I->getOperand(0)->getType();
     VectorTy = ToVectorTy(ValTy, VF);
-    return TTI->getCmpSelInstrCost(I->getOpcode(), VectorTy);
+    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
   }
   case Instruction::Store: {
     StoreInst *SI = cast<StoreInst>(I);
@@ -2921,7 +2915,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     VectorTy = ToVectorTy(ValTy, VF);
 
     if (VF == 1)
-      return TTI->getMemoryOpCost(I->getOpcode(), VectorTy,
+      return TTI.getMemoryOpCost(I->getOpcode(), VectorTy,
                                    SI->getAlignment(),
                                    SI->getPointerAddressSpace());
 
@@ -2934,26 +2928,24 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
       // The cost of extracting from the value vector and pointer vector.
       Type *PtrTy = ToVectorTy(I->getOperand(0)->getType(), VF);
       for (unsigned i = 0; i < VF; ++i) {
-        Cost += TTI->getVectorInstrCost(Instruction::ExtractElement,
-                                        VectorTy, i);
-        Cost += TTI->getVectorInstrCost(Instruction::ExtractElement,
-                                        PtrTy, i);
+        Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
+                                       i);
+        Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i);
       }
 
       // The cost of the scalar stores.
-      Cost += VF * TTI->getMemoryOpCost(I->getOpcode(),
-                                        ValTy->getScalarType(),
-                                         SI->getAlignment(),
-                                         SI->getPointerAddressSpace());
+      Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
+                                       SI->getAlignment(),
+                                       SI->getPointerAddressSpace());
       return Cost;
     }
 
     // Wide stores.
-    unsigned Cost = TTI->getMemoryOpCost(I->getOpcode(), VectorTy,
-                                         SI->getAlignment(),
-                                         SI->getPointerAddressSpace());
+    unsigned Cost = TTI.getMemoryOpCost(I->getOpcode(), VectorTy,
+                                        SI->getAlignment(),
+                                        SI->getPointerAddressSpace());
     if (Reverse)
-      Cost += TTI->getShuffleCost(TargetTransformInfo::SK_Reverse,
+      Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
                                   VectorTy, 0);
     return Cost;
   }
@@ -2961,9 +2953,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     LoadInst *LI = cast<LoadInst>(I);
 
     if (VF == 1)
-      return TTI->getMemoryOpCost(I->getOpcode(), VectorTy,
-                                  LI->getAlignment(),
-                                  LI->getPointerAddressSpace());
+      return TTI.getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(),
+                                 LI->getPointerAddressSpace());
 
     // Scalarized loads.
     int Stride = Legal->isConsecutivePtr(LI->getPointerOperand());
@@ -2974,29 +2965,25 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
 
       // The cost of extracting from the pointer vector.
       for (unsigned i = 0; i < VF; ++i)
-        Cost += TTI->getVectorInstrCost(Instruction::ExtractElement,
-                                        PtrTy, i);
+        Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i);
 
       // The cost of inserting data to the result vector.
       for (unsigned i = 0; i < VF; ++i)
-        Cost += TTI->getVectorInstrCost(Instruction::InsertElement,
-                                        VectorTy, i);
+        Cost += TTI.getVectorInstrCost(Instruction::InsertElement, VectorTy, i);
 
       // The cost of the scalar stores.
-      Cost += VF * TTI->getMemoryOpCost(I->getOpcode(),
-                                        RetTy->getScalarType(),
-                                        LI->getAlignment(),
-                                        LI->getPointerAddressSpace());
+      Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), RetTy->getScalarType(),
+                                       LI->getAlignment(),
+                                       LI->getPointerAddressSpace());
       return Cost;
     }
 
     // Wide loads.
-    unsigned Cost = TTI->getMemoryOpCost(I->getOpcode(), VectorTy,
-                                         LI->getAlignment(),
-                                         LI->getPointerAddressSpace());
+    unsigned Cost = TTI.getMemoryOpCost(I->getOpcode(), VectorTy,
+                                        LI->getAlignment(),
+                                        LI->getPointerAddressSpace());
     if (Reverse)
-      Cost += TTI->getShuffleCost(TargetTransformInfo::SK_Reverse,
-                                  VectorTy, 0);
+      Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
     return Cost;
   }
   case Instruction::ZExt:
@@ -3015,11 +3002,11 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     // The cost of these is the same as the scalar operation.
     if (I->getOpcode() == Instruction::Trunc &&
         Legal->isInductionVariable(I->getOperand(0)))
-         return TTI->getCastInstrCost(I->getOpcode(), I->getType(),
-                                      I->getOperand(0)->getType());
+      return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
+                                  I->getOperand(0)->getType());
 
     Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
-    return TTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
+    return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
   }
   case Instruction::Call: {
     assert(isTriviallyVectorizableIntrinsic(I));
@@ -3028,7 +3015,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     SmallVector<Type*, 4> Tys;
     for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i)
       Tys.push_back(ToVectorTy(II->getArgOperand(i)->getType(), VF));
-    return TTI->getIntrinsicInstrCost(II->getIntrinsicID(), RetTy, Tys);
+    return TTI.getIntrinsicInstrCost(II->getIntrinsicID(), RetTy, Tys);
   }
   default: {
     // We are scalarizing the instruction. Return the cost of the scalar
@@ -3037,10 +3024,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     unsigned Cost = 0;
 
     if (!RetTy->isVoidTy() && VF != 1) {
-      unsigned InsCost = TTI->getVectorInstrCost(Instruction::InsertElement,
-                                                 VectorTy);
-      unsigned ExtCost = TTI->getVectorInstrCost(Instruction::ExtractElement,
-                                                 VectorTy);
+      unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement,
+                                                VectorTy);
+      unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement,
+                                                VectorTy);
 
       // The cost of inserting the results plus extracting each one of the
       // operands.
@@ -3049,7 +3036,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
 
     // The cost of executing VF copies of the scalar instruction. This opcode
     // is unknown. Assume that it is the same as 'mul'.
-    Cost += VF * TTI->getArithmeticInstrCost(Instruction::Mul, VectorTy);
+    Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
     return Cost;
   }
   }// end of switch.
@@ -3065,6 +3052,7 @@ char LoopVectorize::ID = 0;
 static const char lv_name[] = "Loop Vectorization";
 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
-- 
cgit v1.1


From 9a6c6a373629fb5a3cc5afd08aafa51339df95df Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Mon, 7 Jan 2013 21:54:51 +0000
Subject: LoopVectorizer: When we vectorizer and widen loops we process many
 elements at once. This is a good thing, except for small loops. On small
 loops post-loop that handles scalars (and runs slower) can take more time to
 execute than the rest of the loop. This patch disables widening of loops with
 a small static trip count.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171798 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2c1af1d..b266d9d 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -101,7 +101,13 @@ EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                    cl::desc("Enable if-conversion during vectorization."));
 
 /// We don't vectorize loops with a known constant trip count below this number.
-static const unsigned TinyTripCountThreshold = 16;
+static const unsigned TinyTripCountVectorThreshold = 16;
+
+/// We don't unroll loops with a known constant trip count below this number.
+static const unsigned TinyTripCountUnrollThreshold = 128;
+
+/// We don't unroll loops that are larger than this threshold.
+static const unsigned MaxLoopSizeThreshold = 32;
 
 /// When performing a runtime memory check, do not check more than this
 /// number of pointers. Notice that the check is quadratic!
@@ -2016,7 +2022,7 @@ bool LoopVectorizationLegality::canVectorize() {
 
   // Do not loop-vectorize loops with a tiny trip count.
   unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch);
-  if (TC > 0u && TC < TinyTripCountThreshold) {
+  if (TC > 0u && TC < TinyTripCountVectorThreshold) {
     DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
           "This loop is not worth vectorizing.\n");
     return false;
@@ -2678,6 +2684,12 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
   if (OptForSize)
     return 1;
 
+  // Do not unroll loops with a relatively small trip count.
+  unsigned TC = SE->getSmallConstantTripCount(TheLoop,
+                                              TheLoop->getLoopLatch());
+  if (TC > 1 && TC < TinyTripCountUnrollThreshold)
+    return 1;
+
   unsigned TargetVectorRegisters = TTI.getNumberOfRegisters(true);
   DEBUG(dbgs() << "LV: The target has " << TargetVectorRegisters <<
         " vector registers\n");
@@ -2698,7 +2710,7 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
 
   // We don't want to unroll the loops to the point where they do not fit into
   // the decoded cache. Assume that we only allow 32 IR instructions.
-  UF = std::min(UF, (32 / R.NumInstructions));
+  UF = std::min(UF, (MaxLoopSizeThreshold / R.NumInstructions));
 
   // Clamp the unroll factor ranges to reasonable factors.
   if (UF > MaxUnrollSize)
-- 
cgit v1.1


From 111e5fe7e089e0ffe73873848315ea5358120dfa Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Mon, 7 Jan 2013 23:13:00 +0000
Subject: LoopVectorizer: Add support for floating point reductions

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171812 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 149 ++++++++++++++++++-----------
 1 file changed, 91 insertions(+), 58 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index b266d9d..cb6609f 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -215,10 +215,6 @@ private:
   /// broadcast them into a vector.
   VectorParts &getVectorValue(Value *V);
 
-  /// Get a uniform vector of constant integers. We use this to get
-  /// vectors of ones and zeros for the reduction code.
-  Constant* getUniformVector(unsigned Val, Type* ScalarTy);
-
   /// Generate a shuffle sequence that will reverse the vector Vec.
   Value *reverseVector(Value *Vec);
 
@@ -325,12 +321,14 @@ public:
 
   /// This enum represents the kinds of reductions that we support.
   enum ReductionKind {
-    NoReduction, ///< Not a reduction.
-    IntegerAdd,  ///< Sum of numbers.
-    IntegerMult, ///< Product of numbers.
-    IntegerOr,   ///< Bitwise or logical OR of numbers.
-    IntegerAnd,  ///< Bitwise or logical AND of numbers.
-    IntegerXor   ///< Bitwise or logical XOR of numbers.
+    RK_NoReduction, ///< Not a reduction.
+    RK_IntegerAdd,  ///< Sum of integers.
+    RK_IntegerMult, ///< Product of integers.
+    RK_IntegerOr,   ///< Bitwise or logical OR of numbers.
+    RK_IntegerAnd,  ///< Bitwise or logical AND of numbers.
+    RK_IntegerXor,  ///< Bitwise or logical XOR of numbers.
+    RK_FloatAdd,    ///< Sum of floats.
+    RK_FloatMult    ///< Product of floats.
   };
 
   /// This enum represents the kinds of inductions that we support.
@@ -343,8 +341,8 @@ public:
 
   /// This POD struct holds information about reduction variables.
   struct ReductionDescriptor {
-    ReductionDescriptor() : StartValue(0), LoopExitInstr(0), Kind(NoReduction) {
-    }
+    ReductionDescriptor() : StartValue(0), LoopExitInstr(0),
+      Kind(RK_NoReduction) {}
 
     ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K)
         : StartValue(Start), LoopExitInstr(Exit), Kind(K) {}
@@ -790,11 +788,6 @@ InnerLoopVectorizer::getVectorValue(Value *V) {
   return WidenMap.get(V);
 }
 
-Constant*
-InnerLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
-  return ConstantVector::getSplat(VF, ConstantInt::get(ScalarTy, Val, true));
-}
-
 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
   assert(Vec->getType()->isVectorTy() && "Invalid type");
   SmallVector<Constant*, 8> ShuffleMask;
@@ -1215,20 +1208,26 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
 
 /// This function returns the identity element (or neutral element) for
 /// the operation K.
-static unsigned
-getReductionIdentity(LoopVectorizationLegality::ReductionKind K) {
+static Constant*
+getReductionIdentity(LoopVectorizationLegality::ReductionKind K, Type *Tp) {
   switch (K) {
-  case LoopVectorizationLegality::IntegerXor:
-  case LoopVectorizationLegality::IntegerAdd:
-  case LoopVectorizationLegality::IntegerOr:
+  case LoopVectorizationLegality:: RK_IntegerXor:
+  case LoopVectorizationLegality:: RK_IntegerAdd:
+  case LoopVectorizationLegality:: RK_IntegerOr:
     // Adding, Xoring, Oring zero to a number does not change it.
-    return 0;
-  case LoopVectorizationLegality::IntegerMult:
+    return ConstantInt::get(Tp, 0);
+  case LoopVectorizationLegality:: RK_IntegerMult:
     // Multiplying a number by 1 does not change it.
-    return 1;
-  case LoopVectorizationLegality::IntegerAnd:
+    return ConstantInt::get(Tp, 1);
+  case LoopVectorizationLegality:: RK_IntegerAnd:
     // AND-ing a number with an all-1 value does not change it.
-    return -1;
+    return ConstantInt::get(Tp, -1, true);
+  case LoopVectorizationLegality:: RK_FloatMult:
+    // Multiplying a number by 1 does not change it.
+    return ConstantFP::get(Tp, 1.0L);
+  case LoopVectorizationLegality:: RK_FloatAdd:
+    // Adding zero to a number does not change it.
+    return ConstantFP::get(Tp, 0.0L);
   default:
     llvm_unreachable("Unknown reduction kind");
   }
@@ -1329,8 +1328,8 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
 
     // Find the reduction identity variable. Zero for addition, or, xor,
     // one for multiplication, -1 for And.
-    Constant *Identity = getUniformVector(getReductionIdentity(RdxDesc.Kind),
-                                          VecTy->getScalarType());
+    Constant *Iden = getReductionIdentity(RdxDesc.Kind, VecTy->getScalarType());
+    Constant *Identity = ConstantVector::getSplat(VF, Iden);
 
     // This vector is the Identity vector where the first element is the
     // incoming scalar reduction.
@@ -1378,26 +1377,34 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     Value *ReducedPartRdx = RdxParts[0];
     for (unsigned part = 1; part < UF; ++part) {
       switch (RdxDesc.Kind) {
-      case LoopVectorizationLegality::IntegerAdd:
+      case LoopVectorizationLegality::RK_IntegerAdd:
         ReducedPartRdx = 
           Builder.CreateAdd(RdxParts[part], ReducedPartRdx, "add.rdx");
         break;
-      case LoopVectorizationLegality::IntegerMult:
+      case LoopVectorizationLegality::RK_IntegerMult:
         ReducedPartRdx =
           Builder.CreateMul(RdxParts[part], ReducedPartRdx, "mul.rdx");
         break;
-      case LoopVectorizationLegality::IntegerOr:
+      case LoopVectorizationLegality::RK_IntegerOr:
         ReducedPartRdx =
           Builder.CreateOr(RdxParts[part], ReducedPartRdx, "or.rdx");
         break;
-      case LoopVectorizationLegality::IntegerAnd:
+      case LoopVectorizationLegality::RK_IntegerAnd:
         ReducedPartRdx =
           Builder.CreateAnd(RdxParts[part], ReducedPartRdx, "and.rdx");
         break;
-      case LoopVectorizationLegality::IntegerXor:
+      case LoopVectorizationLegality::RK_IntegerXor:
         ReducedPartRdx =
           Builder.CreateXor(RdxParts[part], ReducedPartRdx, "xor.rdx");
         break;
+      case LoopVectorizationLegality::RK_FloatMult:
+        ReducedPartRdx =
+          Builder.CreateFMul(RdxParts[part], ReducedPartRdx, "fmul.rdx");
+        break;
+      case LoopVectorizationLegality::RK_FloatAdd:
+        ReducedPartRdx =
+          Builder.CreateFAdd(RdxParts[part], ReducedPartRdx, "fadd.rdx");
+        break;
       default:
         llvm_unreachable("Unknown reduction operation");
       }
@@ -1428,21 +1435,27 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
 
       // Emit the operation on the shuffled value.
       switch (RdxDesc.Kind) {
-      case LoopVectorizationLegality::IntegerAdd:
+      case LoopVectorizationLegality::RK_IntegerAdd:
         TmpVec = Builder.CreateAdd(TmpVec, Shuf, "add.rdx");
         break;
-      case LoopVectorizationLegality::IntegerMult:
+      case LoopVectorizationLegality::RK_IntegerMult:
         TmpVec = Builder.CreateMul(TmpVec, Shuf, "mul.rdx");
         break;
-      case LoopVectorizationLegality::IntegerOr:
+      case LoopVectorizationLegality::RK_IntegerOr:
         TmpVec = Builder.CreateOr(TmpVec, Shuf, "or.rdx");
         break;
-      case LoopVectorizationLegality::IntegerAnd:
+      case LoopVectorizationLegality::RK_IntegerAnd:
         TmpVec = Builder.CreateAnd(TmpVec, Shuf, "and.rdx");
         break;
-      case LoopVectorizationLegality::IntegerXor:
+      case LoopVectorizationLegality::RK_IntegerXor:
         TmpVec = Builder.CreateXor(TmpVec, Shuf, "xor.rdx");
         break;
+      case LoopVectorizationLegality::RK_FloatMult:
+        TmpVec = Builder.CreateFMul(TmpVec, Shuf, "fmul.rdx");
+        break;
+      case LoopVectorizationLegality::RK_FloatAdd:
+        TmpVec = Builder.CreateFAdd(TmpVec, Shuf, "fadd.rdx");
+        break;
       default:
         llvm_unreachable("Unknown reduction operation");
       }
@@ -2074,6 +2087,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
 
         // Check that this PHI type is allowed.
         if (!Phi->getType()->isIntegerTy() &&
+            !Phi->getType()->isFloatingPointTy() &&
             !Phi->getType()->isPointerTy()) {
           DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
           return false;
@@ -2105,26 +2119,34 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
           continue;
         }
 
-        if (AddReductionVar(Phi, IntegerAdd)) {
+        if (AddReductionVar(Phi, RK_IntegerAdd)) {
           DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
           continue;
         }
-        if (AddReductionVar(Phi, IntegerMult)) {
+        if (AddReductionVar(Phi, RK_IntegerMult)) {
           DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
           continue;
         }
-        if (AddReductionVar(Phi, IntegerOr)) {
+        if (AddReductionVar(Phi, RK_IntegerOr)) {
           DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
           continue;
         }
-        if (AddReductionVar(Phi, IntegerAnd)) {
+        if (AddReductionVar(Phi, RK_IntegerAnd)) {
           DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
           continue;
         }
-        if (AddReductionVar(Phi, IntegerXor)) {
+        if (AddReductionVar(Phi, RK_IntegerXor)) {
           DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
           continue;
         }
+        if (AddReductionVar(Phi, RK_FloatMult)) {
+          DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n");
+          continue;
+        }
+        if (AddReductionVar(Phi, RK_FloatAdd)) {
+          DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n");
+          continue;
+        }
 
         DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
         return false;
@@ -2419,6 +2441,8 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
   // This includes users of the reduction, variables (which form a cycle
   // which ends in the phi node).
   Instruction *ExitInstruction = 0;
+  // Indicates that we found a binary operation in our scan.
+  bool FoundBinOp = false;
 
   // Iter is our iterator. We start with the PHI node and scan for all of the
   // users of this instruction. All users must be instructions that can be
@@ -2436,6 +2460,9 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
     // Did we reach the initial PHI node already ?
     bool FoundStartPHI = false;
 
+    // Remember whether the scan has visited any non-PHI instruction so far.
+    FoundBinOp |= !isa<PHINode>(Iter);
+
     // For each of the *users* of iter.
     for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
          it != e; ++it) {
@@ -2475,7 +2502,7 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
 
       // Reductions of instructions such as Div, and Sub is only
       // possible if the LHS is the reduction variable.
-      if (!U->isCommutative() && U->getOperand(0) != Iter)
+      if (!U->isCommutative() && !isa<PHINode>(U) && U->getOperand(0) != Iter)
         return false;
 
       Iter = U;
@@ -2484,46 +2511,52 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
     // We found a reduction var if we have reached the original
     // phi node and we only have a single instruction with out-of-loop
     // users.
-    if (FoundStartPHI && ExitInstruction) {
+    if (FoundStartPHI) {
       // This instruction is allowed to have out-of-loop users.
       AllowedExit.insert(ExitInstruction);
 
       // Save the description of this reduction variable.
       ReductionDescriptor RD(RdxStart, ExitInstruction, Kind);
       Reductions[Phi] = RD;
-      return true;
+      // We've ended the cycle. This is a reduction variable if we have an
+      // outside user and it has a binary op.
+      return FoundBinOp && ExitInstruction;
     }
-
-    // If we've reached the start PHI but did not find an outside user then
-    // this is dead code. Abort.
-    if (FoundStartPHI)
-      return false;
   }
 }
 
 bool
 LoopVectorizationLegality::isReductionInstr(Instruction *I,
                                             ReductionKind Kind) {
+  bool FP = I->getType()->isFloatingPointTy();
+  bool FastMath = (FP && I->isCommutative() && I->isAssociative());
+
   switch (I->getOpcode()) {
   default:
     return false;
   case Instruction::PHI:
+      if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd))
+        return false;
     // possibly.
     return true;
   case Instruction::Sub:
   case Instruction::Add:
-    return Kind == IntegerAdd;
+    return Kind == RK_IntegerAdd;
   case Instruction::SDiv:
   case Instruction::UDiv:
   case Instruction::Mul:
-    return Kind == IntegerMult;
+    return Kind == RK_IntegerMult;
   case Instruction::And:
-    return Kind == IntegerAnd;
+    return Kind == RK_IntegerAnd;
   case Instruction::Or:
-    return Kind == IntegerOr;
+    return Kind == RK_IntegerOr;
   case Instruction::Xor:
-    return Kind == IntegerXor;
-  }
+    return Kind == RK_IntegerXor;
+  case Instruction::FMul:
+    return Kind == RK_FloatMult && FastMath;
+  case Instruction::FAdd:
+    return Kind == RK_FloatAdd && FastMath;
+   }
 }
 
 LoopVectorizationLegality::InductionKind
-- 
cgit v1.1


From aae3d6fb53e1c472d9e25641a15a68c3c72c7129 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Tue, 8 Jan 2013 17:23:17 +0000
Subject: Rename the enum members to match the LLVM coding style.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171868 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 50 +++++++++++++++---------------
 1 file changed, 25 insertions(+), 25 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index cb6609f..f37a0d8 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -333,10 +333,10 @@ public:
 
   /// This enum represents the kinds of inductions that we support.
   enum InductionKind {
-    NoInduction,         ///< Not an induction variable.
-    IntInduction,        ///< Integer induction variable. Step = 1.
-    ReverseIntInduction, ///< Reverse int induction variable. Step = -1.
-    PtrInduction         ///< Pointer induction variable. Step = sizeof(elem).
+    IK_NoInduction,         ///< Not an induction variable.
+    IK_IntInduction,        ///< Integer induction variable. Step = 1.
+    IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1.
+    IK_PtrInduction         ///< Pointer induction variable. Step = sizeof(elem).
   };
 
   /// This POD struct holds information about reduction variables.
@@ -385,7 +385,7 @@ public:
   /// A POD for saving information about induction variables.
   struct InductionInfo {
     InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {}
-    InductionInfo() : StartValue(0), IK(NoInduction) {}
+    InductionInfo() : StartValue(0), IK(IK_NoInduction) {}
     /// Start value.
     Value *StartValue;
     /// Induction kind.
@@ -735,7 +735,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
   PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
   if (Phi && Inductions.count(Phi)) {
     InductionInfo II = Inductions[Phi];
-    if (PtrInduction == II.IK)
+    if (IK_PtrInduction == II.IK)
       return 1;
   }
 
@@ -1089,9 +1089,9 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
                                          MiddleBlock->getTerminator());
     Value *EndValue = 0;
     switch (II.IK) {
-    case LoopVectorizationLegality::NoInduction:
+    case LoopVectorizationLegality::IK_NoInduction:
       llvm_unreachable("Unknown induction");
-    case LoopVectorizationLegality::IntInduction: {
+    case LoopVectorizationLegality::IK_IntInduction: {
       // Handle the integer induction counter:
       assert(OrigPhi->getType()->isIntegerTy() && "Invalid type");
       assert(OrigPhi == OldInduction && "Unknown integer PHI");
@@ -1101,7 +1101,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
       ResumeIndex = ResumeVal;
       break;
     }
-    case LoopVectorizationLegality::ReverseIntInduction: {
+    case LoopVectorizationLegality::IK_ReverseIntInduction: {
       // Convert the CountRoundDown variable to the PHI size.
       unsigned CRDSize = CountRoundDown->getType()->getScalarSizeInBits();
       unsigned IISize = II.StartValue->getType()->getScalarSizeInBits();
@@ -1119,7 +1119,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
                                            BypassBlock->getTerminator());
       break;
     }
-    case LoopVectorizationLegality::PtrInduction: {
+    case LoopVectorizationLegality::IK_PtrInduction: {
       // For pointer induction variables, calculate the offset using
       // the end index.
       EndValue = GetElementPtrInst::Create(II.StartValue, CountRoundDown,
@@ -1618,9 +1618,9 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
         Legal->getInductionVars()->lookup(P);
 
       switch (II.IK) {
-      case LoopVectorizationLegality::NoInduction:
+      case LoopVectorizationLegality::IK_NoInduction:
         llvm_unreachable("Unknown induction");
-      case LoopVectorizationLegality::IntInduction: {
+      case LoopVectorizationLegality::IK_IntInduction: {
         assert(P == OldInduction && "Unexpected PHI");
         Value *Broadcasted = getBroadcastInstrs(Induction);
         // After broadcasting the induction variable we need to make the
@@ -1629,8 +1629,8 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
           Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false);
         continue;
       }
-      case LoopVectorizationLegality::ReverseIntInduction:
-      case LoopVectorizationLegality::PtrInduction:
+      case LoopVectorizationLegality::IK_ReverseIntInduction:
+      case LoopVectorizationLegality::IK_PtrInduction:
         // Handle reverse integer and pointer inductions.
         Value *StartIdx = 0;
         // If we have a single integer induction variable then use it.
@@ -1647,7 +1647,7 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
                                                  "normalized.idx");
 
         // Handle the reverse integer induction variable case.
-        if (LoopVectorizationLegality::ReverseIntInduction == II.IK) {
+        if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) {
           IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType());
           Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy,
                                                  "resize.norm.idx");
@@ -2104,9 +2104,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         // Check if this is an induction variable.
         InductionKind IK = isInductionVariable(Phi);
 
-        if (NoInduction != IK) {
+        if (IK_NoInduction != IK) {
           // Int inductions are special because we only allow one IV.
-          if (IK == IntInduction) {
+          if (IK == IK_IntInduction) {
             if (Induction) {
               DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
               return false;
@@ -2564,37 +2564,37 @@ LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
   Type *PhiTy = Phi->getType();
   // We only handle integer and pointer inductions variables.
   if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
-    return NoInduction;
+    return IK_NoInduction;
 
   // Check that the PHI is consecutive and starts at zero.
   const SCEV *PhiScev = SE->getSCEV(Phi);
   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
   if (!AR) {
     DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
-    return NoInduction;
+    return IK_NoInduction;
   }
   const SCEV *Step = AR->getStepRecurrence(*SE);
 
   // Integer inductions need to have a stride of one.
   if (PhiTy->isIntegerTy()) {
     if (Step->isOne())
-      return IntInduction;
+      return IK_IntInduction;
     if (Step->isAllOnesValue())
-      return ReverseIntInduction;
-    return NoInduction;
+      return IK_ReverseIntInduction;
+    return IK_NoInduction;
   }
 
   // Calculate the pointer stride and check if it is consecutive.
   const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
   if (!C)
-    return NoInduction;
+    return IK_NoInduction;
 
   assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
   uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType());
   if (C->getValue()->equalsInt(Size))
-    return PtrInduction;
+    return IK_PtrInduction;
 
-  return NoInduction;
+  return IK_NoInduction;
 }
 
 bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
-- 
cgit v1.1


From 8327474e4b2c69d5eca4d62a01b89508705d0b40 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Tue, 8 Jan 2013 17:37:45 +0000
Subject: Code cleanup: refactor the switch statements in the generation of
 reduction variables into an IR builder call.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171871 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 87 ++++++++++--------------------
 1 file changed, 28 insertions(+), 59 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index f37a0d8..9c82cb8 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1263,6 +1263,29 @@ isTriviallyVectorizableIntrinsic(Instruction *Inst) {
   return false;
 }
 
+/// Returns the binary opcode that implements the reduction kind \p Kind.
+static Instruction::BinaryOps
+getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) {
+  switch (Kind) {
+    case LoopVectorizationLegality::RK_IntegerAdd:
+      return Instruction::Add;
+    case LoopVectorizationLegality::RK_IntegerMult:
+      return Instruction::Mul;
+    case LoopVectorizationLegality::RK_IntegerOr:
+      return Instruction::Or;
+    case LoopVectorizationLegality::RK_IntegerAnd:
+      return Instruction::And;
+    case LoopVectorizationLegality::RK_IntegerXor:
+      return Instruction::Xor;
+    case LoopVectorizationLegality::RK_FloatMult:
+      return Instruction::FMul;
+    case LoopVectorizationLegality::RK_FloatAdd:
+      return Instruction::FAdd;
+    default: // Kinds without a case above are treated as unreachable.
+      llvm_unreachable("Unknown reduction operation");
+  }
+}
+
 void
 InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
   //===------------------------------------------------===//
@@ -1376,40 +1399,10 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     // Reduce all of the unrolled parts into a single vector.
     Value *ReducedPartRdx = RdxParts[0];
     for (unsigned part = 1; part < UF; ++part) {
-      switch (RdxDesc.Kind) {
-      case LoopVectorizationLegality::RK_IntegerAdd:
-        ReducedPartRdx = 
-          Builder.CreateAdd(RdxParts[part], ReducedPartRdx, "add.rdx");
-        break;
-      case LoopVectorizationLegality::RK_IntegerMult:
-        ReducedPartRdx =
-          Builder.CreateMul(RdxParts[part], ReducedPartRdx, "mul.rdx");
-        break;
-      case LoopVectorizationLegality::RK_IntegerOr:
-        ReducedPartRdx =
-          Builder.CreateOr(RdxParts[part], ReducedPartRdx, "or.rdx");
-        break;
-      case LoopVectorizationLegality::RK_IntegerAnd:
-        ReducedPartRdx =
-          Builder.CreateAnd(RdxParts[part], ReducedPartRdx, "and.rdx");
-        break;
-      case LoopVectorizationLegality::RK_IntegerXor:
-        ReducedPartRdx =
-          Builder.CreateXor(RdxParts[part], ReducedPartRdx, "xor.rdx");
-        break;
-      case LoopVectorizationLegality::RK_FloatMult:
-        ReducedPartRdx =
-          Builder.CreateFMul(RdxParts[part], ReducedPartRdx, "fmul.rdx");
-        break;
-      case LoopVectorizationLegality::RK_FloatAdd:
-        ReducedPartRdx =
-          Builder.CreateFAdd(RdxParts[part], ReducedPartRdx, "fadd.rdx");
-        break;
-      default:
-        llvm_unreachable("Unknown reduction operation");
-      }
+      Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind);
+      ReducedPartRdx = Builder.CreateBinOp(Op, RdxParts[part], ReducedPartRdx,
+                                           "bin.rdx");
     }
-    
 
     // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
     // and vector ops, reducing the set of values being computed by half each
@@ -1433,32 +1426,8 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
                                     ConstantVector::get(ShuffleMask),
                                     "rdx.shuf");
 
-      // Emit the operation on the shuffled value.
-      switch (RdxDesc.Kind) {
-      case LoopVectorizationLegality::RK_IntegerAdd:
-        TmpVec = Builder.CreateAdd(TmpVec, Shuf, "add.rdx");
-        break;
-      case LoopVectorizationLegality::RK_IntegerMult:
-        TmpVec = Builder.CreateMul(TmpVec, Shuf, "mul.rdx");
-        break;
-      case LoopVectorizationLegality::RK_IntegerOr:
-        TmpVec = Builder.CreateOr(TmpVec, Shuf, "or.rdx");
-        break;
-      case LoopVectorizationLegality::RK_IntegerAnd:
-        TmpVec = Builder.CreateAnd(TmpVec, Shuf, "and.rdx");
-        break;
-      case LoopVectorizationLegality::RK_IntegerXor:
-        TmpVec = Builder.CreateXor(TmpVec, Shuf, "xor.rdx");
-        break;
-      case LoopVectorizationLegality::RK_FloatMult:
-        TmpVec = Builder.CreateFMul(TmpVec, Shuf, "fmul.rdx");
-        break;
-      case LoopVectorizationLegality::RK_FloatAdd:
-        TmpVec = Builder.CreateFAdd(TmpVec, Shuf, "fadd.rdx");
-        break;
-      default:
-        llvm_unreachable("Unknown reduction operation");
-      }
+      Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind);
+      TmpVec = Builder.CreateBinOp(Op, TmpVec, Shuf, "bin.rdx");
     }
 
     // The result is in the first element of the vector.
-- 
cgit v1.1