1 files changed, 308 insertions, 65 deletions
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 34d8a10..cb8a41d 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -209,6 +209,29 @@ namespace {
 class LoopVectorizationLegality;
 class LoopVectorizationCostModel;
 
+/// Optimization analysis message produced during vectorization. Messages inform
+/// the user why vectorization did not occur.
+class Report {
+  std::string Message;
+  raw_string_ostream Out;
+  Instruction *Instr;
+
+public:
+  Report(Instruction *I = nullptr) : Out(Message), Instr(I) {
+    Out << "loop not vectorized: ";
+  }
+
+  template <typename A> Report &operator<<(const A &Value) {
+    Out << Value;
+    return *this;
+  }
+
+  Instruction *getInstr() { return Instr; }
+
+  std::string &str() { return Out.str(); }
+  operator Twine() { return Out.str(); }
+};
+
 /// InnerLoopVectorizer vectorizes loops which contain only one basic
 /// block to a specified vectorization factor (VF).
 /// This class performs the widening of scalars into vectors, or multiple
@@ -515,10 +538,12 @@ public:
   unsigned NumPredStores;
 
   LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL,
-                            DominatorTree *DT, TargetLibraryInfo *TLI)
+                            DominatorTree *DT, TargetLibraryInfo *TLI,
+                            Function *F)
       : NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL),
-        DT(DT), TLI(TLI), Induction(nullptr), WidestIndTy(nullptr),
-        HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) {}
+        DT(DT), TLI(TLI), TheFunction(F), Induction(nullptr),
+        WidestIndTy(nullptr), HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) {
+  }
 
   /// This enum represents the kinds of reductions that we support.
   enum ReductionKind {
@@ -747,6 +772,16 @@ private:
   /// invariant.
   void collectStridedAcccess(Value *LoadOrStoreInst);
 
+  /// Report an analysis message to assist the user in diagnosing loops that are
+  /// not vectorized.
+  void emitAnalysis(Report &Message) {
+    DebugLoc DL = TheLoop->getStartLoc();
+    if (Instruction *I = Message.getInstr())
+      DL = I->getDebugLoc();
+    emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE,
+                                   *TheFunction, DL, Message.str());
+  }
+
   /// The loop that we evaluate.
   Loop *TheLoop;
   /// Scev analysis.
@@ -757,6 +792,8 @@ private:
   DominatorTree *DT;
   /// Target Library Info.
   TargetLibraryInfo *TLI;
+  /// Parent function
+  Function *TheFunction;
 
   //  ---  vectorization state --- //
 
@@ -906,7 +943,7 @@ public:
   }
 
   /// Return the loop vectorizer metadata prefix.
-  static StringRef Prefix() { return "llvm.vectorizer."; }
+  static StringRef Prefix() { return "llvm.loop.vectorize."; }
 
   MDNode *createHint(LLVMContext &Context, StringRef Name, unsigned V) const {
     SmallVector<Value*, 2> Vals;
@@ -942,6 +979,29 @@ public:
     LoopID = NewLoopID;
   }
 
+  std::string emitRemark() const {
+    Report R;
+    R << "vectorization ";
+    switch (Force) {
+    case LoopVectorizeHints::FK_Disabled:
+      R << "is explicitly disabled";
+      break;
+    case LoopVectorizeHints::FK_Enabled:
+      R << "is explicitly enabled";
+      if (Width != 0 && Unroll != 0)
+        R << " with width " << Width << " and interleave count " << Unroll;
+      else if (Width != 0)
+        R << " with width " << Width;
+      else if (Unroll != 0)
+        R << " with interleave count " << Unroll;
+      break;
+    case LoopVectorizeHints::FK_Undefined:
+      R << "was not specified";
+      break;
+    }
+    return R.str();
+  }
+
   unsigned getWidth() const { return Width; }
   unsigned getUnroll() const { return Unroll; }
   enum ForceKind getForce() const { return Force; }
@@ -1125,18 +1185,37 @@ struct LoopVectorize : public FunctionPass {
                                 : "?")) << " width=" << Hints.getWidth()
                  << " unroll=" << Hints.getUnroll() << "\n");
 
+    // Function containing loop
+    Function *F = L->getHeader()->getParent();
+
+    // Looking at the diagnostic output is the only way to determine if a loop
+    // was vectorized (other than looking at the IR or machine code), so it
+    // is important to generate an optimization remark for each loop. Most of
+    // these messages are generated by emitOptimizationRemarkAnalysis. Remarks
+    // generated by emitOptimizationRemark and emitOptimizationRemarkMissed are
+    // less verbose reporting vectorized loops and unvectorized loops that may
+    // benefit from vectorization, respectively.
+
     if (Hints.getForce() == LoopVectorizeHints::FK_Disabled) {
       DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
+      emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F,
+                                     L->getStartLoc(), Hints.emitRemark());
       return false;
     }
 
     if (!AlwaysVectorize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) {
       DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
+      emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F,
+                                     L->getStartLoc(), Hints.emitRemark());
       return false;
     }
 
     if (Hints.getWidth() == 1 && Hints.getUnroll() == 1) {
       DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
+      emitOptimizationRemarkAnalysis(
+          F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
+          "loop not vectorized: vector width and interleave count are "
+          "explicitly set to 1");
       return false;
     }
 
@@ -1151,14 +1230,19 @@ struct LoopVectorize : public FunctionPass {
         DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
       else {
         DEBUG(dbgs() << "\n");
+        emitOptimizationRemarkAnalysis(
+            F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
+            "vectorization is not beneficial and is not explicitly forced");
         return false;
       }
     }
 
     // Check if it is legal to vectorize the loop.
-    LoopVectorizationLegality LVL(L, SE, DL, DT, TLI);
+    LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, F);
     if (!LVL.canVectorize()) {
       DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
+      emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE, *F,
+                                   L->getStartLoc(), Hints.emitRemark());
       return false;
     }
 
@@ -1167,7 +1251,6 @@ struct LoopVectorize : public FunctionPass {
 
     // Check the function attributes to find out if this function should be
     // optimized for size.
-    Function *F = L->getHeader()->getParent();
     bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
                       F->hasFnAttribute(Attribute::OptimizeForSize);
 
@@ -1190,6 +1273,11 @@ struct LoopVectorize : public FunctionPass {
     if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
       DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
             "attribute is used.\n");
+      emitOptimizationRemarkAnalysis(
+          F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
+          "loop not vectorized due to NoImplicitFloat attribute");
+      emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE, *F,
+                                   L->getStartLoc(), Hints.emitRemark());
       return false;
     }
 
@@ -1208,9 +1296,14 @@ struct LoopVectorize : public FunctionPass {
     DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n');
 
     if (VF.Width == 1) {
-      DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
-      if (UF == 1)
+      DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial\n");
+
+      if (UF == 1) {
+        emitOptimizationRemarkAnalysis(
+            F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
+            "not beneficial to vectorize and user disabled interleaving");
         return false;
+      }
       DEBUG(dbgs() << "LV: Trying to at least unroll the loops.\n");
 
       // Report the unrolling decision.
@@ -1220,6 +1313,7 @@ struct LoopVectorize : public FunctionPass {
                                    " (vectorization not beneficial)"));
 
       // We decided not to vectorize, but we may want to unroll.
+
       InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF);
       Unroller.vectorize(&LVL);
     } else {
@@ -1909,20 +2003,23 @@ void InnerLoopVectorizer::createEmptyLoop() {
    the vectorized instructions while the old loop will continue to run the
    scalar remainder.
 
-       [ ] <-- vector loop bypass (may consist of multiple blocks).
-     /  |
-    /   v
-   |   [ ]     <-- vector pre header.
-   |    |
-   |    v
-   |   [  ] \
-   |   [  ]_|   <-- vector loop.
-   |    |
-    \   v
-      >[ ]   <--- middle-block.
-     /  |
-    /   v
-   |   [ ]     <--- new preheader.
+       [ ] <-- Back-edge taken count overflow check.
+    /   |
+   /    v
+  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
+  |  /  |
+  | /   v
+  ||   [ ]     <-- vector pre header.
+  ||    |
+  ||    v
+  ||   [  ] \
+  ||   [  ]_|   <-- vector loop.
+  ||    |
+  | \   v
+  |   >[ ]   <--- middle-block.
+  |  /  |
+  | /   v
+  -|- >[ ]     <--- new preheader.
    |    |
    |    v
    |   [ ] \
@@ -1936,6 +2033,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
   BasicBlock *OldBasicBlock = OrigLoop->getHeader();
   BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
+  assert(BypassBlock && "Invalid loop structure");
   assert(ExitBlock && "Must have an exit block");
 
   // Some loops have a single integer induction variable, while other loops
@@ -1958,18 +2056,30 @@ void InnerLoopVectorizer::createEmptyLoop() {
       IdxTy->getPrimitiveSizeInBits())
     ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy);
 
-  ExitCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy);
+  const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy);
   // Get the total trip count from the count by adding 1.
-  ExitCount = SE->getAddExpr(ExitCount,
-                             SE->getConstant(ExitCount->getType(), 1));
+  ExitCount = SE->getAddExpr(BackedgeTakeCount,
+                             SE->getConstant(BackedgeTakeCount->getType(), 1));
 
   // Expand the trip count and place the new instructions in the preheader.
   // Notice that the pre-header does not change, only the loop body.
   SCEVExpander Exp(*SE, "induction");
 
-  // Count holds the overall loop count (N).
-  Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
-                                   BypassBlock->getTerminator());
+  // We need to test whether the backedge-taken count is uint##_max. Adding one
+  // to it will cause overflow and an incorrect loop trip count in the vector
+  // body. In case of overflow we want to directly jump to the scalar remainder
+  // loop.
+  Value *BackedgeCount =
+      Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(),
+                        BypassBlock->getTerminator());
+  if (BackedgeCount->getType()->isPointerTy())
+    BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy,
+                                                "backedge.ptrcnt.to.int",
+                                                BypassBlock->getTerminator());
+  Instruction *CheckBCOverflow =
+      CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount,
+                      Constant::getAllOnesValue(BackedgeCount->getType()),
+                      "backedge.overflow", BypassBlock->getTerminator());
 
   // The loop index does not have to start at Zero. Find the original start
   // value from the induction PHI node. If we don't have an induction variable
@@ -1980,7 +2090,18 @@ void InnerLoopVectorizer::createEmptyLoop() {
                        IdxTy):
     ConstantInt::get(IdxTy, 0);
 
-  assert(BypassBlock && "Invalid loop structure");
+  // We need an instruction to anchor the overflow check on. StartIdx needs to
+  // be defined before the overflow check branch. Because the scalar preheader
+  // is going to merge the start index and so the overflow branch block needs to
+  // contain a definition of the start index.
+  Instruction *OverflowCheckAnchor = BinaryOperator::CreateAdd(
+      StartIdx, ConstantInt::get(IdxTy, 0), "overflow.check.anchor",
+      BypassBlock->getTerminator());
+
+  // Count holds the overall loop count (N).
+  Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
+                                   BypassBlock->getTerminator());
+
   LoopBypassBlocks.push_back(BypassBlock);
 
   // Split the single block loop into the two loop structure described above.
@@ -2049,29 +2170,45 @@ void InnerLoopVectorizer::createEmptyLoop() {
 
   // Now, compare the new count to zero. If it is zero skip the vector loop and
   // jump to the scalar loop.
-  Value *Cmp = BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx,
-                                          "cmp.zero");
+  Value *Cmp =
+      BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero");
 
   BasicBlock *LastBypassBlock = BypassBlock;
 
+  // Generate code to check that the loops trip count that we computed by adding
+  // one to the backedge-taken count will not overflow.
+  {
+    auto PastOverflowCheck =
+        std::next(BasicBlock::iterator(OverflowCheckAnchor));
+    BasicBlock *CheckBlock =
+      LastBypassBlock->splitBasicBlock(PastOverflowCheck, "overflow.checked");
+    if (ParentLoop)
+      ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase());
+    LoopBypassBlocks.push_back(CheckBlock);
+    Instruction *OldTerm = LastBypassBlock->getTerminator();
+    BranchInst::Create(ScalarPH, CheckBlock, CheckBCOverflow, OldTerm);
+    OldTerm->eraseFromParent();
+    LastBypassBlock = CheckBlock;
+  }
+
   // Generate the code to check that the strides we assumed to be one are really
   // one. We want the new basic block to start at the first instruction in a
   // sequence of instructions that form a check.
   Instruction *StrideCheck;
   Instruction *FirstCheckInst;
   std::tie(FirstCheckInst, StrideCheck) =
-      addStrideCheck(BypassBlock->getTerminator());
+      addStrideCheck(LastBypassBlock->getTerminator());
   if (StrideCheck) {
     // Create a new block containing the stride check.
     BasicBlock *CheckBlock =
-        BypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck");
+        LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck");
     if (ParentLoop)
       ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase());
     LoopBypassBlocks.push_back(CheckBlock);
 
     // Replace the branch into the memory check block with a conditional branch
     // for the "few elements case".
-    Instruction *OldTerm = BypassBlock->getTerminator();
+    Instruction *OldTerm = LastBypassBlock->getTerminator();
     BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
     OldTerm->eraseFromParent();
 
@@ -2134,6 +2271,19 @@ void InnerLoopVectorizer::createEmptyLoop() {
       PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val",
                       MiddleBlock->getTerminator()) : nullptr;
 
+    // Create phi nodes to merge from the  backedge-taken check block.
+    PHINode *BCResumeVal = PHINode::Create(ResumeValTy, 3, "bc.resume.val",
+                                           ScalarPH->getTerminator());
+    BCResumeVal->addIncoming(ResumeVal, MiddleBlock);
+
+    PHINode *BCTruncResumeVal = nullptr;
+    if (OrigPhi == OldInduction) {
+      BCTruncResumeVal =
+          PHINode::Create(OrigPhi->getType(), 2, "bc.trunc.resume.val",
+                          ScalarPH->getTerminator());
+      BCTruncResumeVal->addIncoming(TruncResumeVal, MiddleBlock);
+    }
+
     Value *EndValue = nullptr;
     switch (II.IK) {
     case LoopVectorizationLegality::IK_NoInduction:
@@ -2150,10 +2300,12 @@ void InnerLoopVectorizer::createEmptyLoop() {
           BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType());
         // The new PHI merges the original incoming value, in case of a bypass,
         // or the value at the end of the vectorized loop.
-        for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+        for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
           TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
         TruncResumeVal->addIncoming(EndValue, VecBody);
 
+        BCTruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);
+
         // We know what the end value is.
         EndValue = IdxEndRoundDown;
         // We also know which PHI node holds it.
@@ -2199,7 +2351,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
 
     // The new PHI merges the original incoming value, in case of a bypass,
     // or the value at the end of the vectorized loop.
-    for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) {
+    for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) {
       if (OrigPhi == OldInduction)
         ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]);
       else
@@ -2209,11 +2361,16 @@ void InnerLoopVectorizer::createEmptyLoop() {
 
     // Fix the scalar body counter (PHI node).
     unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
-    // The old inductions phi node in the scalar body needs the truncated value.
-    if (OrigPhi == OldInduction)
-      OrigPhi->setIncomingValue(BlockIdx, TruncResumeVal);
-    else
-      OrigPhi->setIncomingValue(BlockIdx, ResumeVal);
+
+    // The old induction's phi node in the scalar body needs the truncated
+    // value.
+    if (OrigPhi == OldInduction) {
+      BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]);
+      OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal);
+    } else {
+      BCResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);
+      OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
+    }
   }
 
   // If we are generating a new induction variable then we also need to
@@ -2224,7 +2381,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
     assert(!ResumeIndex && "Unexpected resume value found");
     ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
                                   MiddleBlock->getTerminator());
-    for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+    for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
       ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]);
     ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
   }
@@ -2494,7 +2651,7 @@ void InnerLoopVectorizer::vectorizeLoop() {
     // To do so, we need to generate the 'identity' vector and override
     // one of the elements with the incoming scalar reduction. We need
     // to do it in the vector-loop preheader.
-    Builder.SetInsertPoint(LoopBypassBlocks.front()->getTerminator());
+    Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
 
     // This is the vector-clone of the value that leaves the loop.
     VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
@@ -2568,7 +2725,7 @@ void InnerLoopVectorizer::vectorizeLoop() {
       VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr);
       PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
       Value *StartVal = (part == 0) ? VectorStart : Identity;
-      for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+      for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
         NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
       NewPhi->addIncoming(RdxExitVal[part],
                           LoopVectorBody.back());
@@ -2626,6 +2783,13 @@ void InnerLoopVectorizer::vectorizeLoop() {
                                                     Builder.getInt32(0));
     }
 
+    // Create a phi node that merges control-flow from the backedge-taken check
+    // block and the middle block.
+    PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx",
+                                          LoopScalarPreHeader->getTerminator());
+    BCBlockPhi->addIncoming(RdxDesc.StartValue, LoopBypassBlocks[0]);
+    BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
+
     // Now, we need to fix the users of the reduction variable
     // inside and outside of the scalar remainder loop.
     // We know that the loop is in LCSSA form. We need to update the
@@ -2655,7 +2819,7 @@ void InnerLoopVectorizer::vectorizeLoop() {
     assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
     // Pick the other block.
     int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
-    (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, ReducedPartRdx);
+    (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
     (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
   }// end of for each redux variable.
 
@@ -3062,9 +3226,14 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
         scalarizeInstruction(it);
         break;
       default:
+        bool HasScalarOpd = hasVectorInstrinsicScalarOpd(ID, 1);
         for (unsigned Part = 0; Part < UF; ++Part) {
           SmallVector<Value *, 4> Args;
           for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
+            if (HasScalarOpd && i == 1) {
+              Args.push_back(CI->getArgOperand(i));
+              continue;
+            }
             VectorParts &Arg = getVectorValue(CI->getArgOperand(i));
             Args.push_back(Arg[Part]);
           }
@@ -3112,8 +3281,8 @@ void InnerLoopVectorizer::updateAnalysis() {
     }
   }
 
-  DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks.front());
-  DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
+  DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]);
+  DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
 
@@ -3138,8 +3307,10 @@ static bool canIfConvertPHINodes(BasicBlock *BB) {
 }
 
 bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
-  if (!EnableIfConversion)
+  if (!EnableIfConversion) {
+    emitAnalysis(Report() << "if-conversion is disabled");
     return false;
+  }
 
   assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
 
@@ -3169,16 +3340,24 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
     BasicBlock *BB = *BI;
 
     // We don't support switch statements inside loops.
-    if (!isa<BranchInst>(BB->getTerminator()))
+    if (!isa<BranchInst>(BB->getTerminator())) {
+      emitAnalysis(Report(BB->getTerminator())
+                   << "loop contains a switch statement");
       return false;
+    }
 
     // We must be able to predicate all blocks that need to be predicated.
     if (blockNeedsPredication(BB)) {
-      if (!blockCanBePredicated(BB, SafePointes))
+      if (!blockCanBePredicated(BB, SafePointes)) {
+        emitAnalysis(Report(BB->getTerminator())
+                     << "control flow cannot be substituted for a select");
         return false;
-    } else if (BB != Header && !canIfConvertPHINodes(BB))
+      }
+    } else if (BB != Header && !canIfConvertPHINodes(BB)) {
+      emitAnalysis(Report(BB->getTerminator())
+                   << "control flow cannot be substituted for a select");
       return false;
-
+    }
   }
 
   // We can if-convert this loop.
@@ -3188,20 +3367,31 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
 bool LoopVectorizationLegality::canVectorize() {
   // We must have a loop in canonical form. Loops with indirectbr in them cannot
   // be canonicalized.
-  if (!TheLoop->getLoopPreheader())
+  if (!TheLoop->getLoopPreheader()) {
+    emitAnalysis(
+        Report() << "loop control flow is not understood by vectorizer");
     return false;
+  }
 
   // We can only vectorize innermost loops.
-  if (TheLoop->getSubLoopsVector().size())
+  if (TheLoop->getSubLoopsVector().size()) {
+    emitAnalysis(Report() << "loop is not the innermost loop");
     return false;
+  }
 
   // We must have a single backedge.
-  if (TheLoop->getNumBackEdges() != 1)
+  if (TheLoop->getNumBackEdges() != 1) {
+    emitAnalysis(
+        Report() << "loop control flow is not understood by vectorizer");
     return false;
+  }
 
   // We must have a single exiting block.
-  if (!TheLoop->getExitingBlock())
+  if (!TheLoop->getExitingBlock()) {
+    emitAnalysis(
+        Report() << "loop control flow is not understood by vectorizer");
     return false;
+  }
 
   // We need to have a loop header.
   DEBUG(dbgs() << "LV: Found a loop: " <<
@@ -3217,6 +3407,7 @@ bool LoopVectorizationLegality::canVectorize() {
   // ScalarEvolution needs to be able to find the exit count.
   const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop);
   if (ExitCount == SE->getCouldNotCompute()) {
+    emitAnalysis(Report() << "could not determine number of loop iterations");
     DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
     return false;
   }
@@ -3310,6 +3501,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         if (!PhiTy->isIntegerTy() &&
             !PhiTy->isFloatingPointTy() &&
             !PhiTy->isPointerTy()) {
+          emitAnalysis(Report(it)
+                       << "loop control flow is not understood by vectorizer");
           DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
           return false;
         }
@@ -3320,13 +3513,17 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         if (*bb != Header) {
           // Check that this instruction has no outside users or is an
           // identified reduction value with an outside user.
-          if(!hasOutsideLoopUser(TheLoop, it, AllowedExit))
+          if (!hasOutsideLoopUser(TheLoop, it, AllowedExit))
             continue;
+          emitAnalysis(Report(it) << "value that could not be identified as "
+                                     "reduction is used outside the loop");
           return false;
         }
 
         // We only allow if-converted PHIs with more than two incoming values.
         if (Phi->getNumIncomingValues() != 2) {
+          emitAnalysis(Report(it)
+                       << "control flow not understood by vectorizer");
           DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
           return false;
         }
@@ -3357,8 +3554,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
 
           // Until we explicitly handle the case of an induction variable with
           // an outside loop user we have to give up vectorizing this loop.
-          if (hasOutsideLoopUser(TheLoop, it, AllowedExit))
+          if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {
+            emitAnalysis(Report(it) << "use of induction value outside of the "
+                                       "loop is not handled by vectorizer");
             return false;
+          }
 
           continue;
         }
@@ -3401,6 +3601,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
           continue;
         }
 
+        emitAnalysis(Report(it) << "unvectorizable operation");
         DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
         return false;
       }// end of PHI handling
@@ -3409,14 +3610,29 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
       // calls and we do handle certain intrinsic and libm functions.
       CallInst *CI = dyn_cast<CallInst>(it);
       if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) {
+        emitAnalysis(Report(it) << "call instruction cannot be vectorized");
         DEBUG(dbgs() << "LV: Found a call site.\n");
         return false;
       }
 
+      // Intrinsics such as powi,cttz and ctlz are legal to vectorize if the
+      // second argument is the same (i.e. loop invariant)
+      if (CI &&
+          hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) {
+        if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) {
+          emitAnalysis(Report(it)
+                       << "intrinsic instruction cannot be vectorized");
+          DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
+          return false;
+        }
+      }
+
       // Check that the instruction return type is vectorizable.
       // Also, we can't vectorize extractelement instructions.
       if ((!VectorType::isValidElementType(it->getType()) &&
            !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) {
+        emitAnalysis(Report(it)
+                     << "instruction return type cannot be vectorized");
         DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
         return false;
       }
@@ -3424,8 +3640,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
       // Check that the stored type is vectorizable.
       if (StoreInst *ST = dyn_cast<StoreInst>(it)) {
         Type *T = ST->getValueOperand()->getType();
-        if (!VectorType::isValidElementType(T))
+        if (!VectorType::isValidElementType(T)) {
+          emitAnalysis(Report(ST) << "store instruction cannot be vectorized");
           return false;
+        }
         if (EnableMemAccessVersioning)
           collectStridedAcccess(ST);
       }
@@ -3436,8 +3654,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
 
       // Reduction instructions are allowed to have exit users.
       // All other instructions must not have external users.
-      if (hasOutsideLoopUser(TheLoop, it, AllowedExit))
+      if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {
+        emitAnalysis(Report(it) << "value cannot be used outside the loop");
         return false;
+      }
 
     } // next instr.
 
@@ -3445,8 +3665,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
 
   if (!Induction) {
     DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
-    if (Inductions.empty())
+    if (Inductions.empty()) {
+      emitAnalysis(Report()
+                   << "loop induction variable could not be identified");
       return false;
+    }
   }
 
   return true;
@@ -4353,8 +4576,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
           continue;
 
         LoadInst *Ld = dyn_cast<LoadInst>(it);
-        if (!Ld) return false;
-        if (!Ld->isSimple() && !IsAnnotatedParallel) {
+        if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) {
+          emitAnalysis(Report(Ld)
+                       << "read with atomic ordering or volatile read");
           DEBUG(dbgs() << "LV: Found a non-simple load.\n");
           return false;
         }
@@ -4367,8 +4591,13 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
       // Save 'store' instructions. Abort if other instructions write to memory.
       if (it->mayWriteToMemory()) {
         StoreInst *St = dyn_cast<StoreInst>(it);
-        if (!St) return false;
+        if (!St) {
+          emitAnalysis(Report(it) << "instruction cannot be vectorized");
+          return false;
+        }
         if (!St->isSimple() && !IsAnnotatedParallel) {
+          emitAnalysis(Report(St)
+                       << "write with atomic ordering or volatile write");
           DEBUG(dbgs() << "LV: Found a non-simple store.\n");
           return false;
         }
@@ -4405,6 +4634,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     Value* Ptr = ST->getPointerOperand();
 
     if (isUniform(Ptr)) {
+      emitAnalysis(
+          Report(ST)
+          << "write to a loop invariant address could not be vectorized");
       DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
       return false;
     }
@@ -4483,6 +4715,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
   }
 
   if (NeedRTCheck && !CanDoRT) {
+    emitAnalysis(Report() << "cannot identify array bounds");
     DEBUG(dbgs() << "LV: We can't vectorize because we can't find " <<
           "the array bounds.\n");
     PtrRtCheck.reset();
@@ -4513,6 +4746,14 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
       // Check that we did not collect too many pointers or found an unsizeable
       // pointer.
       if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
+        if (!CanDoRT && NumComparisons > 0)
+          emitAnalysis(Report()
+                       << "cannot check memory dependencies at runtime");
+        else
+          emitAnalysis(Report()
+                       << NumComparisons << " exceeds limit of "
+                       << RuntimeMemoryCheckThreshold
+                       << " dependent memory operations checked at runtime");
         DEBUG(dbgs() << "LV: Can't vectorize with memory checks\n");
         PtrRtCheck.reset();
         return false;
@@ -4522,6 +4763,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     }
   }
 
+  if (!CanVecMem)
+    emitAnalysis(Report() << "unsafe dependent memory operations in loop");
+
   DEBUG(dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") <<
         " need a runtime memory check.\n");
 
@@ -5774,4 +6018,3 @@ Value *InnerLoopUnroller::getConsecutiveVector(Value* Val, int StartIdx,
   Constant *C = ConstantInt::get(ITy, StartIdx, Negate);
   return Builder.CreateAdd(Val, C, "induction");
 }
-