Diffstat (limited to 'lib/Transforms'): 36 files changed, 2547 insertions, 904 deletions
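The most involved change below is in GlobalOpt's OptimizeGlobalAliases: alias resolution is rewritten around a new LLVMUsed helper so that entries in llvm.used and llvm.compiler_used stay consistent when an alias is folded into its aliasee. A minimal before/after sketch of the transform described in the patch's own comments (hypothetical IR for illustration, not taken from the patch's tests):

    define internal void @f() {
      ret void
    }
    @a = alias void ()* @f        ; externally visible alias of an internal function

    ; after the pass, @f takes over @a's name, linkage and attributes,
    ; and the alias itself is deleted:
    define void @a() {
      ret void
    }

If @a appeared in @llvm.used, the new usedErase/usedInsert bookkeeping swaps the entry over to the renamed target instead of leaving a reference to the deleted alias.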
diff --git a/lib/Transforms/IPO/ConstantMerge.cpp b/lib/Transforms/IPO/ConstantMerge.cpp index b63495b..a7bf188 100644 --- a/lib/Transforms/IPO/ConstantMerge.cpp +++ b/lib/Transforms/IPO/ConstantMerge.cpp @@ -27,6 +27,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" #include "llvm/Pass.h" using namespace llvm; @@ -68,10 +69,11 @@ static void FindUsedValues(GlobalVariable *LLVMUsed, if (LLVMUsed == 0) return; ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer()); - for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i) - if (GlobalValue *GV = - dyn_cast<GlobalValue>(Inits->getOperand(i)->stripPointerCasts())) - UsedValues.insert(GV); + for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i) { + Value *Operand = Inits->getOperand(i)->stripPointerCastsNoFollowAliases(); + GlobalValue *GV = cast<GlobalValue>(Operand); + UsedValues.insert(GV); + } } // True if A is better than B. diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp index 49ef1e7..3fdb5f0 100644 --- a/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -343,8 +343,9 @@ bool DAE::RemoveDeadArgumentsFromCallers(Function &Fn) if (Fn.isDeclaration() || Fn.mayBeOverridden()) return false; - // Functions with local linkage should already have been handled. - if (Fn.hasLocalLinkage()) + // Functions with local linkage should already have been handled, except the + // fragile (variadic) ones which we can improve here. + if (Fn.hasLocalLinkage() && !Fn.getFunctionType()->isVarArg()) return false; if (Fn.use_empty()) @@ -604,9 +605,20 @@ void DAE::SurveyFunction(const Function &F) { UseVector MaybeLiveArgUses; for (Function::const_arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E; ++AI, ++i) { - // See what the effect of this use is (recording any uses that cause - // MaybeLive in MaybeLiveArgUses). - Liveness Result = SurveyUses(AI, MaybeLiveArgUses); + Liveness Result; + if (F.getFunctionType()->isVarArg()) { + // Variadic functions will already have a va_arg function expanded inside + // them, making them potentially very sensitive to ABI changes resulting + // from removing arguments entirely, so don't. For example AArch64 handles + // register and stack HFAs very differently, and this is reflected in the + // IR which has already been generated. + Result = Live; + } else { + // See what the effect of this use is (recording any uses that cause + // MaybeLive in MaybeLiveArgUses). + Result = SurveyUses(AI, MaybeLiveArgUses); + } + // Mark the result. MarkValue(CreateArg(&F, i), Result, MaybeLiveArgUses); // Clear the vector again for the next iteration. diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index b035a82..a4de71b 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -3041,8 +3041,168 @@ bool GlobalOpt::OptimizeGlobalCtorsList(GlobalVariable *&GCL) { return true; } +/// \brief Given "llvm.used" or "llvm.compiler_used" as a global name, collect +/// the initializer elements of that global in Set and return the global itself. 
+static GlobalVariable * +collectUsedGlobalVariables(const Module &M, const char *Name, + SmallPtrSet<GlobalValue *, 8> &Set) { + GlobalVariable *GV = M.getGlobalVariable(Name); + if (!GV || !GV->hasInitializer()) + return GV; + + const ConstantArray *Init = cast<ConstantArray>(GV->getInitializer()); + for (unsigned I = 0, E = Init->getNumOperands(); I != E; ++I) { + Value *Op = Init->getOperand(I); + GlobalValue *G = cast<GlobalValue>(Op->stripPointerCastsNoFollowAliases()); + Set.insert(G); + } + return GV; +} + +static int compareNames(const void *A, const void *B) { + const GlobalValue *VA = *reinterpret_cast<GlobalValue* const*>(A); + const GlobalValue *VB = *reinterpret_cast<GlobalValue* const*>(B); + if (VA->getName() < VB->getName()) + return -1; + if (VB->getName() < VA->getName()) + return 1; + return 0; +} + +static void setUsedInitializer(GlobalVariable &V, + SmallPtrSet<GlobalValue *, 8> Init) { + SmallVector<llvm::Constant *, 8> UsedArray; + PointerType *Int8PtrTy = Type::getInt8PtrTy(V.getContext()); + + for (SmallPtrSet<GlobalValue *, 8>::iterator I = Init.begin(), E = Init.end(); + I != E; ++I) { + Constant *Cast = llvm::ConstantExpr::getBitCast(*I, Int8PtrTy); + UsedArray.push_back(Cast); + } + // Sort to get deterministic order. + array_pod_sort(UsedArray.begin(), UsedArray.end(), compareNames); + ArrayType *ATy = ArrayType::get(Int8PtrTy, UsedArray.size()); + + Module *M = V.getParent(); + V.removeFromParent(); + GlobalVariable *NV = + new GlobalVariable(*M, ATy, false, llvm::GlobalValue::AppendingLinkage, + llvm::ConstantArray::get(ATy, UsedArray), ""); + NV->takeName(&V); + NV->setSection("llvm.metadata"); + delete &V; +} + +namespace { +/// \brief An easy to access representation of llvm.used and llvm.compiler_used. +class LLVMUsed { + SmallPtrSet<GlobalValue *, 8> Used; + SmallPtrSet<GlobalValue *, 8> CompilerUsed; + GlobalVariable *UsedV; + GlobalVariable *CompilerUsedV; + +public: + LLVMUsed(const Module &M) { + UsedV = collectUsedGlobalVariables(M, "llvm.used", Used); + CompilerUsedV = + collectUsedGlobalVariables(M, "llvm.compiler_used", CompilerUsed); + } + typedef SmallPtrSet<GlobalValue *, 8>::iterator iterator; + iterator usedBegin() { return Used.begin(); } + iterator usedEnd() { return Used.end(); } + iterator compilerUsedBegin() { return CompilerUsed.begin(); } + iterator compilerUsedEnd() { return CompilerUsed.end(); } + bool usedCount(GlobalValue *GV) const { return Used.count(GV); } + bool compilerUsedCount(GlobalValue *GV) const { + return CompilerUsed.count(GV); + } + bool usedErase(GlobalValue *GV) { return Used.erase(GV); } + bool compilerUsedErase(GlobalValue *GV) { return CompilerUsed.erase(GV); } + bool usedInsert(GlobalValue *GV) { return Used.insert(GV); } + bool compilerUsedInsert(GlobalValue *GV) { return CompilerUsed.insert(GV); } + + void syncVariablesAndSets() { + if (UsedV) + setUsedInitializer(*UsedV, Used); + if (CompilerUsedV) + setUsedInitializer(*CompilerUsedV, CompilerUsed); + } +}; +} + +static bool hasUseOtherThanLLVMUsed(GlobalAlias &GA, const LLVMUsed &U) { + if (GA.use_empty()) // No use at all. + return false; + + assert((!U.usedCount(&GA) || !U.compilerUsedCount(&GA)) && + "We should have removed the duplicated " + "element from llvm.compiler_used"); + if (!GA.hasOneUse()) + // Strictly more than one use. So at least one is not in llvm.used and + // llvm.compiler_used. + return true; + + // Exactly one use. Check if it is in llvm.used or llvm.compiler_used. 
+ return !U.usedCount(&GA) && !U.compilerUsedCount(&GA); +} + +static bool hasMoreThanOneUseOtherThanLLVMUsed(GlobalValue &V, + const LLVMUsed &U) { + unsigned N = 2; + assert((!U.usedCount(&V) || !U.compilerUsedCount(&V)) && + "We should have removed the duplicated " + "element from llvm.compiler_used"); + if (U.usedCount(&V) || U.compilerUsedCount(&V)) + ++N; + return V.hasNUsesOrMore(N); +} + +static bool mayHaveOtherReferences(GlobalAlias &GA, const LLVMUsed &U) { + if (!GA.hasLocalLinkage()) + return true; + + return U.usedCount(&GA) || U.compilerUsedCount(&GA); +} + +static bool hasUsesToReplace(GlobalAlias &GA, LLVMUsed &U, bool &RenameTarget) { + RenameTarget = false; + bool Ret = false; + if (hasUseOtherThanLLVMUsed(GA, U)) + Ret = true; + + // If the alias is externally visible, we may still be able to simplify it. + if (!mayHaveOtherReferences(GA, U)) + return Ret; + + // If the aliasee has internal linkage, give it the name and linkage + // of the alias, and delete the alias. This turns: + // define internal ... @f(...) + // @a = alias ... @f + // into: + // define ... @a(...) + Constant *Aliasee = GA.getAliasee(); + GlobalValue *Target = cast<GlobalValue>(Aliasee->stripPointerCasts()); + if (!Target->hasLocalLinkage()) + return Ret; + + // Do not perform the transform if multiple aliases potentially target the + // aliasee. This check also ensures that it is safe to replace the section + // and other attributes of the aliasee with those of the alias. + if (hasMoreThanOneUseOtherThanLLVMUsed(*Target, U)) + return Ret; + + RenameTarget = true; + return true; +} + bool GlobalOpt::OptimizeGlobalAliases(Module &M) { bool Changed = false; + LLVMUsed Used(M); + + for (SmallPtrSet<GlobalValue *, 8>::iterator I = Used.usedBegin(), + E = Used.usedEnd(); + I != E; ++I) + Used.compilerUsedErase(*I); for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); I != E;) { @@ -3057,37 +3217,29 @@ bool GlobalOpt::OptimizeGlobalAliases(Module &M) { Constant *Aliasee = J->getAliasee(); GlobalValue *Target = cast<GlobalValue>(Aliasee->stripPointerCasts()); Target->removeDeadConstantUsers(); - bool hasOneUse = Target->hasOneUse() && Aliasee->hasOneUse(); // Make all users of the alias use the aliasee instead. - if (!J->use_empty()) { - J->replaceAllUsesWith(Aliasee); - ++NumAliasesResolved; - Changed = true; - } - - // If the alias is externally visible, we may still be able to simplify it. - if (!J->hasLocalLinkage()) { - // If the aliasee has internal linkage, give it the name and linkage - // of the alias, and delete the alias. This turns: - // define internal ... @f(...) - // @a = alias ... @f - // into: - // define ... @a(...) - if (!Target->hasLocalLinkage()) - continue; + bool RenameTarget; + if (!hasUsesToReplace(*J, Used, RenameTarget)) + continue; - // Do not perform the transform if multiple aliases potentially target the - // aliasee. This check also ensures that it is safe to replace the section - // and other attributes of the aliasee with those of the alias. - if (!hasOneUse) - continue; + J->replaceAllUsesWith(Aliasee); + ++NumAliasesResolved; + Changed = true; + if (RenameTarget) { // Give the aliasee the name, linkage and other attributes of the alias. Target->takeName(J); Target->setLinkage(J->getLinkage()); Target->GlobalValue::copyAttributesFrom(J); - } + + if (Used.usedErase(J)) + Used.usedInsert(Target); + + if (Used.compilerUsedErase(J)) + Used.compilerUsedInsert(Target); + } else if (mayHaveOtherReferences(*J, Used)) + continue; // Delete the alias. 
M.getAliasList().erase(J); @@ -3095,6 +3247,8 @@ bool GlobalOpt::OptimizeGlobalAliases(Module &M) { Changed = true; } + Used.syncVariablesAndSets(); + return Changed; } @@ -3223,8 +3377,6 @@ bool GlobalOpt::runOnModule(Module &M) { // Try to find the llvm.globalctors list. GlobalVariable *GlobalCtors = FindGlobalCtors(M); - Function *CXAAtExitFn = FindCXAAtExit(M, TLI); - bool LocalChange = true; while (LocalChange) { LocalChange = false; @@ -3242,7 +3394,9 @@ bool GlobalOpt::runOnModule(Module &M) { // Resolve aliases, when possible. LocalChange |= OptimizeGlobalAliases(M); - // Try to remove trivial global destructors. + // Try to remove trivial global destructors if they are not removed + // already. + Function *CXAAtExitFn = FindCXAAtExit(M, TLI); if (CXAAtExitFn) LocalChange |= OptimizeEmptyGlobalCXXDtors(CXAAtExitFn); diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index 986c0b8..8ed7704 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -32,6 +32,12 @@ static cl::opt<bool> RunLoopVectorization("vectorize-loops", cl::desc("Run the Loop vectorization passes")); +// This is a helper flag that we use for testing the profitability of +// vectorization on -O2 and -Os. It should go away once we make a decision. +static cl::opt<bool> +VectorizeO2("vectorize-o2", + cl::desc("Enable vectorization on all O levels")); + static cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::desc("Run the SLP vectorization passes")); @@ -192,7 +198,7 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(createLoopIdiomPass()); // Recognize idioms like memset. MPM.add(createLoopDeletionPass()); // Delete dead loops - if (LoopVectorize && OptLevel > 2) + if (LoopVectorize && (OptLevel > 2 || VectorizeO2)) MPM.add(createLoopVectorizePass()); if (!DisableUnrollLoops) diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp index 3396f79..754eff6 100644 --- a/lib/Transforms/IPO/StripSymbols.cpp +++ b/lib/Transforms/IPO/StripSymbols.cpp @@ -332,16 +332,6 @@ bool StripDebugDeclare::runOnModule(Module &M) { return true; } -/// getRealLinkageName - If special LLVM prefix that is used to inform the asm -/// printer to not emit usual symbol prefix before the symbol name is used then -/// return linkage name after skipping this special LLVM prefix. -static StringRef getRealLinkageName(StringRef LinkageName) { - char One = '\1'; - if (LinkageName.startswith(StringRef(&One, 1))) - return LinkageName.substr(1); - return LinkageName; -} - bool StripDeadDebugInfo::runOnModule(Module &M) { bool Changed = false; @@ -401,9 +391,8 @@ bool StripDeadDebugInfo::runOnModule(Module &M) { StringRef FName = DISubprogram(*I).getLinkageName(); if (FName.empty()) FName = DISubprogram(*I).getName(); - if (NamedMDNode *LVNMD = - M.getNamedMetadata(Twine("llvm.dbg.lv.", - getRealLinkageName(FName)))) + if (NamedMDNode *LVNMD = M.getNamedMetadata( + "llvm.dbg.lv." 
+ Function::getRealLinkageName(FName))) LVNMD->eraseFromParent(); } } diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h index 2a36074..b3084cc 100644 --- a/lib/Transforms/InstCombine/InstCombine.h +++ b/lib/Transforms/InstCombine/InstCombine.h @@ -1,4 +1,4 @@ -//===- InstCombine.h - Main InstCombine pass definition -------------------===// +//===- InstCombine.h - Main InstCombine pass definition ---------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -234,6 +234,7 @@ private: bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS); Value *EmitGEPOffset(User *GEP); Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN); + Value *EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask); public: // InsertNewInstBefore - insert an instruction New before instruction Old diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp index b96eb51..a2c545f 100644 --- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -974,6 +974,11 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { return BinaryOperator::CreateSub(ConstantExpr::getAdd(XorRHS, CI), XorLHS); } + // (X + signbit) + C could have gotten canonicalized to (X ^ signbit) + C, + // transform them into (X + (signbit ^ C)) + if (XorRHS->getValue().isSignBit()) + return BinaryOperator::CreateAdd(XorLHS, + ConstantExpr::getXor(XorRHS, CI)); } } @@ -1232,6 +1237,74 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { } } + // select C, 0, B + select C, A, 0 -> select C, A, B + { + Value *A1, *B1, *C1, *A2, *B2, *C2; + if (match(LHS, m_Select(m_Value(C1), m_Value(A1), m_Value(B1))) && + match(RHS, m_Select(m_Value(C2), m_Value(A2), m_Value(B2)))) { + if (C1 == C2) { + Constant *Z1=0, *Z2=0; + Value *A, *B, *C=C1; + if (match(A1, m_AnyZero()) && match(B2, m_AnyZero())) { + Z1 = dyn_cast<Constant>(A1); A = A2; + Z2 = dyn_cast<Constant>(B2); B = B1; + } else if (match(B1, m_AnyZero()) && match(A2, m_AnyZero())) { + Z1 = dyn_cast<Constant>(B1); B = B2; + Z2 = dyn_cast<Constant>(A2); A = A1; + } + + if (Z1 && Z2 && + (I.hasNoSignedZeros() || + (Z1->isNegativeZeroValue() && Z2->isNegativeZeroValue()))) { + return SelectInst::Create(C, A, B); + } + } + } + } + + // A * (1 - uitofp i1 C) + B * (uitofp i1 C) -> select C, B, A + { + if (I.hasNoNaNs() && I.hasNoInfs() && I.hasNoSignedZeros()) { + Value *M1L, *M1R, *M2L, *M2R; + if (match(LHS, m_FMul(m_Value(M1L), m_Value(M1R))) && + match(RHS, m_FMul(m_Value(M2L), m_Value(M2R)))) { + + Value *A, *B, *C1, *C2; + if (!match(M1R, m_FSub(m_FPOne(), m_UIToFp(m_Value(C1))))) + std::swap(M1L, M1R); + if (!match(M2R, m_UIToFp(m_Value(C2)))) + std::swap(M2L, M2R); + + if (match(M1R, m_FSub(m_FPOne(), m_UIToFp(m_Value(C1)))) && + match(M2R, m_UIToFp(m_Value(C2))) && + C2->getType()->isIntegerTy(1) && + C1 == C2) { + A = M1L; + B = M2L; + return SelectInst::Create(C1, B, A); + } + + std::swap(M1L, M2L); + std::swap(M1R, M2R); + + if (!match(M1R, m_FSub(m_FPOne(), m_UIToFp(m_Value(C1))))) + std::swap(M1L, M1R); + if (!match(M2R, m_UIToFp(m_Value(C2)))) + std::swap(M2L, M2R); + + if (match(M1R, m_FSub(m_FPOne(), m_UIToFp(m_Value(C1)))) && + match(M2R, m_UIToFp(m_Value(C2))) && + C2->getType()->isIntegerTy(1) && + C1 == C2) { + A = M1L; + B = M2L; + return SelectInst::Create(C1, B, A); + } + } + } + } + + if (I.hasUnsafeAlgebra()) { if (Value *V = FAddCombine(Builder).simplify(&I)) return ReplaceInstUsesWith(I, V); diff --git 
a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index ec75dd2..496fce6 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -173,14 +173,14 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op, // Adding a one to a single bit bit-field should be turned into an XOR // of the bit. First thing to check is to see if this AND is with a // single bit constant. - const APInt &AndRHSV = cast<ConstantInt>(AndRHS)->getValue(); + const APInt &AndRHSV = AndRHS->getValue(); // If there is only one bit set. if (AndRHSV.isPowerOf2()) { // Ok, at this point, we know that we are masking the result of the // ADD down to exactly one bit. If the constant we are adding has // no bits set below this bit, then we can eliminate the ADD. - const APInt& AddRHS = cast<ConstantInt>(OpRHS)->getValue(); + const APInt& AddRHS = OpRHS->getValue(); // Check to see if any bits below the one bit set in AndRHSV are set. if ((AddRHS & (AndRHSV-1)) == 0) { @@ -209,8 +209,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op, uint32_t BitWidth = AndRHS->getType()->getBitWidth(); uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth); APInt ShlMask(APInt::getHighBitsSet(BitWidth, BitWidth-OpRHSVal)); - ConstantInt *CI = ConstantInt::get(AndRHS->getContext(), - AndRHS->getValue() & ShlMask); + ConstantInt *CI = Builder->getInt(AndRHS->getValue() & ShlMask); if (CI->getValue() == ShlMask) // Masking out bits that the shift already masks. @@ -230,8 +229,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op, uint32_t BitWidth = AndRHS->getType()->getBitWidth(); uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth); APInt ShrMask(APInt::getLowBitsSet(BitWidth, BitWidth - OpRHSVal)); - ConstantInt *CI = ConstantInt::get(Op->getContext(), - AndRHS->getValue() & ShrMask); + ConstantInt *CI = Builder->getInt(AndRHS->getValue() & ShrMask); if (CI->getValue() == ShrMask) // Masking out bits that the shift already masks. @@ -251,8 +249,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op, uint32_t BitWidth = AndRHS->getType()->getBitWidth(); uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth); APInt ShrMask(APInt::getLowBitsSet(BitWidth, BitWidth - OpRHSVal)); - Constant *C = ConstantInt::get(Op->getContext(), - AndRHS->getValue() & ShrMask); + Constant *C = Builder->getInt(AndRHS->getValue() & ShrMask); if (C == AndRHS) { // Masking out bits shifted in. // (Val ashr C1) & C2 -> (Val lshr C1) & C2 // Make the argument unsigned. @@ -279,7 +276,7 @@ Value *InstCombiner::InsertRangeTest(Value *V, Constant *Lo, Constant *Hi, if (Inside) { if (Lo == Hi) // Trivially false. - return ConstantInt::getFalse(V->getContext()); + return Builder->getFalse(); // V >= Min && V < Hi --> V < Hi if (cast<ConstantInt>(Lo)->isMinValue(isSigned)) { @@ -296,7 +293,7 @@ Value *InstCombiner::InsertRangeTest(Value *V, Constant *Lo, Constant *Hi, } if (Lo == Hi) // Trivially true. - return ConstantInt::getTrue(V->getContext()); + return Builder->getTrue(); // V < Min || V >= Hi -> V > Hi-1 Hi = SubOne(cast<ConstantInt>(Hi)); @@ -943,7 +940,7 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { // If either of the constants are nans, then the whole thing returns // false. 
if (LHSC->getValueAPF().isNaN() || RHSC->getValueAPF().isNaN()) - return ConstantInt::getFalse(LHS->getContext()); + return Builder->getFalse(); return Builder->CreateFCmpORD(LHS->getOperand(0), RHS->getOperand(0)); } @@ -1380,7 +1377,7 @@ static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask, // into a byteswap. At least one of the two bytes would not be aligned with // their ultimate destination. if (!isPowerOf2_32(ByteMask)) return true; - unsigned InputByteNo = CountTrailingZeros_32(ByteMask); + unsigned InputByteNo = countTrailingZeros(ByteMask); // 2) The input and ultimate destinations must line up: if byte 3 of an i32 // is demanded, it needs to go into byte 0 of the result. This means that the @@ -1588,7 +1585,7 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) { case ICmpInst::ICMP_NE: // (X != 13 | X != 15) -> true case ICmpInst::ICMP_ULT: // (X != 13 | X u< 15) -> true case ICmpInst::ICMP_SLT: // (X != 13 | X s< 15) -> true - return ConstantInt::getTrue(LHS->getContext()); + return Builder->getTrue(); } case ICmpInst::ICMP_ULT: switch (RHSCC) { @@ -1640,7 +1637,7 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) { break; case ICmpInst::ICMP_NE: // (X u> 13 | X != 15) -> true case ICmpInst::ICMP_ULT: // (X u> 13 | X u< 15) -> true - return ConstantInt::getTrue(LHS->getContext()); + return Builder->getTrue(); case ICmpInst::ICMP_SLT: // (X u> 13 | X s< 15) -> no change break; } @@ -1655,7 +1652,7 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) { break; case ICmpInst::ICMP_NE: // (X s> 13 | X != 15) -> true case ICmpInst::ICMP_SLT: // (X s> 13 | X s< 15) -> true - return ConstantInt::getTrue(LHS->getContext()); + return Builder->getTrue(); case ICmpInst::ICMP_ULT: // (X s> 13 | X u< 15) -> no change break; } @@ -1676,7 +1673,7 @@ Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { // If either of the constants are nans, then the whole thing returns // true. if (LHSC->getValueAPF().isNaN() || RHSC->getValueAPF().isNaN()) - return ConstantInt::getTrue(LHS->getContext()); + return Builder->getTrue(); // Otherwise, no need to compare the two constants, compare the // rest. @@ -1779,8 +1776,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { Value *Or = Builder->CreateOr(X, RHS); Or->takeName(Op0); return BinaryOperator::CreateAnd(Or, - ConstantInt::get(I.getContext(), - RHS->getValue() | C1->getValue())); + Builder->getInt(RHS->getValue() | C1->getValue())); } // (X ^ C1) | C2 --> (X | C2) ^ (C1&~C2) @@ -1789,8 +1785,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { Value *Or = Builder->CreateOr(X, RHS); Or->takeName(Op0); return BinaryOperator::CreateXor(Or, - ConstantInt::get(I.getContext(), - C1->getValue() & ~RHS->getValue())); + Builder->getInt(C1->getValue() & ~RHS->getValue())); } // Try to fold constant and into select arguments. @@ -1872,15 +1867,13 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { ((V1 == B && MaskedValueIsZero(V2, ~C1->getValue())) || // (V|N) (V2 == B && MaskedValueIsZero(V1, ~C1->getValue())))) // (N|V) return BinaryOperator::CreateAnd(A, - ConstantInt::get(A->getContext(), - C1->getValue()|C2->getValue())); + Builder->getInt(C1->getValue()|C2->getValue())); // Or commutes, try both ways. 
if (match(B, m_Or(m_Value(V1), m_Value(V2))) && ((V1 == A && MaskedValueIsZero(V2, ~C2->getValue())) || // (V|N) (V2 == A && MaskedValueIsZero(V1, ~C2->getValue())))) // (N|V) return BinaryOperator::CreateAnd(B, - ConstantInt::get(B->getContext(), - C1->getValue()|C2->getValue())); + Builder->getInt(C1->getValue()|C2->getValue())); // ((V|C3)&C1) | ((V|C4)&C2) --> (V|C3|C4)&(C1|C2) // iff (C1&C2) == 0 and (C3&~C1) == 0 and (C4&~C2) == 0. @@ -1891,8 +1884,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { (C4->getValue() & ~C2->getValue()) == 0) { V2 = Builder->CreateOr(V1, ConstantExpr::getOr(C3, C4), "bitfield"); return BinaryOperator::CreateAnd(V2, - ConstantInt::get(B->getContext(), - C1->getValue()|C2->getValue())); + Builder->getInt(C1->getValue()|C2->getValue())); } } } @@ -2160,8 +2152,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { if (CI->hasOneUse() && Op0C->hasOneUse()) { Instruction::CastOps Opcode = Op0C->getOpcode(); if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) && - (RHS == ConstantExpr::getCast(Opcode, - ConstantInt::getTrue(I.getContext()), + (RHS == ConstantExpr::getCast(Opcode, Builder->getTrue(), Op0C->getDestTy()))) { CI->setPredicate(CI->getInversePredicate()); return CastInst::Create(Opcode, CI, Op0C->getType()); @@ -2191,8 +2182,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { Op0I->getOperand(0)); } else if (RHS->getValue().isSignBit()) { // (X + C) ^ signbit -> (X + C + signbit) - Constant *C = ConstantInt::get(I.getContext(), - RHS->getValue() + Op0CI->getValue()); + Constant *C = Builder->getInt(RHS->getValue() + Op0CI->getValue()); return BinaryOperator::CreateAdd(Op0I->getOperand(0), C); } diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index 2ee1278..361acdd 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -677,7 +677,6 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) { case Instruction::Add: case Instruction::Sub: case Instruction::Mul: - case Instruction::Shl: if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear) || !CanEvaluateZExtd(I->getOperand(1), Ty, Tmp)) return false; @@ -701,6 +700,17 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) { // Otherwise, we don't know how to analyze this BitsToClear case yet. return false; + case Instruction::Shl: + // We can promote shl(x, cst) if we can promote x. Since shl overwrites the + // upper bits we can reduce BitsToClear by the shift amount. + if (ConstantInt *Amt = dyn_cast<ConstantInt>(I->getOperand(1))) { + if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear)) + return false; + uint64_t ShiftAmt = Amt->getZExtValue(); + BitsToClear = ShiftAmt < BitsToClear ? BitsToClear - ShiftAmt : 0; + return true; + } + return false; case Instruction::LShr: // We can promote lshr(x, cst) if we can promote x. This requires the // ultimate 'and' to clear out the high zero bits we're clearing out though. diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 4c252c0..af8a479 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -402,7 +402,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, if (SecondTrueElement != Overdefined) { // None true -> false. 
if (FirstTrueElement == Undefined) - return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(GEP->getContext())); + return ReplaceInstUsesWith(ICI, Builder->getFalse()); Value *FirstTrueIdx = ConstantInt::get(Idx->getType(), FirstTrueElement); @@ -422,7 +422,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, if (SecondFalseElement != Overdefined) { // None false -> true. if (FirstFalseElement == Undefined) - return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(GEP->getContext())); + return ReplaceInstUsesWith(ICI, Builder->getTrue()); Value *FirstFalseIdx = ConstantInt::get(Idx->getType(), FirstFalseElement); @@ -712,8 +712,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS, if (NumDifferences == 0) // SAME GEP? return ReplaceInstUsesWith(I, // No comparison is needed here. - ConstantInt::get(Type::getInt1Ty(I.getContext()), - ICmpInst::isTrueWhenEqual(Cond))); + Builder->getInt1(ICmpInst::isTrueWhenEqual(Cond))); else if (NumDifferences == 1 && GEPsInBounds) { Value *LHSV = GEPLHS->getOperand(DiffOperand); @@ -752,11 +751,11 @@ Instruction *InstCombiner::FoldICmpAddOpCst(ICmpInst &ICI, // (X+4) == X -> false. if (Pred == ICmpInst::ICMP_EQ) - return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(X->getContext())); + return ReplaceInstUsesWith(ICI, Builder->getFalse()); // (X+4) != X -> true. if (Pred == ICmpInst::ICMP_NE) - return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(X->getContext())); + return ReplaceInstUsesWith(ICI, Builder->getTrue()); // From this point on, we know that (X+C <= X) --> (X+C < X) because C != 0, // so the values can never be equal. Similarly for all other "or equals" @@ -798,7 +797,7 @@ Instruction *InstCombiner::FoldICmpAddOpCst(ICmpInst &ICI, // (X+ -1) >s X --> X <s (MAXSINT-(-1-1)) --> X == -128 assert(Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE); - Constant *C = ConstantInt::get(X->getContext(), CI->getValue()-1); + Constant *C = Builder->getInt(CI->getValue()-1); return new ICmpInst(ICmpInst::ICMP_SLT, X, ConstantExpr::getSub(SMax, C)); } @@ -921,7 +920,7 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI, default: llvm_unreachable("Unhandled icmp opcode!"); case ICmpInst::ICMP_EQ: if (LoOverflow && HiOverflow) - return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(ICI.getContext())); + return ReplaceInstUsesWith(ICI, Builder->getFalse()); if (HiOverflow) return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE, X, LoBound); @@ -932,7 +931,7 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI, DivIsSigned, true)); case ICmpInst::ICMP_NE: if (LoOverflow && HiOverflow) - return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(ICI.getContext())); + return ReplaceInstUsesWith(ICI, Builder->getTrue()); if (HiOverflow) return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, X, LoBound); @@ -944,16 +943,16 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI, case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_SLT: if (LoOverflow == +1) // Low bound is greater than input range. - return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(ICI.getContext())); + return ReplaceInstUsesWith(ICI, Builder->getTrue()); if (LoOverflow == -1) // Low bound is less than input range. 
- return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(ICI.getContext())); + return ReplaceInstUsesWith(ICI, Builder->getFalse()); return new ICmpInst(Pred, X, LoBound); case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_SGT: if (HiOverflow == +1) // High bound greater than input range. - return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(ICI.getContext())); + return ReplaceInstUsesWith(ICI, Builder->getFalse()); if (HiOverflow == -1) // High bound less than input range. - return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(ICI.getContext())); + return ReplaceInstUsesWith(ICI, Builder->getTrue()); if (Pred == ICmpInst::ICMP_UGT) return new ICmpInst(ICmpInst::ICMP_UGE, X, HiBound); return new ICmpInst(ICmpInst::ICMP_SGE, X, HiBound); @@ -1017,7 +1016,7 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr, // If we are comparing against bits always shifted out, the // comparison cannot succeed. APInt Comp = CmpRHSV << ShAmtVal; - ConstantInt *ShiftedCmpRHS = ConstantInt::get(ICI.getContext(), Comp); + ConstantInt *ShiftedCmpRHS = Builder->getInt(Comp); if (Shr->getOpcode() == Instruction::LShr) Comp = Comp.lshr(ShAmtVal); else @@ -1025,8 +1024,7 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr, if (Comp != CmpRHSV) { // Comparing against a bit that we know is zero. bool IsICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE; - Constant *Cst = ConstantInt::get(Type::getInt1Ty(ICI.getContext()), - IsICMP_NE); + Constant *Cst = Builder->getInt1(IsICMP_NE); return ReplaceInstUsesWith(ICI, Cst); } @@ -1039,7 +1037,7 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr, if (Shr->hasOneUse()) { // Otherwise strength reduce the shift into an and. APInt Val(APInt::getHighBitsSet(TypeBits, TypeBits - ShAmtVal)); - Constant *Mask = ConstantInt::get(ICI.getContext(), Val); + Constant *Mask = Builder->getInt(Val); Value *And = Builder->CreateAnd(Shr->getOperand(0), Mask, Shr->getName()+".mask"); @@ -1072,7 +1070,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, APInt NewRHS = RHS->getValue().zext(SrcBits); NewRHS |= KnownOne & APInt::getHighBitsSet(SrcBits, SrcBits-DstBits); return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0), - ConstantInt::get(ICI.getContext(), NewRHS)); + Builder->getInt(NewRHS)); } } break; @@ -1115,8 +1113,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, ? ICI.getUnsignedPredicate() : ICI.getSignedPredicate(); return new ICmpInst(Pred, LHSI->getOperand(0), - ConstantInt::get(ICI.getContext(), - RHSV ^ SignBit)); + Builder->getInt(RHSV ^ SignBit)); } // (icmp u/s (xor A ~SignBit), C) -> (icmp s/u (xor C ~SignBit), A) @@ -1127,8 +1124,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, : ICI.getSignedPredicate(); Pred = ICI.getSwappedPredicate(Pred); return new ICmpInst(Pred, LHSI->getOperand(0), - ConstantInt::get(ICI.getContext(), - RHSV ^ NotSignBit)); + Builder->getInt(RHSV ^ NotSignBit)); } } } @@ -1218,11 +1214,9 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, // As a special case, check to see if this means that the // result is always true or false now. 
if (ICI.getPredicate() == ICmpInst::ICMP_EQ) - return ReplaceInstUsesWith(ICI, - ConstantInt::getFalse(ICI.getContext())); + return ReplaceInstUsesWith(ICI, Builder->getFalse()); if (ICI.getPredicate() == ICmpInst::ICMP_NE) - return ReplaceInstUsesWith(ICI, - ConstantInt::getTrue(ICI.getContext())); + return ReplaceInstUsesWith(ICI, Builder->getTrue()); } else { ICI.setOperand(1, NewCst); Constant *NewAndCST; @@ -1344,8 +1338,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, ShAmt); if (Comp != RHS) {// Comparing against a bit that we know is zero. bool IsICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE; - Constant *Cst = - ConstantInt::get(Type::getInt1Ty(ICI.getContext()), IsICMP_NE); + Constant *Cst = Builder->getInt1(IsICMP_NE); return ReplaceInstUsesWith(ICI, Cst); } @@ -1364,9 +1357,8 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, if (LHSI->hasOneUse()) { // Otherwise strength reduce the shift into an and. uint32_t ShAmtVal = (uint32_t)ShAmt->getLimitedValue(TypeBits); - Constant *Mask = - ConstantInt::get(ICI.getContext(), APInt::getLowBitsSet(TypeBits, - TypeBits-ShAmtVal)); + Constant *Mask = Builder->getInt(APInt::getLowBitsSet(TypeBits, + TypeBits - ShAmtVal)); Value *And = Builder->CreateAnd(LHSI->getOperand(0),Mask, LHSI->getName()+".mask"); @@ -1464,18 +1456,18 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, if (ICI.isSigned()) { if (CR.getLower().isSignBit()) { return new ICmpInst(ICmpInst::ICMP_SLT, LHSI->getOperand(0), - ConstantInt::get(ICI.getContext(),CR.getUpper())); + Builder->getInt(CR.getUpper())); } else if (CR.getUpper().isSignBit()) { return new ICmpInst(ICmpInst::ICMP_SGE, LHSI->getOperand(0), - ConstantInt::get(ICI.getContext(),CR.getLower())); + Builder->getInt(CR.getLower())); } } else { if (CR.getLower().isMinValue()) { return new ICmpInst(ICmpInst::ICMP_ULT, LHSI->getOperand(0), - ConstantInt::get(ICI.getContext(),CR.getUpper())); + Builder->getInt(CR.getUpper())); } else if (CR.getUpper().isMinValue()) { return new ICmpInst(ICmpInst::ICMP_UGE, LHSI->getOperand(0), - ConstantInt::get(ICI.getContext(),CR.getLower())); + Builder->getInt(CR.getLower())); } } } @@ -1555,9 +1547,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, if (ConstantInt *BOC = dyn_cast<ConstantInt>(BO->getOperand(1))) { Constant *NotCI = ConstantExpr::getNot(RHS); if (!ConstantExpr::getAnd(BOC, NotCI)->isNullValue()) - return ReplaceInstUsesWith(ICI, - ConstantInt::get(Type::getInt1Ty(ICI.getContext()), - isICMP_NE)); + return ReplaceInstUsesWith(ICI, Builder->getInt1(isICMP_NE)); } break; @@ -1566,9 +1556,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, // If bits are being compared against that are and'd out, then the // comparison can never succeed! if ((RHSV & ~BOC->getValue()) != 0) - return ReplaceInstUsesWith(ICI, - ConstantInt::get(Type::getInt1Ty(ICI.getContext()), - isICMP_NE)); + return ReplaceInstUsesWith(ICI, Builder->getInt1(isICMP_NE)); // If we have ((X & C) == C), turn it into ((X & C) != 0). 
if (RHS == BOC && RHSV.isPowerOf2()) @@ -1619,7 +1607,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, case Intrinsic::bswap: Worklist.Add(II); ICI.setOperand(0, II->getArgOperand(0)); - ICI.setOperand(1, ConstantInt::get(II->getContext(), RHSV.byteSwap())); + ICI.setOperand(1, Builder->getInt(RHSV.byteSwap())); return &ICI; case Intrinsic::ctlz: case Intrinsic::cttz: @@ -2041,19 +2029,19 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { case ICmpInst::ICMP_ULE: assert(!CI->isMaxValue(false)); // A <=u MAX -> TRUE return new ICmpInst(ICmpInst::ICMP_ULT, Op0, - ConstantInt::get(CI->getContext(), CI->getValue()+1)); + Builder->getInt(CI->getValue()+1)); case ICmpInst::ICMP_SLE: assert(!CI->isMaxValue(true)); // A <=s MAX -> TRUE return new ICmpInst(ICmpInst::ICMP_SLT, Op0, - ConstantInt::get(CI->getContext(), CI->getValue()+1)); + Builder->getInt(CI->getValue()+1)); case ICmpInst::ICMP_UGE: assert(!CI->isMinValue(false)); // A >=u MIN -> TRUE return new ICmpInst(ICmpInst::ICMP_UGT, Op0, - ConstantInt::get(CI->getContext(), CI->getValue()-1)); + Builder->getInt(CI->getValue()-1)); case ICmpInst::ICMP_SGE: assert(!CI->isMinValue(true)); // A >=s MIN -> TRUE return new ICmpInst(ICmpInst::ICMP_SGT, Op0, - ConstantInt::get(CI->getContext(), CI->getValue()-1)); + Builder->getInt(CI->getValue()-1)); } // If this comparison is a normal comparison, it demands all @@ -2192,7 +2180,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) { if (Op1Max == Op0Min+1) // A <u C -> A == C-1 if min(A)+1 == C return new ICmpInst(ICmpInst::ICMP_EQ, Op0, - ConstantInt::get(CI->getContext(), CI->getValue()-1)); + Builder->getInt(CI->getValue()-1)); // (x <u 2147483648) -> (x >s -1) -> true if sign bit clear if (CI->isMinValue(true)) @@ -2211,7 +2199,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) { if (Op1Min == Op0Max-1) // A >u C -> A == C+1 if max(a)-1 == C return new ICmpInst(ICmpInst::ICMP_EQ, Op0, - ConstantInt::get(CI->getContext(), CI->getValue()+1)); + Builder->getInt(CI->getValue()+1)); // (x >u 2147483647) -> (x <s 0) -> true if sign bit set if (CI->isMaxValue(true)) @@ -2229,7 +2217,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) { if (Op1Max == Op0Min+1) // A <s C -> A == C-1 if min(A)+1 == C return new ICmpInst(ICmpInst::ICMP_EQ, Op0, - ConstantInt::get(CI->getContext(), CI->getValue()-1)); + Builder->getInt(CI->getValue()-1)); } break; case ICmpInst::ICMP_SGT: @@ -2243,7 +2231,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) { if (Op1Min == Op0Max-1) // A >s C -> A == C+1 if max(A)-1 == C return new ICmpInst(ICmpInst::ICMP_EQ, Op0, - ConstantInt::get(CI->getContext(), CI->getValue()+1)); + Builder->getInt(CI->getValue()+1)); } break; case ICmpInst::ICMP_SGE: @@ -2719,8 +2707,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { ConstantInt *C1, *C2; if (match(B, m_ConstantInt(C1)) && match(D, m_ConstantInt(C2)) && Op1->hasOneUse()) { - Constant *NC = ConstantInt::get(I.getContext(), - C1->getValue() ^ C2->getValue()); + Constant *NC = Builder->getInt(C1->getValue() ^ C2->getValue()); Value *Xor = Builder->CreateXor(C, NC); return new ICmpInst(I.getPredicate(), A, Xor); } @@ -2885,9 +2872,9 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, Pred = ICmpInst::ICMP_NE; break; case FCmpInst::FCMP_ORD: - 
return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getTrue()); case FCmpInst::FCMP_UNO: - return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getFalse()); } IntegerType *IntTy = cast<IntegerType>(LHSI->getOperand(0)->getType()); @@ -2907,8 +2894,8 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, if (SMax.compare(RHS) == APFloat::cmpLessThan) { // smax < 13123.0 if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE) - return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext())); - return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getTrue()); + return ReplaceInstUsesWith(I, Builder->getFalse()); } } else { // If the RHS value is > UnsignedMax, fold the comparison. This handles @@ -2919,8 +2906,8 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, if (UMax.compare(RHS) == APFloat::cmpLessThan) { // umax < 13123.0 if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_ULE) - return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext())); - return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getTrue()); + return ReplaceInstUsesWith(I, Builder->getFalse()); } } @@ -2932,8 +2919,8 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, if (SMin.compare(RHS) == APFloat::cmpGreaterThan) { // smin > 12312.0 if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE) - return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext())); - return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getTrue()); + return ReplaceInstUsesWith(I, Builder->getFalse()); } } else { // See if the RHS value is < UnsignedMin. 
@@ -2943,8 +2930,8 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, if (SMin.compare(RHS) == APFloat::cmpGreaterThan) { // umin > 12312.0 if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE) - return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext())); - return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getTrue()); + return ReplaceInstUsesWith(I, Builder->getFalse()); } } @@ -2966,14 +2953,14 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, switch (Pred) { default: llvm_unreachable("Unexpected integer comparison!"); case ICmpInst::ICMP_NE: // (float)int != 4.4 --> true - return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getTrue()); case ICmpInst::ICMP_EQ: // (float)int == 4.4 --> false - return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getFalse()); case ICmpInst::ICMP_ULE: // (float)int <= 4.4 --> int <= 4 // (float)int <= -4.4 --> false if (RHS.isNegative()) - return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getFalse()); break; case ICmpInst::ICMP_SLE: // (float)int <= 4.4 --> int <= 4 @@ -2985,7 +2972,7 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, // (float)int < -4.4 --> false // (float)int < 4.4 --> int <= 4 if (RHS.isNegative()) - return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getFalse()); Pred = ICmpInst::ICMP_ULE; break; case ICmpInst::ICMP_SLT: @@ -2998,7 +2985,7 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, // (float)int > 4.4 --> int > 4 // (float)int > -4.4 --> true if (RHS.isNegative()) - return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getTrue()); break; case ICmpInst::ICMP_SGT: // (float)int > 4.4 --> int > 4 @@ -3010,7 +2997,7 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, // (float)int >= -4.4 --> true // (float)int >= 4.4 --> int > 4 if (RHS.isNegative()) - return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext())); + return ReplaceInstUsesWith(I, Builder->getTrue()); Pred = ICmpInst::ICMP_UGT; break; case ICmpInst::ICMP_SGE: diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index df73906..e36b762 100644 --- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -95,6 +95,25 @@ static bool MultiplyOverflows(ConstantInt *C1, ConstantInt *C2, bool sign) { return MulExt.slt(Min) || MulExt.sgt(Max); } +/// \brief A helper routine of InstCombiner::visitMul(). +/// +/// If C is a vector of known powers of 2, then this function returns +/// a new vector obtained from C replacing each element with its logBase2. +/// Return a null pointer otherwise. 
+static Constant *getLogBase2Vector(ConstantDataVector *CV) { + const APInt *IVal; + SmallVector<Constant *, 4> Elts; + + for (unsigned I = 0, E = CV->getNumElements(); I != E; ++I) { + Constant *Elt = CV->getElementAsConstant(I); + if (!match(Elt, m_APInt(IVal)) || !IVal->isPowerOf2()) + return 0; + Elts.push_back(ConstantInt::get(Elt->getType(), IVal->logBase2())); + } + + return ConstantVector::get(Elts); +} + Instruction *InstCombiner::visitMul(BinaryOperator &I) { bool Changed = SimplifyAssociativeOrCommutative(I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); @@ -108,24 +127,37 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { if (match(Op1, m_AllOnes())) // X * -1 == 0 - X return BinaryOperator::CreateNeg(Op0, I.getName()); - if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) { - - // ((X << C1)*C2) == (X * (C2 << C1)) - if (BinaryOperator *SI = dyn_cast<BinaryOperator>(Op0)) - if (SI->getOpcode() == Instruction::Shl) - if (Constant *ShOp = dyn_cast<Constant>(SI->getOperand(1))) - return BinaryOperator::CreateMul(SI->getOperand(0), - ConstantExpr::getShl(CI, ShOp)); - - const APInt &Val = CI->getValue(); - if (Val.isPowerOf2()) { // Replace X*(2^C) with X << C - Constant *NewCst = ConstantInt::get(Op0->getType(), Val.logBase2()); - BinaryOperator *Shl = BinaryOperator::CreateShl(Op0, NewCst); - if (I.hasNoSignedWrap()) Shl->setHasNoSignedWrap(); - if (I.hasNoUnsignedWrap()) Shl->setHasNoUnsignedWrap(); - return Shl; + // Also allow combining multiply instructions on vectors. + { + Value *NewOp; + Constant *C1, *C2; + const APInt *IVal; + if (match(&I, m_Mul(m_Shl(m_Value(NewOp), m_Constant(C2)), + m_Constant(C1))) && + match(C1, m_APInt(IVal))) + // ((X << C1)*C2) == (X * (C2 << C1)) + return BinaryOperator::CreateMul(NewOp, ConstantExpr::getShl(C1, C2)); + + if (match(&I, m_Mul(m_Value(NewOp), m_Constant(C1)))) { + Constant *NewCst = 0; + if (match(C1, m_APInt(IVal)) && IVal->isPowerOf2()) + // Replace X*(2^C) with X << C, where C is either a scalar or a splat. + NewCst = ConstantInt::get(NewOp->getType(), IVal->logBase2()); + else if (ConstantDataVector *CV = dyn_cast<ConstantDataVector>(C1)) + // Replace X*(2^C) with X << C, where C is a vector of known + // constant powers of 2. + NewCst = getLogBase2Vector(CV); + + if (NewCst) { + BinaryOperator *Shl = BinaryOperator::CreateShl(NewOp, NewCst); + if (I.hasNoSignedWrap()) Shl->setHasNoSignedWrap(); + if (I.hasNoUnsignedWrap()) Shl->setHasNoUnsignedWrap(); + return Shl; + } } + } + if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) { // Canonicalize (X+C1)*CI -> X*CI+C1*CI. { Value *X; ConstantInt *C1; if (Op0->hasOneUse() && @@ -584,8 +616,7 @@ bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) { *I = SI->getOperand(NonNullOperand); Worklist.Add(BBI); } else if (*I == SelectCond) { - *I = NonNullOperand == 1 ? ConstantInt::getTrue(BBI->getContext()) : - ConstantInt::getFalse(BBI->getContext()); + *I = Builder->getInt1(NonNullOperand == 1); Worklist.Add(BBI); } } @@ -817,7 +848,7 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) { /// FP value and: /// 1) 1/C is exact, or /// 2) reciprocal is allowed. -/// If the convertion was successful, the simplified expression "X * 1/C" is +/// If the conversion was successful, the simplified expression "X * 1/C" is /// returned; otherwise, NULL is returned. 
/// static Instruction *CvtFDivConstToReciprocal(Value *Dividend, @@ -998,37 +1029,19 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) { if (Instruction *common = commonIRemTransforms(I)) return common; - // X urem C^2 -> X and C-1 - { const APInt *C; - if (match(Op1, m_Power2(C))) - return BinaryOperator::CreateAnd(Op0, - ConstantInt::get(I.getType(), *C-1)); - } - - // Turn A % (C << N), where C is 2^k, into A & ((C << N)-1) - if (match(Op1, m_Shl(m_Power2(), m_Value()))) { - Constant *N1 = Constant::getAllOnesValue(I.getType()); - Value *Add = Builder->CreateAdd(Op1, N1); - return BinaryOperator::CreateAnd(Op0, Add); - } - - // urem X, (select Cond, 2^C1, 2^C2) --> - // select Cond, (and X, C1-1), (and X, C2-1) - // when C1&C2 are powers of two. - { Value *Cond; const APInt *C1, *C2; - if (match(Op1, m_Select(m_Value(Cond), m_Power2(C1), m_Power2(C2)))) { - Value *TrueAnd = Builder->CreateAnd(Op0, *C1-1, Op1->getName()+".t"); - Value *FalseAnd = Builder->CreateAnd(Op0, *C2-1, Op1->getName()+".f"); - return SelectInst::Create(Cond, TrueAnd, FalseAnd); - } - } - // (zext A) urem (zext B) --> zext (A urem B) if (ZExtInst *ZOp0 = dyn_cast<ZExtInst>(Op0)) if (Value *ZOp1 = dyn_castZExtVal(Op1, ZOp0->getSrcTy())) return new ZExtInst(Builder->CreateURem(ZOp0->getOperand(0), ZOp1), I.getType()); + // X urem Y -> X and Y-1, where Y is a power of 2, + if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/true)) { + Constant *N1 = Constant::getAllOnesValue(I.getType()); + Value *Add = Builder->CreateAdd(Op1, N1); + return BinaryOperator::CreateAnd(Op0, Add); + } + return 0; } diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index 2defe63..59502fb 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -974,7 +974,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { return &SI; } - if (VectorType *VecTy = dyn_cast<VectorType>(SI.getType())) { + if (VectorType* VecTy = dyn_cast<VectorType>(SI.getType())) { unsigned VWidth = VecTy->getNumElements(); APInt UndefElts(VWidth, 0); APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth)); @@ -984,24 +984,6 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { return &SI; } - if (ConstantVector *CV = dyn_cast<ConstantVector>(CondVal)) { - // Form a shufflevector instruction. - SmallVector<Constant *, 8> Mask(VWidth); - Type *Int32Ty = Type::getInt32Ty(CV->getContext()); - for (unsigned i = 0; i != VWidth; ++i) { - Constant *Elem = cast<Constant>(CV->getOperand(i)); - if (ConstantInt *E = dyn_cast<ConstantInt>(Elem)) - Mask[i] = ConstantInt::get(Int32Ty, i + (E->isZero() ? 
VWidth : 0)); - else if (isa<UndefValue>(Elem)) - Mask[i] = UndefValue::get(Int32Ty); - else - return 0; - } - Constant *MaskVal = ConstantVector::get(Mask); - Value *V = Builder->CreateShuffleVector(TrueVal, FalseVal, MaskVal); - return ReplaceInstUsesWith(SI, V); - } - if (isa<ConstantAggregateZero>(CondVal)) { return ReplaceInstUsesWith(SI, FalseVal); } diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 8add1ea..a7bfe09 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -754,7 +754,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1); // If it's known zero, our sign bit is also zero. if (LHSKnownZero.isNegative()) - KnownZero |= LHSKnownZero; + KnownZero.setBit(KnownZero.getBitWidth() - 1); } break; case Instruction::URem: { diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index de8a3ac..d43093d 100644 --- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -125,17 +125,15 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) { // and that it is a binary operation which is cheap to scalarize. // otherwise return NULL. if (!PHIUser->hasOneUse() || !(PHIUser->use_back() == PN) || - !(isa<BinaryOperator>(PHIUser)) || - !CheapToScalarize(PHIUser, true)) + !(isa<BinaryOperator>(PHIUser)) || !CheapToScalarize(PHIUser, true)) return NULL; // Create a scalar PHI node that will replace the vector PHI node // just before the current PHI node. - PHINode * scalarPHI = cast<PHINode>( - InsertNewInstWith(PHINode::Create(EI.getType(), - PN->getNumIncomingValues(), ""), *PN)); + PHINode *scalarPHI = cast<PHINode>(InsertNewInstWith( + PHINode::Create(EI.getType(), PN->getNumIncomingValues(), ""), *PN)); // Scalarize each PHI operand. - for (unsigned i=0; i < PN->getNumIncomingValues(); i++) { + for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) { Value *PHIInVal = PN->getIncomingValue(i); BasicBlock *inBB = PN->getIncomingBlock(i); Value *Elt = EI.getIndexOperand(); @@ -145,17 +143,17 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) { // scalar PHI and the second operand is extracted from the other // vector operand. BinaryOperator *B0 = cast<BinaryOperator>(PHIUser); - unsigned opId = (B0->getOperand(0) == PN) ? 1: 0; - Value *Op = Builder->CreateExtractElement( - B0->getOperand(opId), Elt, B0->getOperand(opId)->getName()+".Elt"); + unsigned opId = (B0->getOperand(0) == PN) ? 1 : 0; + Value *Op = InsertNewInstWith( + ExtractElementInst::Create(B0->getOperand(opId), Elt, + B0->getOperand(opId)->getName() + ".Elt"), + *B0); Value *newPHIUser = InsertNewInstWith( - BinaryOperator::Create(B0->getOpcode(), scalarPHI,Op), - *B0); + BinaryOperator::Create(B0->getOpcode(), scalarPHI, Op), *B0); scalarPHI->addIncoming(newPHIUser, inBB); } else { // Scalarize PHI input: - Instruction *newEI = - ExtractElementInst::Create(PHIInVal, Elt, ""); + Instruction *newEI = ExtractElementInst::Create(PHIInVal, Elt, ""); // Insert the new instruction into the predecessor basic block. 
Instruction *pos = dyn_cast<Instruction>(PHIInVal); BasicBlock::iterator InsertPos; @@ -222,9 +220,9 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { // If there's a vector PHI feeding a scalar use through this extractelement // instruction, try to scalarize the PHI. if (PHINode *PN = dyn_cast<PHINode>(EI.getOperand(0))) { - Instruction *scalarPHI = scalarizePHI(EI, PN); - if (scalarPHI) - return (scalarPHI); + Instruction *scalarPHI = scalarizePHI(EI, PN); + if (scalarPHI) + return scalarPHI; } } @@ -496,6 +494,252 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) { return 0; } +/// Return true if we can evaluate the specified expression tree if the vector +/// elements were shuffled in a different order. +static bool CanEvaluateShuffled(Value *V, ArrayRef<int> Mask, + unsigned Depth = 5) { + // We can always reorder the elements of a constant. + if (isa<Constant>(V)) + return true; + + // We won't reorder vector arguments. No IPO here. + Instruction *I = dyn_cast<Instruction>(V); + if (!I) return false; + + // Two users may expect different orders of the elements. Don't try it. + if (!I->hasOneUse()) + return false; + + if (Depth == 0) return false; + + switch (I->getOpcode()) { + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::ICmp: + case Instruction::FCmp: + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::GetElementPtr: { + for (int i = 0, e = I->getNumOperands(); i != e; ++i) { + if (!CanEvaluateShuffled(I->getOperand(i), Mask, Depth-1)) + return false; + } + return true; + } + case Instruction::InsertElement: { + ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(2)); + if (!CI) return false; + int ElementNumber = CI->getLimitedValue(); + + // Verify that 'CI' does not occur twice in Mask. A single 'insertelement' + // can't put an element into multiple indices. + bool SeenOnce = false; + for (int i = 0, e = Mask.size(); i != e; ++i) { + if (Mask[i] == ElementNumber) { + if (SeenOnce) + return false; + SeenOnce = true; + } + } + return CanEvaluateShuffled(I->getOperand(0), Mask, Depth-1); + } + } + return false; +} + +/// Rebuild a new instruction just like 'I' but with the new operands given. +/// In the event of type mismatch, the type of the operands is correct. +static Value *BuildNew(Instruction *I, ArrayRef<Value*> NewOps) { + // We don't want to use the IRBuilder here because we want the replacement + // instructions to appear next to 'I', not the builder's insertion point. 
+ switch (I->getOpcode()) { + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + BinaryOperator *BO = cast<BinaryOperator>(I); + assert(NewOps.size() == 2 && "binary operator with #ops != 2"); + BinaryOperator *New = + BinaryOperator::Create(cast<BinaryOperator>(I)->getOpcode(), + NewOps[0], NewOps[1], "", BO); + if (isa<OverflowingBinaryOperator>(BO)) { + New->setHasNoUnsignedWrap(BO->hasNoUnsignedWrap()); + New->setHasNoSignedWrap(BO->hasNoSignedWrap()); + } + if (isa<PossiblyExactOperator>(BO)) { + New->setIsExact(BO->isExact()); + } + return New; + } + case Instruction::ICmp: + assert(NewOps.size() == 2 && "icmp with #ops != 2"); + return new ICmpInst(I, cast<ICmpInst>(I)->getPredicate(), + NewOps[0], NewOps[1]); + case Instruction::FCmp: + assert(NewOps.size() == 2 && "fcmp with #ops != 2"); + return new FCmpInst(I, cast<FCmpInst>(I)->getPredicate(), + NewOps[0], NewOps[1]); + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::FPTrunc: + case Instruction::FPExt: { + // It's possible that the mask has a different number of elements from + // the original cast. We recompute the destination type to match the mask. + Type *DestTy = + VectorType::get(I->getType()->getScalarType(), + NewOps[0]->getType()->getVectorNumElements()); + assert(NewOps.size() == 1 && "cast with #ops != 1"); + return CastInst::Create(cast<CastInst>(I)->getOpcode(), NewOps[0], DestTy, + "", I); + } + case Instruction::GetElementPtr: { + Value *Ptr = NewOps[0]; + ArrayRef<Value*> Idx = NewOps.slice(1); + GetElementPtrInst *GEP = GetElementPtrInst::Create(Ptr, Idx, "", I); + GEP->setIsInBounds(cast<GetElementPtrInst>(I)->isInBounds()); + return GEP; + } + } + llvm_unreachable("failed to rebuild vector instructions"); +} + +Value * +InstCombiner::EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) { + // Mask.size() does not need to be equal to the number of vector elements. 
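The comment closing the hunk above is the key subtlety of EvaluateInDifferentElementOrder: the shuffle mask, not the source vector, determines the result width, and a mask entry of -1 yields an undef lane. A small standalone sketch of those mask semantics, with std::optional modeling undef; this is illustrative only, not the LLVM implementation.

    #include <cassert>
    #include <optional>
    #include <vector>

    // Apply a shufflevector-style mask to a single source vector:
    // result[i] = src[mask[i]], with -1 producing an undefined lane.
    std::vector<std::optional<int>>
    applyMask(const std::vector<int> &Src, const std::vector<int> &Mask) {
      std::vector<std::optional<int>> Out;
      for (int M : Mask)
        Out.push_back(M < 0 ? std::nullopt
                            : std::optional<int>(Src[(size_t)M]));
      return Out;
    }

    int main() {
      std::vector<int> Src = {10, 20, 30, 40};
      // Mask is shorter than the source and reorders lanes; one lane is undef.
      auto Out = applyMask(Src, {2, -1, 0});
      assert(Out.size() == 3 && *Out[0] == 30 && !Out[1] && *Out[2] == 10);
      return 0;
    }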
+ + assert(V->getType()->isVectorTy() && "can't reorder non-vector elements"); + if (isa<UndefValue>(V)) { + return UndefValue::get(VectorType::get(V->getType()->getScalarType(), + Mask.size())); + } + if (isa<ConstantAggregateZero>(V)) { + return ConstantAggregateZero::get( + VectorType::get(V->getType()->getScalarType(), + Mask.size())); + } + if (Constant *C = dyn_cast<Constant>(V)) { + SmallVector<Constant *, 16> MaskValues; + for (int i = 0, e = Mask.size(); i != e; ++i) { + if (Mask[i] == -1) + MaskValues.push_back(UndefValue::get(Builder->getInt32Ty())); + else + MaskValues.push_back(Builder->getInt32(Mask[i])); + } + return ConstantExpr::getShuffleVector(C, UndefValue::get(C->getType()), + ConstantVector::get(MaskValues)); + } + + Instruction *I = cast<Instruction>(V); + switch (I->getOpcode()) { + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::ICmp: + case Instruction::FCmp: + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::Select: + case Instruction::GetElementPtr: { + SmallVector<Value*, 8> NewOps; + bool NeedsRebuild = (Mask.size() != I->getType()->getVectorNumElements()); + for (int i = 0, e = I->getNumOperands(); i != e; ++i) { + Value *V = EvaluateInDifferentElementOrder(I->getOperand(i), Mask); + NewOps.push_back(V); + NeedsRebuild |= (V != I->getOperand(i)); + } + if (NeedsRebuild) { + return BuildNew(I, NewOps); + } + return I; + } + case Instruction::InsertElement: { + int Element = cast<ConstantInt>(I->getOperand(2))->getLimitedValue(); + + // The insertelement was inserting at Element. Figure out which element + // that becomes after shuffling. The answer is guaranteed to be unique + // by CanEvaluateShuffled. + bool Found = false; + int Index = 0; + for (int e = Mask.size(); Index != e; ++Index) { + if (Mask[Index] == Element) { + Found = true; + break; + } + } + + if (!Found) + return UndefValue::get(I->getType()); + Value *V = EvaluateInDifferentElementOrder(I->getOperand(0), Mask); + return InsertElementInst::Create(V, I->getOperand(1), + Builder->getInt32(Index), "", I); + } + } + llvm_unreachable("failed to reorder elements of vector instruction!"); +} Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { Value *LHS = SVI.getOperand(0); @@ -527,9 +771,9 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { if (LHS == RHS || isa<UndefValue>(LHS)) { if (isa<UndefValue>(LHS) && LHS == RHS) { // shuffle(undef,undef,mask) -> undef. - Value* result = (VWidth == LHSWidth) + Value *Result = (VWidth == LHSWidth) ? LHS : UndefValue::get(SVI.getType()); - return ReplaceInstUsesWith(SVI, result); + return ReplaceInstUsesWith(SVI, Result); } // Remap any references to RHS to use LHS. 
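The insertelement case shown above must find where the inserted lane lands after shuffling; CanEvaluateShuffled has already guaranteed the element number occurs at most once in the mask, so a linear scan suffices and the answer is unique. A standalone sketch of that remapping; remapInsertIndex is a hypothetical name, not an LLVM helper.

    #include <cassert>
    #include <vector>

    // Return the post-shuffle position of the lane 'Element' was inserted
    // into, or -1 if the mask drops that lane entirely (the insert becomes
    // dead and the corresponding result lane is undef).
    int remapInsertIndex(const std::vector<int> &Mask, int Element) {
      for (size_t I = 0; I != Mask.size(); ++I)
        if (Mask[I] == Element)
          return (int)I;   // unique by construction
      return -1;
    }

    int main() {
      // insertelement wrote lane 2; after shuffling with mask {2,0,3} that
      // value now lives in result lane 0.
      assert(remapInsertIndex({2, 0, 3}, 2) == 0);
      // Mask {0,3} never reads lane 2: the inserted value is dropped.
      assert(remapInsertIndex({0, 3}, 2) == -1);
      return 0;
    }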
@@ -576,6 +820,11 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { if (isRHSID) return ReplaceInstUsesWith(SVI, RHS); } + if (isa<UndefValue>(RHS) && CanEvaluateShuffled(LHS, Mask)) { + Value *V = EvaluateInDifferentElementOrder(LHS, Mask); + return ReplaceInstUsesWith(SVI, V); + } + // If the LHS is a shufflevector itself, see if we can combine it with this // one without producing an unusual shuffle. // Cases that might be simplified: diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index c6115e3..ec10751 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1483,7 +1483,7 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) { Module *M = II->getParent()->getParent()->getParent(); Function *F = Intrinsic::getDeclaration(M, Intrinsic::donothing); InvokeInst::Create(F, II->getNormalDest(), II->getUnwindDest(), - ArrayRef<Value *>(), "", II->getParent()); + None, "", II->getParent()); } return EraseInstFromFunction(MI); } diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 623c470..22851b4 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -39,9 +39,9 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/Endian.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/system_error.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/BlackList.h" #include "llvm/Transforms/Utils/Local.h" @@ -56,6 +56,7 @@ static const uint64_t kDefaultShadowOffset32 = 1ULL << 29; static const uint64_t kDefaultShadowOffset64 = 1ULL << 44; static const uint64_t kDefaultShort64bitShadowOffset = 0x7FFF8000; // < 2G. static const uint64_t kPPC64_ShadowOffset64 = 1ULL << 41; +static const uint64_t kMIPS32_ShadowOffset32 = 0x0aaa8000; static const size_t kMaxStackMallocSize = 1 << 16; // 64K static const uintptr_t kCurrentStackFrameMagic = 0x41B58AB3; @@ -208,6 +209,8 @@ static ShadowMapping getShadowMapping(const Module &M, int LongSize, bool IsMacOSX = TargetTriple.getOS() == llvm::Triple::MacOSX; bool IsPPC64 = TargetTriple.getArch() == llvm::Triple::ppc64; bool IsX86_64 = TargetTriple.getArch() == llvm::Triple::x86_64; + bool IsMIPS32 = TargetTriple.getArch() == llvm::Triple::mips || + TargetTriple.getArch() == llvm::Triple::mipsel; ShadowMapping Mapping; @@ -217,7 +220,8 @@ static ShadowMapping getShadowMapping(const Module &M, int LongSize, Mapping.OrShadowOffset = !IsPPC64 && !ClShort64BitOffset; Mapping.Offset = (IsAndroid || ZeroBaseShadow) ? 0 : - (LongSize == 32 ? kDefaultShadowOffset32 : + (LongSize == 32 ? + (IsMIPS32 ? kMIPS32_ShadowOffset32 : kDefaultShadowOffset32) : IsPPC64 ? 
kPPC64_ShadowOffset64 : kDefaultShadowOffset64); if (!ZeroBaseShadow && ClShort64BitOffset && IsX86_64 && !IsMacOSX) { assert(LongSize == 64); @@ -520,7 +524,7 @@ ModulePass *llvm::createAddressSanitizerModulePass( } static size_t TypeSizeToSizeIndex(uint32_t TypeSize) { - size_t Res = CountTrailingZeros_32(TypeSize / 8); + size_t Res = countTrailingZeros(TypeSize / 8); assert(Res < kNumberOfAccessSizes); return Res; } @@ -1270,6 +1274,10 @@ void FunctionStackPoisoner::poisonRedZones( RedzoneSize(), 1ULL << Mapping.Scale, kAsanStackPartialRedzoneMagic); + Poison = + ASan.TD->isLittleEndian() + ? support::endian::byte_swap<uint32_t, support::little>(Poison) + : support::endian::byte_swap<uint32_t, support::big>(Poison); } Value *PartialPoison = ConstantInt::get(RZTy, Poison); IRB.CreateStore(PartialPoison, IRB.CreateIntToPtr(Ptr, RZPtrTy)); diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt index 1c9e053..aa265a4 100644 --- a/lib/Transforms/Instrumentation/CMakeLists.txt +++ b/lib/Transforms/Instrumentation/CMakeLists.txt @@ -2,6 +2,7 @@ add_llvm_library(LLVMInstrumentation AddressSanitizer.cpp BlackList.cpp BoundsChecking.cpp + DebugIR.cpp EdgeProfiling.cpp GCOVProfiling.cpp MemorySanitizer.cpp diff --git a/lib/Transforms/Instrumentation/DebugIR.cpp b/lib/Transforms/Instrumentation/DebugIR.cpp new file mode 100644 index 0000000..020804f --- /dev/null +++ b/lib/Transforms/Instrumentation/DebugIR.cpp @@ -0,0 +1,310 @@ +//===--- DebugIR.cpp - Transform debug metadata to allow debugging IR -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// A Module transform pass that emits a succinct version of the IR and replaces +// the source file metadata to allow debuggers to step through the IR. +// +// The location where the IR file is emitted is the same as the directory +// operand of the !llvm.dbg.cu metadata node present in the input module. The +// file name is constructed from the original file name by stripping the +// extension and replacing it with "-debug-ll" or the Postfix string specified +// at construction. +// +// FIXME: instead of replacing debug metadata, additional metadata should be +// used to point capable debuggers to the IR file without destroying the +// mapping to the original source file. +// +// FIXME: this pass should not depend on the existence of debug metadata in +// the module as it does now. Instead, it should use DIBuilder to create the +// required metadata. +// +//===----------------------------------------------------------------------===// + +#include <string> + +#include "llvm/ADT/OwningPtr.h" +#include "llvm/ADT/ValueMap.h" +#include "llvm/Assembly/AssemblyAnnotationWriter.h" +#include "llvm/DebugInfo.h" +#include "llvm/DIBuilder.h" +#include "llvm/InstVisitor.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ToolOutputFile.h" +#include "llvm/Support/FormattedStream.h" +using namespace llvm; + +namespace { + +/// Builds a map of Value* to line numbers on which the Value appears in a +/// textual representation of the IR by plugging into the AssemblyWriter by +/// masquerading as an AssemblyAnnotationWriter.
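The ValueToLineMap trick described above is worth restating outside LLVM: print everything once while counting output lines, and record the line on which each item is emitted. A toy standalone model follows, in which a plain loop stands in for the AssemblyWriter callback.

    #include <cassert>
    #include <map>
    #include <string>
    #include <vector>

    // Model of the ValueToLineMap idea: emit every item once, counting
    // lines as we go, and remember the line each item landed on. DebugIR
    // does this by printing the Module to a null stream and hooking
    // printInfoComment.
    int main() {
      std::vector<std::string> Items = {"define i32 @f()", "  ret i32 0", "}"};
      std::map<std::string, unsigned> LineOf;
      unsigned Line = 1;
      for (const std::string &I : Items)
        LineOf[I] = Line++; // record, then advance past the printed line
      assert(LineOf["  ret i32 0"] == 2);
      return 0;
    }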
+class ValueToLineMap : public AssemblyAnnotationWriter { + ValueMap<const Value *, unsigned int> Lines; + typedef ValueMap<const Value *, unsigned int>::const_iterator LineIter; + +public: + + /// Prints Module to a null buffer in order to build the map of Value pointers + /// to line numbers. + ValueToLineMap(Module *M) { + raw_null_ostream ThrowAway; + M->print(ThrowAway, this); + } + + // This function is called after an Instruction, GlobalValue, or GlobalAlias + // is printed. + void printInfoComment(const Value &V, formatted_raw_ostream &Out) { + Out.flush(); + Lines.insert(std::make_pair(&V, Out.getLine() + 1)); + } + + /// If V appears on a line in the textual IR representation, sets Line to the + /// line number and returns true, otherwise returns false. + bool getLine(const Value *V, unsigned int &Line) const { + LineIter i = Lines.find(V); + if (i != Lines.end()) { + Line = i->second; + return true; + } + return false; + } +}; + +/// Removes debug intrinsics like llvm.dbg.declare and llvm.dbg.value. +class DebugIntrinsicsRemover : public InstVisitor<DebugIntrinsicsRemover> { + void remove(Instruction &I) { I.eraseFromParent(); } + +public: + void visitDbgDeclareInst(DbgDeclareInst &I) { remove(I); } + void visitDbgValueInst(DbgValueInst &I) { remove(I); } + void visitDbgInfoIntrinsic(DbgInfoIntrinsic &I) { remove(I); } +}; + +/// Removes debug metadata (!dbg) nodes from all instructions as well as +/// metadata named "llvm.dbg.cu" in the Module. +class DebugMetadataRemover : public InstVisitor<DebugMetadataRemover> { +public: + void visitInstruction(Instruction &I) { + if (I.getMetadata(LLVMContext::MD_dbg)) + I.setMetadata(LLVMContext::MD_dbg, 0); + } + + void run(Module *M) { + // Remove debug metadata attached to instructions + visit(M); + + // Remove CU named metadata (and all children nodes) + NamedMDNode *Node = M->getNamedMetadata("llvm.dbg.cu"); + M->eraseNamedMetadata(Node); + } +}; + +/// Replaces line number metadata attached to Instruction nodes with new line +/// numbers provided by the ValueToLineMap. +class LineNumberReplacer : public InstVisitor<LineNumberReplacer> { + /// Table of line numbers + const ValueToLineMap &LineTable; + + /// Table of cloned values + const ValueToValueMapTy &VMap; + + /// Directory of debug metadata + const DebugInfoFinder &Finder; + +public: + LineNumberReplacer(const ValueToLineMap &VLM, const DebugInfoFinder &Finder, + const ValueToValueMapTy &VMap) + : LineTable(VLM), VMap(VMap), Finder(Finder) {} + + void visitInstruction(Instruction &I) { + DebugLoc Loc(I.getDebugLoc()); + + unsigned Col = 0; // FIXME: support columns + unsigned Line; + if (!LineTable.getLine(VMap.lookup(&I), Line)) + // Instruction has no line, it may have been removed (in the module that + // will be passed to the debugger) so there is nothing to do here. + return; + + DebugLoc NewLoc; + if (!Loc.isUnknown()) + // I had a previous debug location: re-use the DebugLoc + NewLoc = DebugLoc::get(Line, Col, Loc.getScope(I.getContext()), + Loc.getInlinedAt(I.getContext())); + else if (MDNode *scope = findFunctionMD(I.getParent()->getParent())) + // I had no previous debug location, but M has some debug information + NewLoc = + DebugLoc::get(Line, Col, scope, /*FIXME: inlined instructions*/ 0); + else + // Neither I nor M has any debug information -- nothing to do here.
+ // FIXME: support debugging of undecorated IR (generated by clang without + // the -g option) + return; + + addDebugLocation(const_cast<Instruction &>(I), NewLoc); + } + +private: + + /// Returns the MDNode that corresponds with F + MDNode *findFunctionMD(const Function *F) { + for (DebugInfoFinder::iterator i = Finder.subprogram_begin(), + e = Finder.subprogram_end(); + i != e; ++i) { + DISubprogram S(*i); + if (S.getFunction() == F) + return *i; + } + // cannot find F -- likely means there is no debug information + return 0; + } + + void addDebugLocation(Instruction &I, DebugLoc Loc) { + MDNode *MD = Loc.getAsMDNode(I.getContext()); + I.setMetadata(LLVMContext::MD_dbg, MD); + } +}; + +class DebugIR : public ModulePass { + std::string Postfix; + std::string Filename; + + /// Flags to control the verbosity of the generated IR file + bool hideDebugIntrinsics; + bool hideDebugMetadata; + +public: + static char ID; + + const char *getPassName() const { return "DebugIR"; } + + // FIXME: figure out if we are compiling something that already exists on disk + // in text IR form, in which case we can omit outputting a new IR file, or if + // we're building something from memory where we actually need to emit a new + // IR file for the debugger. + + /// Output a file with the same base name as the original, but with the + /// postfix "-debug-ll" appended. + DebugIR() + : ModulePass(ID), Postfix("-debug-ll"), hideDebugIntrinsics(true), + hideDebugMetadata(true) {} + + /// Customize the postfix string used to replace the extension of the + /// original filename that appears in the !llvm.dbg.cu metadata node. + DebugIR(StringRef postfix, bool hideDebugIntrinsics, bool hideDebugMetadata) + : ModulePass(ID), Postfix(postfix), + hideDebugIntrinsics(hideDebugIntrinsics), + hideDebugMetadata(hideDebugMetadata) {} + +private: + // Modify the filename embedded in the Compilation-Unit debug information of M + bool replaceFilename(Module &M, const DebugInfoFinder &Finder) { + bool changed = false; + + // Sanity check -- if llvm.dbg.cu node exists, the DebugInfoFinder + // better have found at least one CU! + if (M.getNamedMetadata("llvm.dbg.cu")) + assert(Finder.compile_unit_count() > 0 && + "Found no compile units but llvm.dbg.cu node exists"); + + for (DebugInfoFinder::iterator i = Finder.compile_unit_begin(), + e = Finder.compile_unit_end(); + i != e; ++i) { + DICompileUnit CU(*i); + Filename = CU.getFilename(); + + // Replace extension with postfix + size_t dot = Filename.find_last_of("."); + if (dot != std::string::npos) + Filename.erase(dot); + Filename += Postfix; + + CU.setFilename(Filename, M.getContext()); + changed = true; + } + return changed; + } + + /// Replace existing line number metadata with line numbers that correspond + /// with the IR file that is seen by the debugger. 
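The replaceFilename step of the DebugIR pass above reduces to a small string edit: strip the extension if there is one, then append the postfix. A standalone sketch of that edit; rewriteFilename is an illustrative name, not the pass's API.

    #include <cassert>
    #include <string>

    // Standalone model of DebugIR's filename rewrite: drop the extension
    // (if any) and append the configured postfix, e.g. "-debug-ll".
    std::string rewriteFilename(std::string Filename,
                                const std::string &Postfix) {
      size_t Dot = Filename.find_last_of('.');
      if (Dot != std::string::npos)
        Filename.erase(Dot);
      return Filename + Postfix;
    }

    int main() {
      assert(rewriteFilename("foo.c", "-debug-ll") == "foo-debug-ll");
      assert(rewriteFilename("noext", "-debug-ll") == "noext-debug-ll");
      return 0;
    }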
+ void addLineNumberMetadata(Module *M, const ValueToLineMap &VLM, + const ValueToValueMapTy &VMap, + const DebugInfoFinder &Finder) { + LineNumberReplacer Replacer(VLM, Finder, VMap); + Replacer.visit(M); + } + + void writeDebugBitcode(Module *M) { + std::string error; + tool_output_file OutFile(Filename.c_str(), error); + OutFile.keep(); + formatted_raw_ostream OS; + OS.setStream(OutFile.os()); + M->print(OS, 0); + } + + void removeDebugIntrinsics(Module *M) { + DebugIntrinsicsRemover Remover; + Remover.visit(M); + } + + void removeDebugMetadata(Module *M) { + DebugMetadataRemover Remover; + Remover.run(M); + } + + void updateAndWriteDebugIRFile(Module *M, const DebugInfoFinder &Finder) { + // The module we output in text form for a debugger to open is stripped of + // 'extras' like debug intrinsics that end up in DWARF anyways and just + // clutter the debug experience. + + ValueToValueMapTy VMap; + Module *DebuggerM = CloneModule(M, VMap); + + if (hideDebugIntrinsics) + removeDebugIntrinsics(DebuggerM); + + if (hideDebugMetadata) + removeDebugMetadata(DebuggerM); + + // FIXME: remove all debug metadata from M once we support generating DWARF + // subprogram attributes. + + ValueToLineMap LineTable(DebuggerM); + addLineNumberMetadata(M, LineTable, VMap, Finder); + writeDebugBitcode(DebuggerM); + } + + bool runOnModule(Module &M) { + // Stores existing debug info needed when creating new line number entries. + DebugInfoFinder Finder; + Finder.processModule(M); + + bool changed = replaceFilename(M, Finder); + if (changed) + updateAndWriteDebugIRFile(&M, Finder); + return changed; + } +}; + +} // anonymous namespace + +char DebugIR::ID = 0; +INITIALIZE_PASS(DebugIR, "debug-ir", "Enable debugging IR", false, false) + +ModulePass *llvm::createDebugIRPass(StringRef FilenamePostfix, + bool hideDebugIntrinsics, + bool hideDebugMetadata) { + return new DebugIR(FilenamePostfix, hideDebugIntrinsics, hideDebugMetadata); +} diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 2edd151..3ce9cf6 100644 --- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -34,7 +34,7 @@ #include "llvm/Support/DebugLoc.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/InstIterator.h" -#include "llvm/Support/PathV2.h" +#include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include <string> diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 4e75904..a3a688d 100644 --- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -74,6 +74,7 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Triple.h" #include "llvm/ADT/ValueMap.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" @@ -228,7 +229,7 @@ class MemorySanitizer : public FunctionPass { MDNode *ColdCallWeights; /// \brief Branch weights for origin store. MDNode *OriginStoreWeights; - /// \bried Path to blacklist file. + /// \brief Path to blacklist file. SmallString<64> BlacklistFile; /// \brief The blacklist. 
OwningPtr<BlackList> BL; @@ -299,30 +300,30 @@ void MemorySanitizer::initializeCallbacks(Module &M) { RetvalTLS = new GlobalVariable( M, ArrayType::get(IRB.getInt64Ty(), 8), false, GlobalVariable::ExternalLinkage, 0, "__msan_retval_tls", 0, - GlobalVariable::GeneralDynamicTLSModel); + GlobalVariable::InitialExecTLSModel); RetvalOriginTLS = new GlobalVariable( M, OriginTy, false, GlobalVariable::ExternalLinkage, 0, - "__msan_retval_origin_tls", 0, GlobalVariable::GeneralDynamicTLSModel); + "__msan_retval_origin_tls", 0, GlobalVariable::InitialExecTLSModel); ParamTLS = new GlobalVariable( M, ArrayType::get(IRB.getInt64Ty(), 1000), false, GlobalVariable::ExternalLinkage, 0, "__msan_param_tls", 0, - GlobalVariable::GeneralDynamicTLSModel); + GlobalVariable::InitialExecTLSModel); ParamOriginTLS = new GlobalVariable( M, ArrayType::get(OriginTy, 1000), false, GlobalVariable::ExternalLinkage, - 0, "__msan_param_origin_tls", 0, GlobalVariable::GeneralDynamicTLSModel); + 0, "__msan_param_origin_tls", 0, GlobalVariable::InitialExecTLSModel); VAArgTLS = new GlobalVariable( M, ArrayType::get(IRB.getInt64Ty(), 1000), false, GlobalVariable::ExternalLinkage, 0, "__msan_va_arg_tls", 0, - GlobalVariable::GeneralDynamicTLSModel); + GlobalVariable::InitialExecTLSModel); VAArgOverflowSizeTLS = new GlobalVariable( M, IRB.getInt64Ty(), false, GlobalVariable::ExternalLinkage, 0, "__msan_va_arg_overflow_size_tls", 0, - GlobalVariable::GeneralDynamicTLSModel); + GlobalVariable::InitialExecTLSModel); OriginTLS = new GlobalVariable( M, IRB.getInt32Ty(), false, GlobalVariable::ExternalLinkage, 0, - "__msan_origin_tls", 0, GlobalVariable::GeneralDynamicTLSModel); + "__msan_origin_tls", 0, GlobalVariable::InitialExecTLSModel); // We insert an empty inline asm after __msan_report* to avoid callback merge. EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false), @@ -365,11 +366,13 @@ bool MemorySanitizer::doInitialization(Module &M) { appendToGlobalCtors(M, cast<Function>(M.getOrInsertFunction( "__msan_init", IRB.getVoidTy(), NULL)), 0); - new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage, - IRB.getInt32(TrackOrigins), "__msan_track_origins"); + if (TrackOrigins) + new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage, + IRB.getInt32(TrackOrigins), "__msan_track_origins"); - new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage, - IRB.getInt32(ClKeepGoing), "__msan_keep_going"); + if (ClKeepGoing) + new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage, + IRB.getInt32(ClKeepGoing), "__msan_keep_going"); return true; } @@ -768,14 +771,21 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (AI->hasByValAttr()) { // ByVal pointer itself has clean shadow. We copy the actual // argument shadow to the underlying memory. + // Figure out maximal valid memcpy alignment. 
+ unsigned ArgAlign = AI->getParamAlignment(); + if (ArgAlign == 0) { + Type *EltType = A->getType()->getPointerElementType(); + ArgAlign = MS.TD->getABITypeAlignment(EltType); + } + unsigned CopyAlign = std::min(ArgAlign, kShadowTLSAlignment); Value *Cpy = EntryIRB.CreateMemCpy( - getShadowPtr(V, EntryIRB.getInt8Ty(), EntryIRB), - Base, Size, AI->getParamAlignment()); + getShadowPtr(V, EntryIRB.getInt8Ty(), EntryIRB), Base, Size, + CopyAlign); DEBUG(dbgs() << " ByValCpy: " << *Cpy << "\n"); (void)Cpy; *ShadowPtr = getCleanShadow(V); } else { - *ShadowPtr = EntryIRB.CreateLoad(Base); + *ShadowPtr = EntryIRB.CreateAlignedLoad(Base, kShadowTLSAlignment); } DEBUG(dbgs() << " ARG: " << *AI << " ==> " << **ShadowPtr << "\n"); @@ -784,7 +794,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { setOrigin(A, EntryIRB.CreateLoad(OriginPtr)); } } - ArgOffset += DataLayout::RoundUpAlignment(Size, 8); + ArgOffset += DataLayout::RoundUpAlignment(Size, kShadowTLSAlignment); } assert(*ShadowPtr && "Could not find shadow for an argument"); return *ShadowPtr; @@ -1963,9 +1973,29 @@ struct VarArgAMD64Helper : public VarArgHelper { } }; -VarArgHelper* CreateVarArgHelper(Function &Func, MemorySanitizer &Msan, +/// \brief A no-op implementation of VarArgHelper. +struct VarArgNoOpHelper : public VarArgHelper { + VarArgNoOpHelper(Function &F, MemorySanitizer &MS, + MemorySanitizerVisitor &MSV) {} + + void visitCallSite(CallSite &CS, IRBuilder<> &IRB) {} + + void visitVAStartInst(VAStartInst &I) {} + + void visitVACopyInst(VACopyInst &I) {} + + void finalizeInstrumentation() {} +}; + +VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan, MemorySanitizerVisitor &Visitor) { - return new VarArgAMD64Helper(Func, Msan, Visitor); + // VarArg handling is only implemented on AMD64. False positives are possible + // on other platforms. + llvm::Triple TargetTriple(Func.getParent()->getTargetTriple()); + if (TargetTriple.getArch() == llvm::Triple::x86_64) + return new VarArgAMD64Helper(Func, Msan, Visitor); + else + return new VarArgNoOpHelper(Func, Msan, Visitor); } } // namespace diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 299060a..318fa3f 100644 --- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -579,7 +579,7 @@ int ThreadSanitizer::getMemoryAccessFuncIndex(Value *Addr) { // Ignore all unusual sizes. 
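The CreateVarArgHelper change in the MemorySanitizer hunk above is a small strategy-pattern dispatch: only x86_64 gets the real va_arg shadow model, and every other target gets hooks that do nothing rather than an AMD64-specific model that would compute wrong shadow. A standalone sketch of that shape; all names here are illustrative, not MSan's.

    #include <cassert>
    #include <memory>
    #include <string>

    struct VarArgModel {
      virtual ~VarArgModel() {}
      virtual bool instrumentsVarArgs() const = 0;
    };
    struct AMD64Model : VarArgModel {
      bool instrumentsVarArgs() const override { return true; }
    };
    struct NoOpModel : VarArgModel {
      bool instrumentsVarArgs() const override { return false; }
    };

    // Pick the real implementation only for the one target it understands;
    // everything else gets the safe no-op strategy.
    std::unique_ptr<VarArgModel> createModel(const std::string &Arch) {
      if (Arch == "x86_64")
        return std::make_unique<AMD64Model>();
      return std::make_unique<NoOpModel>();
    }

    int main() {
      assert(createModel("x86_64")->instrumentsVarArgs());
      assert(!createModel("mips")->instrumentsVarArgs());
      return 0;
    }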
return -1; } - size_t Idx = CountTrailingZeros_32(TypeSize / 8); + size_t Idx = countTrailingZeros(TypeSize / 8); assert(Idx < kNumberOfAccessSizes); return Idx; } diff --git a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp index e3d51d5..fc5cf4e 100644 --- a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp +++ b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp @@ -30,6 +30,7 @@ #include "ObjCARCAliasAnalysis.h" #include "ProvenanceAnalysis.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" @@ -107,6 +108,12 @@ namespace { return std::make_pair(Vector.begin() + Pair.first->second, false); } + iterator find(const KeyT &Key) { + typename MapTy::iterator It = Map.find(Key); + if (It == Map.end()) return Vector.end(); + return Vector.begin() + It->second; + } + const_iterator find(const KeyT &Key) const { typename MapTy::const_iterator It = Map.find(Key); if (It == Map.end()) return Vector.end(); @@ -253,6 +260,40 @@ static bool DoesRetainableObjPtrEscape(const User *Ptr) { return false; } +/// This is a wrapper around getUnderlyingObjCPtr along the lines of +/// GetUnderlyingObjects except that it returns early when it sees the first +/// alloca. +static inline bool AreAnyUnderlyingObjectsAnAlloca(const Value *V) { + SmallPtrSet<const Value *, 4> Visited; + SmallVector<const Value *, 4> Worklist; + Worklist.push_back(V); + do { + const Value *P = Worklist.pop_back_val(); + P = GetUnderlyingObjCPtr(P); + + if (isa<AllocaInst>(P)) + return true; + + if (!Visited.insert(P)) + continue; + + if (const SelectInst *SI = dyn_cast<const SelectInst>(P)) { + Worklist.push_back(SI->getTrueValue()); + Worklist.push_back(SI->getFalseValue()); + continue; + } + + if (const PHINode *PN = dyn_cast<const PHINode>(P)) { + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) + Worklist.push_back(PN->getIncomingValue(i)); + continue; + } + } while (!Worklist.empty()); + + return false; +} + + /// @} /// /// \defgroup ARCOpt ARC Optimization. @@ -300,18 +341,18 @@ STATISTIC(NumNoops, "Number of no-op objc calls eliminated"); STATISTIC(NumPartialNoops, "Number of partially no-op objc calls eliminated"); STATISTIC(NumAutoreleases,"Number of autoreleases converted to releases"); STATISTIC(NumRets, "Number of return value forwarding " - "retain+autoreleaes eliminated"); + "retain+autoreleases eliminated"); STATISTIC(NumRRs, "Number of retain+release paths eliminated"); STATISTIC(NumPeeps, "Number of calls peephole-optimized"); +#ifndef NDEBUG STATISTIC(NumRetainsBeforeOpt, - "Number of retains before optimization."); + "Number of retains before optimization"); STATISTIC(NumReleasesBeforeOpt, - "Number of releases before optimization."); -#ifndef NDEBUG + "Number of releases before optimization"); STATISTIC(NumRetainsAfterOpt, - "Number of retains after optimization."); + "Number of retains after optimization"); STATISTIC(NumReleasesAfterOpt, - "Number of releases after optimization."); + "Number of releases after optimization"); #endif namespace { @@ -414,8 +455,13 @@ namespace { /// sequence. SmallPtrSet<Instruction *, 2> ReverseInsertPts; + /// If this is true, we cannot perform code motion but can still remove + /// retain/release pairs. 
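AreAnyUnderlyingObjectsAnAlloca above is a standard worklist-plus-visited-set walk with an early exit: selects and phis fan out into their incoming values, and the visited set is what makes cyclic phi webs terminate. A standalone model under those assumptions, with a toy node type standing in for LLVM values.

    #include <cassert>
    #include <set>
    #include <vector>

    enum Kind { Alloca, Select, Phi, Other };
    struct Node {
      Kind K;
      std::vector<const Node *> Operands; // select/phi incoming values
    };

    // Worklist walk, returning early at the first alloca; the visited set
    // makes cyclic phi graphs terminate.
    bool anyUnderlyingAlloca(const Node *Root) {
      std::set<const Node *> Visited;
      std::vector<const Node *> Worklist = {Root};
      do {
        const Node *P = Worklist.back();
        Worklist.pop_back();
        if (P->K == Alloca)
          return true;
        if (!Visited.insert(P).second)
          continue;
        if (P->K == Select || P->K == Phi)
          for (const Node *Op : P->Operands)
            Worklist.push_back(Op);
      } while (!Worklist.empty());
      return false;
    }

    int main() {
      Node A{Alloca, {}}, G{Other, {}};
      Node Phi1{Phi, {}};
      Phi1.Operands = {&Phi1, &A}; // self-loop plus an alloca input
      Node S{Select, {&G, &Phi1}};
      assert(anyUnderlyingAlloca(&S));
      assert(!anyUnderlyingAlloca(&G));
      return 0;
    }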
+ bool CFGHazardAfflicted; + RRInfo() : - KnownSafe(false), IsTailCallRelease(false), ReleaseMetadata(0) {} + KnownSafe(false), IsTailCallRelease(false), ReleaseMetadata(0), + CFGHazardAfflicted(false) {} void clear(); @@ -431,6 +477,7 @@ void RRInfo::clear() { ReleaseMetadata = 0; Calls.clear(); ReverseInsertPts.clear(); + CFGHazardAfflicted = false; } namespace { @@ -457,10 +504,12 @@ namespace { Seq(S_None) {} void SetKnownPositiveRefCount() { + DEBUG(dbgs() << "Setting Known Positive.\n"); KnownPositiveRefCount = true; } void ClearKnownPositiveRefCount() { + DEBUG(dbgs() << "Clearing Known Positive.\n"); KnownPositiveRefCount = false; } @@ -516,6 +565,7 @@ PtrState::Merge(const PtrState &Other, bool TopDown) { RRI.IsTailCallRelease = RRI.IsTailCallRelease && Other.RRI.IsTailCallRelease; RRI.Calls.insert(Other.RRI.Calls.begin(), Other.RRI.Calls.end()); + RRI.CFGHazardAfflicted |= Other.RRI.CFGHazardAfflicted; // Merge the insert point sets. If there are any differences, // that makes this a partial merge. @@ -587,14 +637,26 @@ namespace { /// definition. void SetAsExit() { BottomUpPathCount = 1; } + /// Attempt to find the PtrState object describing the top down state for + /// pointer Arg. Return a new initialized PtrState describing the top down + /// state for Arg if we do not find one. PtrState &getPtrTopDownState(const Value *Arg) { return PerPtrTopDown[Arg]; } + /// Attempt to find the PtrState object describing the bottom up state for + /// pointer Arg. Return a new initialized PtrState describing the bottom up + /// state for Arg if we do not find one. PtrState &getPtrBottomUpState(const Value *Arg) { return PerPtrBottomUp[Arg]; } + /// Attempt to find the PtrState object describing the bottom up state for + /// pointer Arg. + ptr_iterator findPtrBottomUpState(const Value *Arg) { + return PerPtrBottomUp.find(Arg); + } + void clearBottomUpPointers() { PerPtrBottomUp.clear(); } @@ -608,13 +670,20 @@ namespace { void MergePred(const BBState &Other); void MergeSucc(const BBState &Other); - /// Return the number of possible unique paths from an entry to an exit + /// Compute the number of possible unique paths from an entry to an exit /// which pass through this block. This is only valid after both the /// top-down and bottom-up traversals are complete. - unsigned GetAllPathCount() const { + /// + /// Returns true if overflow occurred. Returns false if overflow did not + /// occur. + bool GetAllPathCountWithOverflow(unsigned &PathCount) const { assert(TopDownPathCount != 0); assert(BottomUpPathCount != 0); - return TopDownPathCount * BottomUpPathCount; + unsigned long long Product = + (unsigned long long)TopDownPathCount*BottomUpPathCount; + PathCount = Product; + // Overflow occurred if any of the upper bits of Product are set. + return Product >> 32; } // Specialized CFG utilities. @@ -992,6 +1061,9 @@ namespace { bool Changed; ProvenanceAnalysis PA; + // This is used to track if a pointer is stored into an alloca. + DenseSet<const Value *> MultiOwnersSet; + /// A flag indicating whether this optimization pass should run. bool Run; @@ -1440,11 +1512,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { case IC_RetainBlock: // If we strength reduce an objc_retainBlock to an objc_retain, continue // onto the objc_retain peephole optimizations. Otherwise break.
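GetAllPathCountWithOverflow in the hunk above widens the multiplication to 64 bits and treats any bit above bit 31 as overflow, so callers can bail out instead of acting on a truncated count. The same idiom in a standalone form:

    #include <cassert>
    #include <cstdint>

    // Model of GetAllPathCountWithOverflow: multiply two 32-bit path counts
    // in 64 bits and report overflow if anything spills past bit 31.
    bool pathCountWithOverflow(uint32_t TopDown, uint32_t BottomUp,
                               uint32_t &PathCount) {
      uint64_t Product = (uint64_t)TopDown * BottomUp;
      PathCount = (uint32_t)Product;   // truncated result
      return (Product >> 32) != 0;     // true on overflow
    }

    int main() {
      uint32_t PC;
      assert(!pathCountWithOverflow(1000, 1000, PC) && PC == 1000000);
      assert(pathCountWithOverflow(1u << 20, 1u << 20, PC)); // 2^40 overflows
      return 0;
    }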
- if (!OptimizeRetainBlockCall(F, Inst, Class)) - break; - // FALLTHROUGH - case IC_Retain: - ++NumRetainsBeforeOpt; + OptimizeRetainBlockCall(F, Inst, Class); break; case IC_RetainRV: if (OptimizeRetainRVCall(F, Inst)) @@ -1453,9 +1521,6 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { case IC_AutoreleaseRV: OptimizeAutoreleaseRVCall(F, Inst, Class); break; - case IC_Release: - ++NumReleasesBeforeOpt; - break; } // objc_autorelease(x) -> objc_release(x) if x is otherwise unused. @@ -1472,8 +1537,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { CallInst *NewCall = CallInst::Create(getReleaseCallee(F.getParent()), Call->getArgOperand(0), "", Call); - NewCall->setMetadata(ImpreciseReleaseMDKind, - MDNode::get(C, ArrayRef<Value *>())); + NewCall->setMetadata(ImpreciseReleaseMDKind, MDNode::get(C, None)); DEBUG(dbgs() << "Replacing autorelease{,RV}(x) with objc_release(x) " "since x is otherwise unused.\nOld: " << *Call << "\nNew: " @@ -1640,6 +1704,7 @@ static void CheckForUseCFGHazard(const Sequence SuccSSeq, PtrState &S, bool &SomeSuccHasSame, bool &AllSuccsHaveSame, + bool &NotAllSeqEqualButKnownSafe, bool &ShouldContinue) { switch (SuccSSeq) { case S_CanRelease: { @@ -1647,6 +1712,7 @@ static void CheckForUseCFGHazard(const Sequence SuccSSeq, S.ClearSequenceProgress(); break; } + S.RRI.CFGHazardAfflicted = true; ShouldContinue = true; break; } @@ -1658,6 +1724,8 @@ static void CheckForUseCFGHazard(const Sequence SuccSSeq, case S_MovableRelease: if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) AllSuccsHaveSame = false; + else + NotAllSeqEqualButKnownSafe = true; break; case S_Retain: llvm_unreachable("bottom-up pointer in retain state!"); @@ -1673,7 +1741,8 @@ static void CheckForCanReleaseCFGHazard(const Sequence SuccSSeq, const bool SuccSRRIKnownSafe, PtrState &S, bool &SomeSuccHasSame, - bool &AllSuccsHaveSame) { + bool &AllSuccsHaveSame, + bool &NotAllSeqEqualButKnownSafe) { switch (SuccSSeq) { case S_CanRelease: SomeSuccHasSame = true; @@ -1684,6 +1753,8 @@ static void CheckForCanReleaseCFGHazard(const Sequence SuccSSeq, case S_Use: if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) AllSuccsHaveSame = false; + else + NotAllSeqEqualButKnownSafe = true; break; case S_Retain: llvm_unreachable("bottom-up pointer in retain state!"); @@ -1719,6 +1790,7 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, const TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); bool SomeSuccHasSame = false; bool AllSuccsHaveSame = true; + bool NotAllSeqEqualButKnownSafe = false; succ_const_iterator SI(TI), SE(TI, false); @@ -1750,17 +1822,17 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, switch(S.GetSeq()) { case S_Use: { bool ShouldContinue = false; - CheckForUseCFGHazard(SuccSSeq, SuccSRRIKnownSafe, S, - SomeSuccHasSame, AllSuccsHaveSame, + CheckForUseCFGHazard(SuccSSeq, SuccSRRIKnownSafe, S, SomeSuccHasSame, + AllSuccsHaveSame, NotAllSeqEqualButKnownSafe, ShouldContinue); if (ShouldContinue) continue; break; } case S_CanRelease: { - CheckForCanReleaseCFGHazard(SuccSSeq, SuccSRRIKnownSafe, - S, SomeSuccHasSame, - AllSuccsHaveSame); + CheckForCanReleaseCFGHazard(SuccSSeq, SuccSRRIKnownSafe, S, + SomeSuccHasSame, AllSuccsHaveSame, + NotAllSeqEqualButKnownSafe); break; } case S_Retain: @@ -1775,8 +1847,15 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, // If the state at the other end of any of the successor edges // matches the current state, require all edges to match. This // guards against loops in the middle of a sequence. 
- if (SomeSuccHasSame && !AllSuccsHaveSame) + if (SomeSuccHasSame && !AllSuccsHaveSame) { S.ClearSequenceProgress(); + } else if (NotAllSeqEqualButKnownSafe) { + // If we would have cleared the state were it not for the fact that we are + // known safe, stop code motion. This is because whether or not it is safe + // to remove RR pairs via KnownSafe is an orthogonal concept to whether we + // are allowed to perform code motion. + S.RRI.CFGHazardAfflicted = true; + } } } @@ -1867,6 +1946,28 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, case IC_None: // These are irrelevant. return NestingDetected; + case IC_User: + // If we have a store into an alloca of a pointer we are tracking, the + // pointer has multiple owners, implying that we must be more conservative. + // + // This comes up in the context of a pointer being ``KnownSafe''. In the + // presence of a block being initialized, the frontend will emit the + // objc_retain on the original pointer and the release on the pointer loaded + // from the alloca. The optimizer will, through the provenance analysis, + // realize that the two are related, but since we only require KnownSafe in + // one direction, it will match the inner retain on the original pointer with + // the guarding release on the original pointer. This is fixed by ensuring that + // in the presence of allocas we only unconditionally remove pointers if + // both our retain and our release are KnownSafe. + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + if (AreAnyUnderlyingObjectsAnAlloca(SI->getPointerOperand())) { + BBState::ptr_iterator I = MyStates.findPtrBottomUpState( + StripPointerCastsAndObjCCalls(SI->getValueOperand())); + if (I != MyStates.bottom_up_ptr_end()) + MultiOwnersSet.insert(I->first); + } + } + break; default: break; } @@ -2413,8 +2514,11 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> bool KnownSafe, bool &AnyPairsCompletelyEliminated) { // If a pair happens in a region where it is known that the reference count - // is already incremented, we can similarly ignore possible decrements. + // is already incremented, we can similarly ignore possible decrements unless + // we are dealing with a retainable object with multiple provenance sources. bool KnownSafeTD = true, KnownSafeBU = true; + bool MultipleOwners = false; + bool CFGHazardAfflicted = false; // Connect the dots between the top-down-collected RetainsToMove and // bottom-up-collected ReleasesToMove to form sets of related calls. @@ -2433,6 +2537,8 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> assert(It != Retains.end()); const RRInfo &NewRetainRRI = It->second; KnownSafeTD &= NewRetainRRI.KnownSafe; + MultipleOwners = + MultipleOwners || MultiOwnersSet.count(GetObjCArg(NewRetain)); for (SmallPtrSet<Instruction *, 2>::const_iterator LI = NewRetainRRI.Calls.begin(), LE = NewRetainRRI.Calls.end(); LI != LE; ++LI) { @@ -2444,8 +2550,14 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> const RRInfo &NewRetainReleaseRRI = Jt->second; assert(NewRetainReleaseRRI.Calls.count(NewRetain)); if (ReleasesToMove.Calls.insert(NewRetainRelease)) { - OldDelta -= - BBStates[NewRetainRelease->getParent()].GetAllPathCount(); + + // If we overflow when we compute the path count, don't remove/move + // anything.
+ const BBState &NRRBBState = BBStates[NewRetainRelease->getParent()]; + unsigned PathCount; + if (NRRBBState.GetAllPathCountWithOverflow(PathCount)) + return false; + OldDelta -= PathCount; // Merge the ReleaseMetadata and IsTailCallRelease values. if (FirstRelease) { @@ -2470,8 +2582,14 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> RE = NewRetainReleaseRRI.ReverseInsertPts.end(); RI != RE; ++RI) { Instruction *RIP = *RI; - if (ReleasesToMove.ReverseInsertPts.insert(RIP)) - NewDelta -= BBStates[RIP->getParent()].GetAllPathCount(); + if (ReleasesToMove.ReverseInsertPts.insert(RIP)) { + // If we overflow when we compute the path count, don't + // remove/move anything. + const BBState &RIPBBState = BBStates[RIP->getParent()]; + if (RIPBBState.GetAllPathCountWithOverflow(PathCount)) + return false; + NewDelta -= PathCount; + } } NewReleases.push_back(NewRetainRelease); } @@ -2489,6 +2607,7 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> assert(It != Releases.end()); const RRInfo &NewReleaseRRI = It->second; KnownSafeBU &= NewReleaseRRI.KnownSafe; + CFGHazardAfflicted |= NewReleaseRRI.CFGHazardAfflicted; for (SmallPtrSet<Instruction *, 2>::const_iterator LI = NewReleaseRRI.Calls.begin(), LE = NewReleaseRRI.Calls.end(); LI != LE; ++LI) { @@ -2500,8 +2619,13 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> const RRInfo &NewReleaseRetainRRI = Jt->second; assert(NewReleaseRetainRRI.Calls.count(NewRelease)); if (RetainsToMove.Calls.insert(NewReleaseRetain)) { - unsigned PathCount = - BBStates[NewReleaseRetain->getParent()].GetAllPathCount(); + + // If we overflow when we compute the path count, don't remove/move + // anything. + const BBState &NRRBBState = BBStates[NewReleaseRetain->getParent()]; + unsigned PathCount; + if (NRRBBState.GetAllPathCountWithOverflow(PathCount)) + return false; OldDelta += PathCount; OldCount += PathCount; @@ -2513,7 +2637,11 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> RI != RE; ++RI) { Instruction *RIP = *RI; if (RetainsToMove.ReverseInsertPts.insert(RIP)) { - PathCount = BBStates[RIP->getParent()].GetAllPathCount(); + // If we overflow when we compute the path count, don't + // remove/move anything. + const BBState &RIPBBState = BBStates[RIP->getParent()]; + if (RIPBBState.GetAllPathCountWithOverflow(PathCount)) + return false; NewDelta += PathCount; NewCount += PathCount; } @@ -2526,9 +2654,12 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> if (NewRetains.empty()) break; } - // If the pointer is known incremented or nested, we can safely delete the - // pair regardless of what's between them. - if (KnownSafeTD || KnownSafeBU) { + // If the pointer is known incremented in 1 direction and we do not have + // MultipleOwners, we can safely remove the retain/releases. Otherwise we need + // to be known safe in both directions. + bool UnconditionallySafe = (KnownSafeTD && KnownSafeBU) || + ((KnownSafeTD || KnownSafeBU) && !MultipleOwners); + if (UnconditionallySafe) { RetainsToMove.ReverseInsertPts.clear(); ReleasesToMove.ReverseInsertPts.clear(); NewCount = 0; @@ -2539,6 +2670,14 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> // less aggressive solution which is. if (NewDelta != 0) return false; + + // At this point, we are not going to remove any RR pairs, but we still are + // able to move RR pairs. If one of our pointers is afflicted with + // CFGHazards, we cannot perform such code motion so exit early. 
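The UnconditionallySafe condition in the hunk above tightens the old rule: one-directional KnownSafe suffices only for a single-owner pointer, while a pointer with multiple provenance sources requires safety in both directions. A standalone restatement with the three interesting cases checked:

    #include <cassert>

    // Model of the relaxed safety rule: one-directional KnownSafe is
    // enough only when the object has a single provenance source; with
    // multiple owners, both directions must be known safe.
    bool unconditionallySafe(bool KnownSafeTD, bool KnownSafeBU,
                             bool MultipleOwners) {
      return (KnownSafeTD && KnownSafeBU) ||
             ((KnownSafeTD || KnownSafeBU) && !MultipleOwners);
    }

    int main() {
      assert(unconditionallySafe(true, false, false)); // one dir, sole owner
      assert(!unconditionallySafe(true, false, true)); // one dir, multi-owner
      assert(unconditionallySafe(true, true, true));   // both dirs always win
      return 0;
    }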
+ const bool WillPerformCodeMotion = RetainsToMove.ReverseInsertPts.size() || + ReleasesToMove.ReverseInsertPts.size(); + if (CFGHazardAfflicted && WillPerformCodeMotion) + return false; } // Determine whether the original call points are balanced in the retain and @@ -2802,23 +2941,29 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { /// Identify program paths which execute sequences of retains and releases which /// can be eliminated. bool ObjCARCOpt::OptimizeSequences(Function &F) { - /// Releases, Retains - These are used to store the results of the main flow - /// analysis. These use Value* as the key instead of Instruction* so that the - /// map stays valid when we get around to rewriting code and calls get - /// replaced by arguments. + // Releases, Retains - These are used to store the results of the main flow + // analysis. These use Value* as the key instead of Instruction* so that the + // map stays valid when we get around to rewriting code and calls get + // replaced by arguments. DenseMap<Value *, RRInfo> Releases; MapVector<Value *, RRInfo> Retains; - /// This is used during the traversal of the function to track the - /// states for each identified object at each block. + // This is used during the traversal of the function to track the + // states for each identified object at each block. DenseMap<const BasicBlock *, BBState> BBStates; // Analyze the CFG of the function, and all instructions. bool NestingDetected = Visit(F, BBStates, Retains, Releases); // Transform. - return PerformCodePlacement(BBStates, Retains, Releases, F.getParent()) && - NestingDetected; + bool AnyPairsCompletelyEliminated = PerformCodePlacement(BBStates, Retains, + Releases, + F.getParent()); + + // Cleanup. + MultiOwnersSet.clear(); + + return AnyPairsCompletelyEliminated && NestingDetected; } /// Check if there is a dependent call earlier that does not have anything in @@ -3051,6 +3196,12 @@ bool ObjCARCOpt::runOnFunction(Function &F) { PA.setAA(&getAnalysis<AliasAnalysis>()); +#ifndef NDEBUG + if (AreStatisticsEnabled()) { + GatherStatistics(F, false); + } +#endif + // This pass performs several distinct transformations. As a compile-time aid // when compiling code that isn't ObjC, skip these if the relevant ObjC // library functions aren't declared. diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp index 615c517..f0d29c8 100644 --- a/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/ValueMap.h" #include "llvm/Analysis/DominatorInternals.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -88,7 +89,7 @@ namespace { /// Keeps track of non-local addresses that have been sunk into a block. /// This allows us to avoid inserting duplicate code for blocks with /// multiple load/stores of the same address. - DenseMap<Value*, Value*> SunkAddrs; + ValueMap<Value*, Value*> SunkAddrs; /// ModifiedDT - If CFG is modified in anyway, dominator tree may need to /// be updated. @@ -1653,10 +1654,6 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // start of the block. CurInstIterator = BB->begin(); SunkAddrs.clear(); - } else { - // This address is now available for reassignment, so erase the table - // entry; we don't want to match some completely different instruction. 
- SunkAddrs[Addr] = 0; } } ++NumMemoryInsts; diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 129af8d..996996d 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -45,6 +45,7 @@ #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" +#include <vector> using namespace llvm; using namespace PatternMatch; @@ -498,6 +499,75 @@ void ValueTable::verifyRemoved(const Value *V) const { //===----------------------------------------------------------------------===// namespace { + class GVN; + struct AvailableValueInBlock { + /// BB - The basic block in question. + BasicBlock *BB; + enum ValType { + SimpleVal, // A simple offsetted value that is accessed. + LoadVal, // A value produced by a load. + MemIntrin // A memory intrinsic which is loaded from. + }; + + /// V - The value that is live out of the block. + PointerIntPair<Value *, 2, ValType> Val; + + /// Offset - The byte offset in Val that is interesting for the load query. + unsigned Offset; + + static AvailableValueInBlock get(BasicBlock *BB, Value *V, + unsigned Offset = 0) { + AvailableValueInBlock Res; + Res.BB = BB; + Res.Val.setPointer(V); + Res.Val.setInt(SimpleVal); + Res.Offset = Offset; + return Res; + } + + static AvailableValueInBlock getMI(BasicBlock *BB, MemIntrinsic *MI, + unsigned Offset = 0) { + AvailableValueInBlock Res; + Res.BB = BB; + Res.Val.setPointer(MI); + Res.Val.setInt(MemIntrin); + Res.Offset = Offset; + return Res; + } + + static AvailableValueInBlock getLoad(BasicBlock *BB, LoadInst *LI, + unsigned Offset = 0) { + AvailableValueInBlock Res; + Res.BB = BB; + Res.Val.setPointer(LI); + Res.Val.setInt(LoadVal); + Res.Offset = Offset; + return Res; + } + + bool isSimpleValue() const { return Val.getInt() == SimpleVal; } + bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; } + bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; } + + Value *getSimpleValue() const { + assert(isSimpleValue() && "Wrong accessor"); + return Val.getPointer(); + } + + LoadInst *getCoercedLoadValue() const { + assert(isCoercedLoadValue() && "Wrong accessor"); + return cast<LoadInst>(Val.getPointer()); + } + + MemIntrinsic *getMemIntrinValue() const { + assert(isMemIntrinValue() && "Wrong accessor"); + return cast<MemIntrinsic>(Val.getPointer()); + } + + /// MaterializeAdjustedValue - Emit code into this block to adjust the value + /// defined here to the specified type. This handles various coercion cases. 
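AvailableValueInBlock, shown above as it moves into the GVN class scope, packs its SimpleVal/LoadVal/MemIntrin discriminant into the spare low bits of the pointer via PointerIntPair and guards each accessor with an assert. A standalone analogue of the tagging idea using std::variant instead of bit-packing; this models the design, not the LLVM type.

    #include <cassert>
    #include <variant>

    struct SimpleVal { int V; };
    struct LoadVal   { int V; };
    struct MemIntrin { int V; };

    struct AvailableValue {
      std::variant<SimpleVal, LoadVal, MemIntrin> Val;
      unsigned Offset = 0;

      bool isSimpleValue() const {
        return std::holds_alternative<SimpleVal>(Val);
      }
      // Checked accessor: using the wrong one is a programming error,
      // mirroring the assert("Wrong accessor") in the LLVM struct.
      int getSimpleValue() const {
        assert(isSimpleValue() && "Wrong accessor");
        return std::get<SimpleVal>(Val).V;
      }
    };

    int main() {
      AvailableValue AV;
      AV.Val = SimpleVal{42};
      AV.Offset = 4;
      assert(AV.isSimpleValue() && AV.getSimpleValue() == 42);
      return 0;
    }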
+ Value *MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) const; + }; class GVN : public FunctionPass { bool NoLoads; @@ -519,6 +589,11 @@ namespace { BumpPtrAllocator TableAllocator; SmallVector<Instruction*, 8> InstrsToErase; + + typedef SmallVector<NonLocalDepResult, 64> LoadDepVect; + typedef SmallVector<AvailableValueInBlock, 64> AvailValInBlkVect; + typedef SmallVector<BasicBlock*, 64> UnavailBlkVect; + public: static char ID; // Pass identification, replacement for typeid explicit GVN(bool noloads = false) @@ -599,11 +674,17 @@ namespace { } - // Helper fuctions - // FIXME: eliminate or document these better + // Helper functions of redundant load elimination bool processLoad(LoadInst *L); - bool processInstruction(Instruction *I); bool processNonLocalLoad(LoadInst *L); + void AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, + AvailValInBlkVect &ValuesPerBlock, + UnavailBlkVect &UnavailableBlocks); + bool PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, + UnavailBlkVect &UnavailableBlocks); + + // Other helper routines + bool processInstruction(Instruction *I); bool processBlock(BasicBlock *BB); void dump(DenseMap<uint32_t, Value*> &d); bool iterateOnFunction(Function &F); @@ -612,6 +693,7 @@ namespace { void cleanupGlobalSets(); void verifyRemoved(const Instruction *I) const; bool splitCriticalEdges(); + BasicBlock *splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ); unsigned replaceAllDominatedUsesWith(Value *From, Value *To, const BasicBlockEdge &Root); bool propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root); @@ -1159,114 +1241,6 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, return ConstantFoldLoadFromConstPtr(Src, &TD); } -namespace { - -struct AvailableValueInBlock { - /// BB - The basic block in question. - BasicBlock *BB; - enum ValType { - SimpleVal, // A simple offsetted value that is accessed. - LoadVal, // A value produced by a load. - MemIntrin // A memory intrinsic which is loaded from. - }; - - /// V - The value that is live out of the block. - PointerIntPair<Value *, 2, ValType> Val; - - /// Offset - The byte offset in Val that is interesting for the load query.
- unsigned Offset; - - static AvailableValueInBlock get(BasicBlock *BB, Value *V, - unsigned Offset = 0) { - AvailableValueInBlock Res; - Res.BB = BB; - Res.Val.setPointer(V); - Res.Val.setInt(SimpleVal); - Res.Offset = Offset; - return Res; - } - - static AvailableValueInBlock getMI(BasicBlock *BB, MemIntrinsic *MI, - unsigned Offset = 0) { - AvailableValueInBlock Res; - Res.BB = BB; - Res.Val.setPointer(MI); - Res.Val.setInt(MemIntrin); - Res.Offset = Offset; - return Res; - } - - static AvailableValueInBlock getLoad(BasicBlock *BB, LoadInst *LI, - unsigned Offset = 0) { - AvailableValueInBlock Res; - Res.BB = BB; - Res.Val.setPointer(LI); - Res.Val.setInt(LoadVal); - Res.Offset = Offset; - return Res; - } - - bool isSimpleValue() const { return Val.getInt() == SimpleVal; } - bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; } - bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; } - - Value *getSimpleValue() const { - assert(isSimpleValue() && "Wrong accessor"); - return Val.getPointer(); - } - - LoadInst *getCoercedLoadValue() const { - assert(isCoercedLoadValue() && "Wrong accessor"); - return cast<LoadInst>(Val.getPointer()); - } - - MemIntrinsic *getMemIntrinValue() const { - assert(isMemIntrinValue() && "Wrong accessor"); - return cast<MemIntrinsic>(Val.getPointer()); - } - - /// MaterializeAdjustedValue - Emit code into this block to adjust the value - /// defined here to the specified type. This handles various coercion cases. - Value *MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) const { - Value *Res; - if (isSimpleValue()) { - Res = getSimpleValue(); - if (Res->getType() != LoadTy) { - const DataLayout *TD = gvn.getDataLayout(); - assert(TD && "Need target data to handle type mismatch case"); - Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(), - *TD); - - DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " " - << *getSimpleValue() << '\n' - << *Res << '\n' << "\n\n\n"); - } - } else if (isCoercedLoadValue()) { - LoadInst *Load = getCoercedLoadValue(); - if (Load->getType() == LoadTy && Offset == 0) { - Res = Load; - } else { - Res = GetLoadValueForLoad(Load, Offset, LoadTy, BB->getTerminator(), - gvn); - - DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << " " - << *getCoercedLoadValue() << '\n' - << *Res << '\n' << "\n\n\n"); - } - } else { - const DataLayout *TD = gvn.getDataLayout(); - assert(TD && "Need target data to handle type mismatch case"); - Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, - LoadTy, BB->getTerminator(), *TD); - DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset - << " " << *getMemIntrinValue() << '\n' - << *Res << '\n' << "\n\n\n"); - } - return Res; - } -}; - -} // end anonymous namespace /// ConstructSSAForLoadSet - Given a set of loads specified by ValuesPerBlock, /// construct SSA form, allowing us to eliminate LI. 
This returns the value @@ -1323,48 +1297,59 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, return V; } +Value *AvailableValueInBlock::MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) const { + Value *Res; + if (isSimpleValue()) { + Res = getSimpleValue(); + if (Res->getType() != LoadTy) { + const DataLayout *TD = gvn.getDataLayout(); + assert(TD && "Need target data to handle type mismatch case"); + Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(), + *TD); + + DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " " + << *getSimpleValue() << '\n' + << *Res << '\n' << "\n\n\n"); + } + } else if (isCoercedLoadValue()) { + LoadInst *Load = getCoercedLoadValue(); + if (Load->getType() == LoadTy && Offset == 0) { + Res = Load; + } else { + Res = GetLoadValueForLoad(Load, Offset, LoadTy, BB->getTerminator(), + gvn); + + DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << " " + << *getCoercedLoadValue() << '\n' + << *Res << '\n' << "\n\n\n"); + } + } else { + const DataLayout *TD = gvn.getDataLayout(); + assert(TD && "Need target data to handle type mismatch case"); + Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, + LoadTy, BB->getTerminator(), *TD); + DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset + << " " << *getMemIntrinValue() << '\n' + << *Res << '\n' << "\n\n\n"); + } + return Res; +} + static bool isLifetimeStart(const Instruction *Inst) { if (const IntrinsicInst* II = dyn_cast<IntrinsicInst>(Inst)) return II->getIntrinsicID() == Intrinsic::lifetime_start; return false; } -/// processNonLocalLoad - Attempt to eliminate a load whose dependencies are -/// non-local by performing PHI construction. -bool GVN::processNonLocalLoad(LoadInst *LI) { - // Find the non-local dependencies of the load. - SmallVector<NonLocalDepResult, 64> Deps; - AliasAnalysis::Location Loc = VN.getAliasAnalysis()->getLocation(LI); - MD->getNonLocalPointerDependency(Loc, true, LI->getParent(), Deps); - //DEBUG(dbgs() << "INVESTIGATING NONLOCAL LOAD: " - // << Deps.size() << *LI << '\n'); - - // If we had to process more than one hundred blocks to find the - // dependencies, this load isn't worth worrying about. Optimizing - // it will be too expensive. - unsigned NumDeps = Deps.size(); - if (NumDeps > 100) - return false; - - // If we had a phi translation failure, we'll have a single entry which is a - // clobber in the current block. Reject this early. - if (NumDeps == 1 && - !Deps[0].getResult().isDef() && !Deps[0].getResult().isClobber()) { - DEBUG( - dbgs() << "GVN: non-local load "; - WriteAsOperand(dbgs(), LI); - dbgs() << " has unknown dependencies\n"; - ); - return false; - } +void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, + AvailValInBlkVect &ValuesPerBlock, + UnavailBlkVect &UnavailableBlocks) { // Filter out useless results (non-locals, etc). Keep track of the blocks // where we have a value available in repl, also keep track of whether we see // dependencies that produce an unknown value for the load (such as a call // that could potentially clobber the load). 
- SmallVector<AvailableValueInBlock, 64> ValuesPerBlock; - SmallVector<BasicBlock*, 64> UnavailableBlocks; - + unsigned NumDeps = Deps.size(); for (unsigned i = 0, e = NumDeps; i != e; ++i) { BasicBlock *DepBB = Deps[i].getBB(); MemDepResult DepInfo = Deps[i].getResult(); @@ -1480,35 +1465,11 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { } UnavailableBlocks.push_back(DepBB); - continue; } +} - // If we have no predecessors that produce a known value for this load, exit - // early. - if (ValuesPerBlock.empty()) return false; - - // If all of the instructions we depend on produce a known value for this - // load, then it is fully redundant and we can use PHI insertion to compute - // its value. Insert PHIs and remove the fully redundant value now. - if (UnavailableBlocks.empty()) { - DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n'); - - // Perform PHI construction. - Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this); - LI->replaceAllUsesWith(V); - - if (isa<PHINode>(V)) - V->takeName(LI); - if (V->getType()->getScalarType()->isPointerTy()) - MD->invalidateCachedPointerInfo(V); - markInstructionForDeletion(LI); - ++NumGVNLoad; - return true; - } - - if (!EnablePRE || !EnableLoadPRE) - return false; - +bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, + UnavailBlkVect &UnavailableBlocks) { // Okay, we have *some* definitions of the value. This means that the value // is available in some of our (transitive) predecessors. Lets think about // doing PRE of this load. This will involve inserting a new load into the @@ -1526,7 +1487,6 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { BasicBlock *LoadBB = LI->getParent(); BasicBlock *TmpBB = LoadBB; - bool allSingleSucc = true; while (TmpBB->getSinglePredecessor()) { TmpBB = TmpBB->getSinglePredecessor(); if (TmpBB == LoadBB) // Infinite (unreachable) loop. @@ -1555,7 +1515,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i) FullyAvailableBlocks[UnavailableBlocks[i]] = false; - SmallVector<std::pair<TerminatorInst*, unsigned>, 4> NeedToSplit; + SmallVector<BasicBlock *, 4> CriticalEdgePred; for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB); PI != E; ++PI) { BasicBlock *Pred = *PI; @@ -1578,20 +1538,14 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { return false; } - unsigned SuccNum = GetSuccessorNumber(Pred, LoadBB); - NeedToSplit.push_back(std::make_pair(Pred->getTerminator(), SuccNum)); + CriticalEdgePred.push_back(Pred); } } - if (!NeedToSplit.empty()) { - toSplit.append(NeedToSplit.begin(), NeedToSplit.end()); - return false; - } - // Decide whether PRE is profitable for this load. unsigned NumUnavailablePreds = PredLoads.size(); assert(NumUnavailablePreds != 0 && - "Fully available value should be eliminated above!"); + "Fully available value should already be eliminated!"); // If this load is unavailable in multiple predecessors, reject it. // FIXME: If we could restructure the CFG, we could make a common pred with @@ -1600,6 +1554,17 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { if (NumUnavailablePreds != 1) return false; + // Split critical edges, and update the unavailable predecessors accordingly. 
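A critical edge is one that leaves a block with several successors and enters a block with several predecessors; PRE cannot place the compensating load on such an edge until a new block is interposed. A standalone sketch of the predicate and the split, using hypothetical Block stand-ins rather than the LLVM API; the loop below does the real work through GVN::splitCriticalEdges and then remaps PredLoads to the inserted block:

#include <vector>

struct Block {
  std::vector<Block *> Succs, Preds;
};

// An edge Pred -> Succ is critical when code inserted "on the edge" would
// otherwise run on Pred's other paths or reach Succ along other paths.
static bool isCriticalEdge(const Block &Pred, const Block &Succ) {
  return Pred.Succs.size() > 1 && Succ.Preds.size() > 1;
}

// Splitting interposes a fresh block that carries only this one edge,
// giving PRE a safe home for the compensating load.
static Block *splitEdge(Block *Pred, Block *Succ) {
  Block *Mid = new Block;
  for (std::vector<Block *>::iterator I = Pred->Succs.begin(),
       E = Pred->Succs.end(); I != E; ++I)
    if (*I == Succ) *I = Mid;
  for (std::vector<Block *>::iterator I = Succ->Preds.begin(),
       E = Succ->Preds.end(); I != E; ++I)
    if (*I == Pred) *I = Mid;
  Mid->Preds.push_back(Pred);
  Mid->Succs.push_back(Succ);
  return Mid;
}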
+ for (SmallVector<BasicBlock *, 4>::iterator I = CriticalEdgePred.begin(), + E = CriticalEdgePred.end(); I != E; I++) { + BasicBlock *OrigPred = *I; + BasicBlock *NewPred = splitCriticalEdges(OrigPred, LoadBB); + PredLoads.erase(OrigPred); + PredLoads[NewPred] = 0; + DEBUG(dbgs() << "Split critical edge " << OrigPred->getName() << "->" + << LoadBB->getName() << '\n'); + } + // Check if the load can safely be moved to all the unavailable predecessors. bool CanDoPRE = true; SmallVector<Instruction*, 8> NewInsts; @@ -1615,13 +1580,8 @@ // pointer if it is not available. PHITransAddr Address(LI->getPointerOperand(), TD); Value *LoadPtr = 0; - if (allSingleSucc) { - LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred, - *DT, NewInsts); - } else { - Address.PHITranslateValue(LoadBB, UnavailablePred, DT); - LoadPtr = Address.getAddr(); - } + LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred, + *DT, NewInsts); // If we couldn't find or insert a computation of this phi translated value, // we fail PRE. @@ -1632,24 +1592,6 @@ break; } - // Make sure it is valid to move this load here. We have to watch out for: - // @1 = getelementptr (i8* p, ... - // test p and branch if == 0 - // load @1 - // It is valid to have the getelementptr before the test, even if p can - // be 0, as getelementptr only does address arithmetic. - // If we are not pushing the value through any multiple-successor blocks - // we do not have this case. Otherwise, check that the load is safe to - // put anywhere; this can be improved, but should be conservatively safe. - if (!allSingleSucc && - // FIXME: REEVALUTE THIS. - !isSafeToLoadUnconditionally(LoadPtr, - UnavailablePred->getTerminator(), - LI->getAlignment(), TD)) { - CanDoPRE = false; - break; - } - I->second = LoadPtr; } @@ -1659,7 +1601,9 @@ if (MD) MD->removeInstruction(I); I->eraseFromParent(); } - return false; + // Note: don't revert the edge-splitting, as the following transformations + // may also need to split these critical edges. + return !CriticalEdgePred.empty(); } // Okay, we can eliminate this load by inserting a reload in the predecessor @@ -1714,6 +1658,72 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { return true; } +/// processNonLocalLoad - Attempt to eliminate a load whose dependencies are +/// non-local by performing PHI construction. +bool GVN::processNonLocalLoad(LoadInst *LI) { + // Step 1: Find the non-local dependencies of the load. + LoadDepVect Deps; + AliasAnalysis::Location Loc = VN.getAliasAnalysis()->getLocation(LI); + MD->getNonLocalPointerDependency(Loc, true, LI->getParent(), Deps); + + // If we had to process more than one hundred blocks to find the + // dependencies, this load isn't worth worrying about. Optimizing + // it will be too expensive. + unsigned NumDeps = Deps.size(); + if (NumDeps > 100) + return false; + + // If we had a phi translation failure, we'll have a single entry which is a + // clobber in the current block. Reject this early.
+ if (NumDeps == 1 && + !Deps[0].getResult().isDef() && !Deps[0].getResult().isClobber()) { + DEBUG( + dbgs() << "GVN: non-local load "; + WriteAsOperand(dbgs(), LI); + dbgs() << " has unknown dependencies\n"; + ); + return false; + } + + // Step 2: Analyze the availability of the load. + AvailValInBlkVect ValuesPerBlock; + UnavailBlkVect UnavailableBlocks; + AnalyzeLoadAvailability(LI, Deps, ValuesPerBlock, UnavailableBlocks); + + // If we have no predecessors that produce a known value for this load, exit + // early. + if (ValuesPerBlock.empty()) + return false; + + // Step 3: Eliminate full redundancy. + // + // If all of the instructions we depend on produce a known value for this + // load, then it is fully redundant and we can use PHI insertion to compute + // its value. Insert PHIs and remove the fully redundant value now. + if (UnavailableBlocks.empty()) { + DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n'); + + // Perform PHI construction. + Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this); + LI->replaceAllUsesWith(V); + + if (isa<PHINode>(V)) + V->takeName(LI); + if (V->getType()->getScalarType()->isPointerTy()) + MD->invalidateCachedPointerInfo(V); + markInstructionForDeletion(LI); + ++NumGVNLoad; + return true; + } + + // Step 4: Eliminate partial redundancy. + if (!EnablePRE || !EnableLoadPRE) + return false; + + return PerformLoadPRE(LI, ValuesPerBlock, UnavailableBlocks); +} + + static void patchReplacementInstruction(Instruction *I, Value *Repl) { // Patch the replacement so that it is not more restrictive than the value // being replaced. @@ -2296,8 +2306,6 @@ bool GVN::runOnFunction(Function& F) { while (ShouldContinue) { DEBUG(dbgs() << "GVN iteration: " << Iteration << "\n"); ShouldContinue = iterateOnFunction(F); - if (splitCriticalEdges()) - ShouldContinue = true; Changed |= ShouldContinue; ++Iteration; } @@ -2309,6 +2317,7 @@ Changed |= PREChanged; } } + // FIXME: Should perform GVN again after PRE does something. PRE can move // computations into blocks where they become fully redundant. Note that // we can't do this until PRE's critical edge splitting updates memdep. @@ -2542,6 +2551,15 @@ bool GVN::performPRE(Function &F) { return Changed; } +/// Split the critical edge connecting the given two blocks, and return +/// the block inserted on that edge. +BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) { + BasicBlock *BB = SplitCriticalEdge(Pred, Succ, this); + if (MD) + MD->invalidateCachedPredecessors(); + return BB; +} + /// splitCriticalEdges - Split critical edges found during the previous /// iteration that may enable further optimization. bool GVN::splitCriticalEdges() { @@ -2568,9 +2586,18 @@ bool GVN::iterateOnFunction(Function &F) { RE = RPOT.end(); RI != RE; ++RI) Changed |= processBlock(*RI); #else + // Save the blocks this function has before the transformation begins. GVN + // may split critical edges, and hence may invalidate the RPO/DT iterators.
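The comment above points at a general idiom: snapshot the traversal while the graph is stable, then mutate while walking the copy. A standalone sketch with a hypothetical Node type; the hunk below applies the same idea by copying the dominator-tree walk into BBVect before calling processBlock:

#include <vector>

struct Node {
  std::vector<Node *> Kids;
};

template <typename Visitor>
void visitAllThenMutate(Node *Root, Visitor Visit) {
  // Phase 1: snapshot a preorder walk while no edges are changing.
  std::vector<Node *> Order;
  std::vector<Node *> Work(1, Root);
  while (!Work.empty()) {
    Node *N = Work.back();
    Work.pop_back();
    Order.push_back(N);
    for (unsigned I = 0, E = N->Kids.size(); I != E; ++I)
      Work.push_back(N->Kids[I]);
  }
  // Phase 2: Visit may now split edges or add nodes; Order is unaffected.
  for (unsigned I = 0, E = Order.size(); I != E; ++I)
    Visit(Order[I]);
}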
+ // + std::vector<BasicBlock *> BBVect; + BBVect.reserve(256); for (df_iterator<DomTreeNode*> DI = df_begin(DT->getRootNode()), DE = df_end(DT->getRootNode()); DI != DE; ++DI) - Changed |= processBlock(DI->getBlock()); + BBVect.push_back(DI->getBlock()); + + for (std::vector<BasicBlock *>::iterator I = BBVect.begin(), E = BBVect.end(); + I != E; I++) + Changed |= processBlock(*I); #endif return Changed; diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 8e76c78..df11e92 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -532,7 +532,7 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { // and varies predictably *inside* the loop. Evaluate the value it // contains when the loop exits, if possible. const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop()); - if (!SE->isLoopInvariant(ExitValue, L)) + if (!SE->isLoopInvariant(ExitValue, L) || !isSafeToExpand(ExitValue)) continue; // Computing the value outside of the loop brings no benefit if : diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index e98ae95..14c5655 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -56,8 +56,8 @@ namespace { } bool runOnLoop(Loop *L, LPPassManager &LPM); - void simplifyLoopLatch(Loop *L); - bool rotateLoop(Loop *L); + bool simplifyLoopLatch(Loop *L); + bool rotateLoop(Loop *L, bool SimplifiedLatch); private: LoopInfo *LI; @@ -84,13 +84,14 @@ bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) { // Simplify the loop latch before attempting to rotate the header // upward. Rotation may not be needed if the loop tail can be folded into the // loop exit. - simplifyLoopLatch(L); + bool SimplifiedLatch = simplifyLoopLatch(L); // One loop can be rotated multiple times. bool MadeChange = false; - while (rotateLoop(L)) + while (rotateLoop(L, SimplifiedLatch)) { MadeChange = true; - + SimplifiedLatch = false; + } return MadeChange; } @@ -212,25 +213,25 @@ static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, /// canonical form so downstream passes can handle it. /// /// I don't believe this invalidates SCEV. -void LoopRotate::simplifyLoopLatch(Loop *L) { +bool LoopRotate::simplifyLoopLatch(Loop *L) { BasicBlock *Latch = L->getLoopLatch(); if (!Latch || Latch->hasAddressTaken()) - return; + return false; BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator()); if (!Jmp || !Jmp->isUnconditional()) - return; + return false; BasicBlock *LastExit = Latch->getSinglePredecessor(); if (!LastExit || !L->isLoopExiting(LastExit)) - return; + return false; BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator()); if (!BI) - return; + return false; if (!shouldSpeculateInstrs(Latch->begin(), Jmp)) - return; + return false; DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into " << LastExit->getName() << "\n"); @@ -253,10 +254,20 @@ void LoopRotate::simplifyLoopLatch(Loop *L) { if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) DT->eraseNode(Latch); Latch->eraseFromParent(); + return true; } /// Rotate loop LP. Return true if the loop is rotated. -bool LoopRotate::rotateLoop(Loop *L) { +/// +/// \param SimplifiedLatch is true if the latch was just folded into the final +/// loop exit. In this case we may want to rotate even though the new latch is +/// now an exiting branch. This rotation would have happened had the latch not +/// been simplified. 
However, if SimplifiedLatch is false, then we avoid +/// rotating loops in which the latch exits to avoid excessive or endless +/// rotation. LoopRotate should be repeatable and converge to a canonical +/// form. This property is satisfied because simplifying the loop latch can only +/// happen once across multiple invocations of the LoopRotate pass. +bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // If the loop has only one block then there is not much to rotate. if (L->getBlocks().size() == 1) return false; @@ -276,7 +287,12 @@ bool LoopRotate::rotateLoop(Loop *L) { // If the loop latch already contains a branch that leaves the loop then the // loop is already rotated. - if (OrigLatch == 0 || L->isLoopExiting(OrigLatch)) + if (OrigLatch == 0) + return false; + + // Rotate if either the loop latch does *not* exit the loop, or if the loop + // latch was just simplified. + if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch) return false; // Check size of original header and reject loop if it is very big or we can't @@ -505,4 +521,3 @@ bool LoopRotate::rotateLoop(Loop *L) { ++NumRotated; return true; } - diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 73e44d7..b107fef 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -774,6 +774,16 @@ DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) { } namespace { +class LSRUse; +} +// Check if it is legal to fold 2 base registers. +static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU, + const Formula &F); +// Get the cost of the scaling factor used in F for LU. +static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, + const LSRUse &LU, const Formula &F); + +namespace { /// Cost - This class is used to measure and compare candidate formulae. class Cost { @@ -785,11 +795,12 @@ class Cost { unsigned NumBaseAdds; unsigned ImmCost; unsigned SetupCost; + unsigned ScaleCost; public: Cost() : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0), - SetupCost(0) {} + SetupCost(0), ScaleCost(0) {} bool operator<(const Cost &Other) const; @@ -799,9 +810,9 @@ public: // Once any of the metrics loses, they must all remain losers. bool isValid() { return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds - | ImmCost | SetupCost) != ~0u) + | ImmCost | SetupCost | ScaleCost) != ~0u) || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds - & ImmCost & SetupCost) == ~0u); + & ImmCost & SetupCost & ScaleCost) == ~0u); } #endif @@ -810,12 +821,14 @@ public: return NumRegs == ~0u; } - void RateFormula(const Formula &F, + void RateFormula(const TargetTransformInfo &TTI, + const Formula &F, SmallPtrSet<const SCEV *, 16> &Regs, const DenseSet<const SCEV *> &VisitedRegs, const Loop *L, const SmallVectorImpl<int64_t> &Offsets, ScalarEvolution &SE, DominatorTree &DT, + const LSRUse &LU, SmallPtrSet<const SCEV *, 16> *LoserRegs = 0); void print(raw_ostream &OS) const; @@ -900,12 +913,14 @@ void Cost::RatePrimaryRegister(const SCEV *Reg, } } -void Cost::RateFormula(const Formula &F, +void Cost::RateFormula(const TargetTransformInfo &TTI, + const Formula &F, SmallPtrSet<const SCEV *, 16> &Regs, const DenseSet<const SCEV *> &VisitedRegs, const Loop *L, const SmallVectorImpl<int64_t> &Offsets, ScalarEvolution &SE, DominatorTree &DT, + const LSRUse &LU, SmallPtrSet<const SCEV *, 16> *LoserRegs) { // Tally up the registers. 
if (const SCEV *ScaledReg = F.ScaledReg) { @@ -932,7 +947,12 @@ void Cost::RateFormula(const Formula &F, // Determine how many (unfolded) adds we'll need inside the loop. size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0); if (NumBaseParts > 1) - NumBaseAdds += NumBaseParts - 1; + // Do not count the base and a possible second register if the target + // allows to fold 2 registers. + NumBaseAdds += NumBaseParts - (1 + isLegal2RegAMUse(TTI, LU, F)); + + // Accumulate non-free scaling amounts. + ScaleCost += getScalingFactorCost(TTI, LU, F); // Tally up the non-zero immediates. for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(), @@ -955,6 +975,7 @@ void Cost::Loose() { NumBaseAdds = ~0u; ImmCost = ~0u; SetupCost = ~0u; + ScaleCost = ~0u; } /// operator< - Choose the lower cost. @@ -967,6 +988,8 @@ bool Cost::operator<(const Cost &Other) const { return NumIVMuls < Other.NumIVMuls; if (NumBaseAdds != Other.NumBaseAdds) return NumBaseAdds < Other.NumBaseAdds; + if (ScaleCost != Other.ScaleCost) + return ScaleCost < Other.ScaleCost; if (ImmCost != Other.ImmCost) return ImmCost < Other.ImmCost; if (SetupCost != Other.SetupCost) @@ -983,6 +1006,8 @@ void Cost::print(raw_ostream &OS) const { if (NumBaseAdds != 0) OS << ", plus " << NumBaseAdds << " base add" << (NumBaseAdds == 1 ? "" : "s"); + if (ScaleCost != 0) + OS << ", plus " << ScaleCost << " scale cost"; if (ImmCost != 0) OS << ", plus " << ImmCost << " imm cost"; if (SetupCost != 0) @@ -1359,6 +1384,58 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, F.BaseOffset, F.HasBaseReg, F.Scale); } +static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU, + const Formula &F) { + // If F is used as an Addressing Mode, it may fold one Base plus one + // scaled register. If the scaled register is nil, do as if another + // element of the base regs is a 1-scaled register. + // This is possible if BaseRegs has at least 2 registers. + + // If this is not an address calculation, this is not an addressing mode + // use. + if (LU.Kind != LSRUse::Address) + return false; + + // F is already scaled. + if (F.Scale != 0) + return false; + + // We need to keep one register for the base and one to scale. + if (F.BaseRegs.size() < 2) + return false; + + return isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, + F.BaseGV, F.BaseOffset, F.HasBaseReg, 1); + } + +static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, + const LSRUse &LU, const Formula &F) { + if (!F.Scale) + return 0; + assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, + LU.AccessTy, F) && "Illegal formula in use."); + + switch (LU.Kind) { + case LSRUse::Address: { + int CurScaleCost = TTI.getScalingFactorCost(LU.AccessTy, F.BaseGV, + F.BaseOffset, F.HasBaseReg, + F.Scale); + assert(CurScaleCost >= 0 && "Legal addressing mode has an illegal cost!"); + return CurScaleCost; + } + case LSRUse::ICmpZero: + // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg. + // Therefore, return 0 in case F.Scale == -1. 
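As a worked example of that case: an ICmpZero use models (BaseReg + Scale*ScaleReg) == 0, and for Scale == -1 this is just BaseReg == ScaleReg, so the scale folds into the compare; any other non-zero scale has to be materialized. The return that follows encodes this zero-or-one cost; a hypothetical helper spelling it out:

// icmp eq (BaseReg + (-1)*ScaleReg), 0   <=>   icmp eq BaseReg, ScaleReg
// Invoked only for non-zero scales; F.Scale == 0 already returned above.
static unsigned icmpZeroScaleCost(long long Scale) {
  return Scale != -1 ? 1 : 0;
}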
+ return F.Scale != -1; + + case LSRUse::Basic: + case LSRUse::Special: + return 0; + } + + llvm_unreachable("Invalid LSRUse Kind!"); +} + static bool isAlwaysFoldable(const TargetTransformInfo &TTI, LSRUse::KindType Kind, Type *AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, @@ -3607,7 +3684,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { abs64(NewF.BaseOffset)) && (C->getValue()->getValue() + NewF.BaseOffset).countTrailingZeros() >= - CountTrailingZeros_64(NewF.BaseOffset)) + countTrailingZeros<uint64_t>(NewF.BaseOffset)) goto skip_formula; // Ok, looks good. @@ -3690,7 +3767,7 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { // the corresponding bad register from the Regs set. Cost CostF; Regs.clear(); - CostF.RateFormula(F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, + CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, LU, &LoserRegs); if (CostF.isLoser()) { // During initial formula generation, undesirable formulae are generated @@ -3726,7 +3803,8 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { Cost CostBest; Regs.clear(); - CostBest.RateFormula(Best, Regs, VisitedRegs, L, LU.Offsets, SE, DT); + CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, LU.Offsets, SE, + DT, LU); if (CostF < CostBest) std::swap(F, Best); DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs()); @@ -4079,7 +4157,8 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, // the current best, prune the search at that point. NewCost = CurCost; NewRegs = CurRegs; - NewCost.RateFormula(F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT); + NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT, + LU); if (NewCost < SolutionCost) { Workspace.push_back(&F); if (Workspace.size() != Uses.size()) { diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index be0f0e8..c325925 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -626,8 +626,14 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, return false; Type *StructTy = cast<PointerType>(A->getType())->getElementType(); - uint64_t destSize = TD->getTypeAllocSize(StructTy); + if (!StructTy->isSized()) { + // The call may never return and hence the copy-instruction may never + // be executed, and therefore it's not safe to say "the destination + // has at least <cpyLen> bytes, as implied by the copy-instruction", + return false; + } + uint64_t destSize = TD->getTypeAllocSize(StructTy); if (destSize < srcSize) return false; } else { diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index be8d39e..d105f5e 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -78,7 +78,8 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, bool ModuleLevelChanges, SmallVectorImpl<ReturnInst*> &Returns, const char *NameSuffix, ClonedCodeInfo *CodeInfo, - ValueMapTypeRemapper *TypeMapper) { + ValueMapTypeRemapper *TypeMapper, + ValueMaterializer *Materializer) { assert(NameSuffix && "NameSuffix cannot be null!"); #ifndef NDEBUG @@ -147,7 +148,7 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, for (BasicBlock::iterator II = BB->begin(); II != BB->end(); ++II) RemapInstruction(II, VMap, ModuleLevelChanges ? 
RF_None : RF_NoModuleLevelChanges, - TypeMapper); + TypeMapper, Materializer); } /// CloneFunction - Return a copy of the specified function, but without diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp index 37819cc..6d5f16c 100644 --- a/lib/Transforms/Utils/LoopSimplify.cpp +++ b/lib/Transforms/Utils/LoopSimplify.cpp @@ -59,6 +59,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; STATISTIC(NumInserted, "Number of pre-header or exit blocks inserted"); @@ -100,16 +101,16 @@ namespace { private: bool ProcessLoop(Loop *L, LPPassManager &LPM); BasicBlock *RewriteLoopExitBlock(Loop *L, BasicBlock *Exit); - BasicBlock *InsertPreheaderForLoop(Loop *L); Loop *SeparateNestedLoop(Loop *L, LPPassManager &LPM, BasicBlock *Preheader); BasicBlock *InsertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader); - void PlaceSplitBlockCarefully(BasicBlock *NewBB, - SmallVectorImpl<BasicBlock*> &SplitPreds, - Loop *L); }; } +static void PlaceSplitBlockCarefully(BasicBlock *NewBB, + SmallVectorImpl<BasicBlock*> &SplitPreds, + Loop *L); + char LoopSimplify::ID = 0; INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify", "Canonicalize natural loops", true, false) @@ -208,7 +209,7 @@ ReprocessLoop: // Does the loop already have a preheader? If so, don't insert one. BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) { - Preheader = InsertPreheaderForLoop(L); + Preheader = InsertPreheaderForLoop(L, this); if (Preheader) { ++NumInserted; Changed = true; @@ -367,7 +368,7 @@ ReprocessLoop: /// preheader, this method is called to insert one. This method has two phases: /// preheader insertion and analysis updating. /// -BasicBlock *LoopSimplify::InsertPreheaderForLoop(Loop *L) { +BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, Pass *PP) { BasicBlock *Header = L->getHeader(); // Compute the set of predecessors of the loop that are not in the loop. @@ -390,11 +391,11 @@ BasicBlock *LoopSimplify::InsertPreheaderForLoop(Loop *L) { BasicBlock *PreheaderBB; if (!Header->isLandingPad()) { PreheaderBB = SplitBlockPredecessors(Header, OutsideBlocks, ".preheader", - this); + PP); } else { SmallVector<BasicBlock*, 2> NewBBs; SplitLandingPadPredecessors(Header, OutsideBlocks, ".preheader", - ".split-lp", this, NewBBs); + ".split-lp", PP, NewBBs); PreheaderBB = NewBBs[0]; } @@ -491,9 +492,9 @@ static PHINode *FindPHIToPartitionLoops(Loop *L, DominatorTree *DT, // PlaceSplitBlockCarefully - If the block isn't already, move the new block to // right after some 'outside block' block. This prevents the preheader from // being placed inside the loop body, e.g. when the loop hasn't been rotated. -void LoopSimplify::PlaceSplitBlockCarefully(BasicBlock *NewBB, - SmallVectorImpl<BasicBlock*> &SplitPreds, - Loop *L) { +void PlaceSplitBlockCarefully(BasicBlock *NewBB, + SmallVectorImpl<BasicBlock*> &SplitPreds, + Loop *L) { // Check to see if NewBB is already well placed. 
Function::iterator BBI = NewBB; --BBI; for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) { diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index 052ad85..6d12f7a 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -281,7 +281,7 @@ static Value *GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue, return BI->getCondition(); } -/// ComputeSpeculuationCost - Compute an abstract "cost" of speculating the +/// ComputeSpeculationCost - Compute an abstract "cost" of speculating the /// given instruction, which is assumed to be safe to speculate. 1 means /// cheap, 2 means less cheap, and UINT_MAX means prohibitively expensive. static unsigned ComputeSpeculationCost(const User *I) { @@ -533,9 +533,7 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) { } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) if (BI->isConditional() && BI->getCondition()->hasOneUse()) if (ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition())) - if ((ICI->getPredicate() == ICmpInst::ICMP_EQ || - ICI->getPredicate() == ICmpInst::ICMP_NE) && - GetConstantInt(ICI->getOperand(1), TD)) + if (ICI->isEquality() && GetConstantInt(ICI->getOperand(1), TD)) CV = ICI->getOperand(0); // Unwrap any lossless ptrtoint cast. @@ -1083,9 +1081,9 @@ static bool HoistThenElseCodeToIf(BranchInst *BI) { (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2))) return false; - // If we get here, we can hoist at least one instruction. BasicBlock *BIParent = BI->getParent(); + bool Changed = false; do { // If we are hoisting the terminator instruction, don't move one (making a // broken BB), instead clone it, and remove BI. @@ -1100,6 +1098,7 @@ static bool HoistThenElseCodeToIf(BranchInst *BI) { I2->replaceAllUsesWith(I1); I1->intersectOptionalDataWith(I2); I2->eraseFromParent(); + Changed = true; I1 = BB1_Itr++; I2 = BB2_Itr++; @@ -1119,7 +1118,23 @@ static bool HoistThenElseCodeToIf(BranchInst *BI) { HoistTerminator: // It may not be possible to hoist an invoke. if (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2)) - return true; + return Changed; + + for (succ_iterator SI = succ_begin(BB1), E = succ_end(BB1); SI != E; ++SI) { + PHINode *PN; + for (BasicBlock::iterator BBI = SI->begin(); + (PN = dyn_cast<PHINode>(BBI)); ++BBI) { + Value *BB1V = PN->getIncomingValueForBlock(BB1); + Value *BB2V = PN->getIncomingValueForBlock(BB2); + if (BB1V == BB2V) + continue; + + if (isa<ConstantExpr>(BB1V) && !isSafeToSpeculativelyExecute(BB1V)) + return Changed; + if (isa<ConstantExpr>(BB2V) && !isSafeToSpeculativelyExecute(BB2V)) + return Changed; + } + } // Okay, it is safe to hoist the terminator. Instruction *NT = I1->clone(); @@ -1362,8 +1377,8 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { /// /// \return The pointer to the value of the previous store if the store can be /// hoisted into the predecessor block. 0 otherwise. -Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB, - BasicBlock *StoreBB, BasicBlock *EndBB) { +static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB, + BasicBlock *StoreBB, BasicBlock *EndBB) { StoreInst *StoreToHoist = dyn_cast<StoreInst>(I); if (!StoreToHoist) return 0; @@ -1522,18 +1537,23 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB) { Value *OrigV = PN->getIncomingValueForBlock(BB); Value *ThenV = PN->getIncomingValueForBlock(ThenBB); + // FIXME: Try to remove some of the duplication with HoistThenElseCodeToIf. 
// Skip PHIs which are trivial. if (ThenV == OrigV) continue; HaveRewritablePHIs = true; - ConstantExpr *CE = dyn_cast<ConstantExpr>(ThenV); - if (!CE) + ConstantExpr *OrigCE = dyn_cast<ConstantExpr>(OrigV); + ConstantExpr *ThenCE = dyn_cast<ConstantExpr>(ThenV); + if (!OrigCE && !ThenCE) continue; // Known safe and cheap. - if (!isSafeToSpeculativelyExecute(CE)) + if ((ThenCE && !isSafeToSpeculativelyExecute(ThenCE)) || + (OrigCE && !isSafeToSpeculativelyExecute(OrigCE))) return false; - if (ComputeSpeculationCost(CE) > PHINodeFoldingThreshold) + unsigned OrigCost = OrigCE ? ComputeSpeculationCost(OrigCE) : 0; + unsigned ThenCost = ThenCE ? ComputeSpeculationCost(ThenCE) : 0; + if (OrigCost + ThenCost > 2 * PHINodeFoldingThreshold) return false; // Account for the cost of an unfolded ConstantExpr which could end up @@ -3643,7 +3663,7 @@ bool SwitchLookupTable::WouldFitInRegister(const DataLayout *TD, } /// ShouldBuildLookupTable - Determine whether a lookup table should be built -/// for this switch, based on the number of caes, size of the table and the +/// for this switch, based on the number of cases, size of the table and the /// types of the results. static bool ShouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize, diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp index b5941bd..457fc80 100644 --- a/lib/Transforms/Utils/ValueMapper.cpp +++ b/lib/Transforms/Utils/ValueMapper.cpp @@ -22,14 +22,22 @@ using namespace llvm; // Out of line method to get vtable etc for class. void ValueMapTypeRemapper::anchor() {} +void ValueMaterializer::anchor() {} Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, - ValueMapTypeRemapper *TypeMapper) { + ValueMapTypeRemapper *TypeMapper, + ValueMaterializer *Materializer) { ValueToValueMapTy::iterator I = VM.find(V); // If the value already exists in the map, use it. if (I != VM.end() && I->second) return I->second; + // If we have a materializer and it can materialize a value, use that. + if (Materializer) { + if (Value *NewV = Materializer->materializeValueFor(const_cast<Value*>(V))) + return VM[V] = NewV; + } + // Global values do not need to be seeded into the VM if they // are using the identity mapping. if (isa<GlobalValue>(V) || isa<MDString>(V)) @@ -57,14 +65,14 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, return VM[V] = const_cast<Value*>(V); // Create a dummy node in case we have a metadata cycle. - MDNode *Dummy = MDNode::getTemporary(V->getContext(), ArrayRef<Value*>()); + MDNode *Dummy = MDNode::getTemporary(V->getContext(), None); VM[V] = Dummy; // Check all operands to see if any need to be remapped. for (unsigned i = 0, e = MD->getNumOperands(); i != e; ++i) { Value *OP = MD->getOperand(i); if (OP == 0) continue; - Value *Mapped_OP = MapValue(OP, VM, Flags, TypeMapper); + Value *Mapped_OP = MapValue(OP, VM, Flags, TypeMapper, Materializer); // Use identity map if Mapped_Op is null and we can ignore missing // entries. if (Mapped_OP == OP || @@ -79,7 +87,7 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, if (Op == 0) Elts.push_back(0); else { - Value *Mapped_Op = MapValue(Op, VM, Flags, TypeMapper); + Value *Mapped_Op = MapValue(Op, VM, Flags, TypeMapper, Materializer); // Use identity map if Mapped_Op is null and we can ignore missing // entries. 
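Stepping back from this hunk: the ValueMaterializer hook now threaded through every MapValue call lets a client synthesize a mapping the first time a value is referenced, instead of pre-seeding the whole value map. A hypothetical subclass as a usage sketch (the class name and the declaration-only policy are illustrative; only the materializeValueFor virtual from this patch is assumed):

#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

using namespace llvm;

// Illustrative sketch: when mapping reaches a function from a source module,
// materialize a matching declaration in the destination module so the
// reference can be rewritten before any bodies are copied.
class LazyDeclMaterializer : public ValueMaterializer {
  Module &Dest;
public:
  explicit LazyDeclMaterializer(Module &M) : Dest(M) {}
  virtual Value *materializeValueFor(Value *V) {
    if (Function *F = dyn_cast<Function>(V))
      return Function::Create(F->getFunctionType(), F->getLinkage(),
                              F->getName(), &Dest);
    return 0; // anything else: fall back to the normal mapping rules
  }
};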
if (Mapped_Op == 0 && (Flags & RF_IgnoreMissingEntries)) @@ -109,9 +117,9 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, if (BlockAddress *BA = dyn_cast<BlockAddress>(C)) { Function *F = - cast<Function>(MapValue(BA->getFunction(), VM, Flags, TypeMapper)); + cast<Function>(MapValue(BA->getFunction(), VM, Flags, TypeMapper, Materializer)); BasicBlock *BB = cast_or_null<BasicBlock>(MapValue(BA->getBasicBlock(), VM, - Flags, TypeMapper)); + Flags, TypeMapper, Materializer)); return VM[V] = BlockAddress::get(F, BB ? BB : BA->getBasicBlock()); } @@ -121,7 +129,7 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, Value *Mapped = 0; for (; OpNo != NumOperands; ++OpNo) { Value *Op = C->getOperand(OpNo); - Mapped = MapValue(Op, VM, Flags, TypeMapper); + Mapped = MapValue(Op, VM, Flags, TypeMapper, Materializer); if (Mapped != C) break; } @@ -149,7 +157,7 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, // Map the rest of the operands that aren't processed yet. for (++OpNo; OpNo != NumOperands; ++OpNo) Ops.push_back(MapValue(cast<Constant>(C->getOperand(OpNo)), VM, - Flags, TypeMapper)); + Flags, TypeMapper, Materializer)); } if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) @@ -173,10 +181,11 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, /// current values into those specified by VMap. /// void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap, - RemapFlags Flags, ValueMapTypeRemapper *TypeMapper){ + RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, + ValueMaterializer *Materializer){ // Remap operands. for (User::op_iterator op = I->op_begin(), E = I->op_end(); op != E; ++op) { - Value *V = MapValue(*op, VMap, Flags, TypeMapper); + Value *V = MapValue(*op, VMap, Flags, TypeMapper, Materializer); // If we aren't ignoring missing entries, assert that something happened. if (V != 0) *op = V; @@ -204,7 +213,7 @@ void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap, for (SmallVectorImpl<std::pair<unsigned, MDNode *> >::iterator MI = MDs.begin(), ME = MDs.end(); MI != ME; ++MI) { MDNode *Old = MI->second; - MDNode *New = MapValue(Old, VMap, Flags, TypeMapper); + MDNode *New = MapValue(Old, VMap, Flags, TypeMapper, Materializer); if (New != Old) I->setMetadata(MI->first, New); } diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 9a832f7..3693f4a 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8,9 +8,9 @@ //===----------------------------------------------------------------------===// // // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops -// and generates target-independent LLVM-IR. Legalization of the IR is done -// in the codegen. However, the vectorizer uses (will use) the codegen -// interfaces to generate IR that is likely to result in an optimal binary. +// and generates target-independent LLVM-IR. +// The vectorizer uses the TargetTransformInfo analysis to estimate the costs +// of instructions in order to estimate the profitability of vectorization. // // The loop vectorizer combines consecutive loop iterations into a single // 'wide' iteration. 
After this transformation the index is incremented @@ -80,6 +80,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/PatternMatch.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/ValueHandle.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -118,11 +119,11 @@ static const unsigned TinyTripCountUnrollThreshold = 128; /// than this number of comparisons. static const unsigned RuntimeMemoryCheckThreshold = 8; -/// We use a metadata with this name to indicate that a scalar loop was -/// vectorized and that we don't need to re-vectorize it if we run into it -/// again. -static const char* -AlreadyVectorizedMDName = "llvm.vectorizer.already_vectorized"; +/// Maximum SIMD width. +static const unsigned MaxVectorWidth = 64; + +/// Maximum vectorization unroll count. +static const unsigned MaxUnrollFactor = 16; namespace { @@ -216,7 +217,7 @@ private: /// This function adds 0, 1, 2 ... to each vector element, starting at zero. /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...). /// The sequence starts at StartIndex. - Value *getConsecutiveVector(Value* Val, unsigned StartIdx, bool Negate); + Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate); /// When we go over instructions in the basic block we rely on previous /// values within the current basic block or on loop invariant values. @@ -312,10 +313,85 @@ private: PHINode *Induction; /// The induction variable of the old basic block. PHINode *OldInduction; + /// Holds the extended (to the widest induction type) start index. + Value *ExtendedIdx; /// Maps scalars to widened vectors. ValueMap WidenMap; }; +/// \brief Check if conditionally executed loads are hoistable. +/// +/// This class has two functions: isHoistableLoad and canHoistAllLoads. +/// isHoistableLoad should be called on all load instructions that are executed +/// conditionally. After all conditional loads are processed, the client should +/// call canHoistAllLoads to determine if all of the conditionally executed +/// loads have an unconditional memory access to the same address in the loop. +class LoadHoisting { + typedef SmallPtrSet<Value *, 8> MemorySet; + + Loop *TheLoop; + DominatorTree *DT; + MemorySet CondLoadAddrSet; + +public: + LoadHoisting(Loop *L, DominatorTree *D) : TheLoop(L), DT(D) {} + + /// \brief Check if the instruction is a load with an identifiable address. + bool isHoistableLoad(Instruction *L); + + /// \brief Check if all of the conditional loads are hoistable because there + /// exists an unconditional memory access to the same address in the loop. + bool canHoistAllLoads(); +}; + +bool LoadHoisting::isHoistableLoad(Instruction *L) { + LoadInst *LI = dyn_cast<LoadInst>(L); + if (!LI) + return false; + + CondLoadAddrSet.insert(LI->getPointerOperand()); + return true; +} + +static void addMemAccesses(BasicBlock *BB, SmallPtrSet<Value *, 8> &Set) { + for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE; ++BI) { + if (LoadInst *LI = dyn_cast<LoadInst>(BI)) // Try a load. + Set.insert(LI->getPointerOperand()); + else if (StoreInst *SI = dyn_cast<StoreInst>(BI)) // Try a store. + Set.insert(SI->getPointerOperand()); + } +} + +bool LoadHoisting::canHoistAllLoads() { + // No conditional loads.
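Put differently, a conditionally executed load is only considered hoistable when a block that runs on every iteration, i.e. one that dominates the latch, accesses the same address. A standalone sketch of the whole check with hypothetical mini types; canHoistAllLoads itself continues below, starting with the trivial case of no conditional loads:

#include <set>
#include <string>
#include <vector>

// Hypothetical mini model: an "address" is a name; a block knows its memory
// accesses and whether it executes unconditionally (dominates the latch).
struct MiniBlock {
  bool RunsEveryIteration;
  std::vector<std::string> Accesses;
};

static bool canHoistAll(const std::set<std::string> &CondLoadAddrs,
                        const std::vector<MiniBlock> &LoopBlocks) {
  std::set<std::string> Uncond;
  for (unsigned I = 0, E = LoopBlocks.size(); I != E; ++I)
    if (LoopBlocks[I].RunsEveryIteration)
      Uncond.insert(LoopBlocks[I].Accesses.begin(),
                    LoopBlocks[I].Accesses.end());

  // Every conditionally loaded address needs an unconditional twin.
  for (std::set<std::string>::const_iterator I = CondLoadAddrs.begin(),
       E = CondLoadAddrs.end(); I != E; ++I)
    if (!Uncond.count(*I))
      return false;
  return true;
}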
+ if (CondLoadAddrSet.empty()) + return true; + + MemorySet UncondMemAccesses; + std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector(); + BasicBlock *LoopLatch = TheLoop->getLoopLatch(); + + // Iterate over the unconditional blocks and collect memory access addresses. + for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) { + BasicBlock *BB = LoopBlocks[i]; + + // Ignore conditional blocks. + if (BB != LoopLatch && !DT->dominates(BB, LoopLatch)) + continue; + + addMemAccesses(BB, UncondMemAccesses); + } + + // And make sure there is a matching unconditional access for every + // conditional load. + for (MemorySet::iterator MI = CondLoadAddrSet.begin(), + ME = CondLoadAddrSet.end(); MI != ME; ++MI) + if (!UncondMemAccesses.count(*MI)) + return false; + + return true; +} + /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and /// to what vectorization factor. /// This class does not look at the profitability of vectorization, only the @@ -335,7 +411,8 @@ public: DominatorTree *DT, TargetTransformInfo* TTI, AliasAnalysis *AA, TargetLibraryInfo *TLI) : TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), TLI(TLI), - Induction(0) {} + Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false), + LoadSpeculation(L, DT) {} /// This enum represents the kinds of reductions that we support. enum ReductionKind { @@ -347,7 +424,8 @@ public: RK_IntegerXor, ///< Bitwise or logical XOR of numbers. RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()). RK_FloatAdd, ///< Sum of floats. - RK_FloatMult ///< Product of floats. + RK_FloatMult, ///< Product of floats. + RK_FloatMinMax ///< Min/max implemented in terms of select(cmp()). }; /// This enum represents the kinds of inductions that we support. @@ -365,7 +443,9 @@ public: MRK_UIntMin, MRK_UIntMax, MRK_SIntMin, - MRK_SIntMax + MRK_SIntMax, + MRK_FloatMin, + MRK_FloatMax }; /// This POD struct holds information about reduction variables. @@ -379,7 +459,7 @@ public: // The starting value of the reduction. // It does not have to be zero! - Value *StartValue; + TrackingVH<Value> StartValue; // The instruction who's value is used outside the loop. Instruction *LoopExitInstr; // The kind of the reduction. @@ -424,7 +504,7 @@ public: /// This flag indicates if we need to add the runtime check. bool Need; /// Holds the pointers that we need to check. - SmallVector<Value*, 2> Pointers; + SmallVector<TrackingVH<Value>, 2> Pointers; /// Holds the pointer value at the beginning of the loop. SmallVector<const SCEV*, 2> Starts; /// Holds the pointer value at the end of the loop. @@ -438,7 +518,7 @@ public: InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {} InductionInfo() : StartValue(0), IK(IK_NoInduction) {} /// Start value. - Value *StartValue; + TrackingVH<Value> StartValue; /// Induction kind. InductionKind IK; }; @@ -470,6 +550,9 @@ public: /// Returns the induction variables found in the loop. InductionList *getInductionVars() { return &Inductions; } + /// Returns the widest induction type. + Type *getWidestInductionType() { return WidestIndTy; } + /// Returns True if V is an induction variable in this loop. bool isInductionVariable(const Value *V); @@ -498,8 +581,7 @@ public: /// This function returns the identity element (or neutral element) for /// the operation K. 
- static Constant *getReductionIdentity(ReductionKind K, Type *Tp, - MinMaxReductionKind MinMaxK); + static Constant *getReductionIdentity(ReductionKind K, Type *Tp); private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -577,6 +659,8 @@ private: /// Notice that inductions don't need to start at zero and that induction /// variables can be pointers. InductionList Inductions; + /// Holds the widest induction type encountered. + Type *WidestIndTy; /// Allowed outside users. This holds the reduction /// vars which can be accessed from outside the loop. @@ -587,6 +671,11 @@ private: /// We need to check that all of the pointers in this list are disjoint /// at runtime. RuntimePointerCheck PtrRtCheck; + /// Can we assume the absence of NaNs. + bool HasFunNoNaNAttr; + + /// Utility to determine whether loads can be speculated. + LoadHoisting LoadSpeculation; }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -679,6 +768,126 @@ private: const TargetLibraryInfo *TLI; }; +/// Utility class for getting and setting loop vectorizer hints in the form +/// of loop metadata. +struct LoopVectorizeHints { + /// Vectorization width. + unsigned Width; + /// Vectorization unroll factor. + unsigned Unroll; + + LoopVectorizeHints(const Loop *L) + : Width(VectorizationFactor) + , Unroll(VectorizationUnroll) + , LoopID(L->getLoopID()) { + getHints(L); + // The command line options override any loop metadata except for when + // width == 1 which is used to indicate the loop is already vectorized. + if (VectorizationFactor.getNumOccurrences() > 0 && Width != 1) + Width = VectorizationFactor; + if (VectorizationUnroll.getNumOccurrences() > 0) + Unroll = VectorizationUnroll; + } + + /// Return the loop vectorizer metadata prefix. + static StringRef Prefix() { return "llvm.vectorizer."; } + + MDNode *createHint(LLVMContext &Context, StringRef Name, unsigned V) { + SmallVector<Value*, 2> Vals; + Vals.push_back(MDString::get(Context, Name)); + Vals.push_back(ConstantInt::get(Type::getInt32Ty(Context), V)); + return MDNode::get(Context, Vals); + } + + /// Mark the loop L as already vectorized by setting the width to 1. + void setAlreadyVectorized(Loop *L) { + LLVMContext &Context = L->getHeader()->getContext(); + + Width = 1; + + // Create a new loop id with one more operand for the already_vectorized + // hint. If the loop already has a loop id then copy the existing operands. + SmallVector<Value*, 4> Vals(1); + if (LoopID) + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) + Vals.push_back(LoopID->getOperand(i)); + + Vals.push_back(createHint(Context, Twine(Prefix(), "width").str(), Width)); + + MDNode *NewLoopID = MDNode::get(Context, Vals); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + + L->setLoopID(NewLoopID); + if (LoopID) + LoopID->replaceAllUsesWith(NewLoopID); + + LoopID = NewLoopID; + } + +private: + MDNode *LoopID; + + /// Find hints specified in the loop metadata. + void getHints(const Loop *L) { + if (!LoopID) + return; + + // First operand should refer to the loop id itself. 
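For orientation, the metadata shape being walked here, sketched in 2013-era textual IR (an assumption: metadata operands still carry the metadata keyword in this release); the asserts that follow check precisely the operand-0 self-reference:

// A loop id carrying a single width hint:
//   !0 = metadata !{metadata !0, metadata !1}  ; operand 0 is the node itself
//   !1 = metadata !{metadata !"llvm.vectorizer.width", i32 8}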
+ assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); + assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); + + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + const MDString *S = 0; + SmallVector<Value*, 4> Args; + + // The expected hint is either a MDString or a MDNode with the first + // operand a MDString. + if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) { + if (!MD || MD->getNumOperands() == 0) + continue; + S = dyn_cast<MDString>(MD->getOperand(0)); + for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i) + Args.push_back(MD->getOperand(i)); + } else { + S = dyn_cast<MDString>(LoopID->getOperand(i)); + assert(Args.size() == 0 && "too many arguments for MDString"); + } + + if (!S) + continue; + + // Check if the hint starts with the vectorizer prefix. + StringRef Hint = S->getString(); + if (!Hint.startswith(Prefix())) + continue; + // Remove the prefix. + Hint = Hint.substr(Prefix().size(), StringRef::npos); + + if (Args.size() == 1) + getHint(Hint, Args[0]); + } + } + + // Check string hint with one operand. + void getHint(StringRef Hint, Value *Arg) { + const ConstantInt *C = dyn_cast<ConstantInt>(Arg); + if (!C) return; + unsigned Val = C->getZExtValue(); + + if (Hint == "width") { + assert(isPowerOf2_32(Val) && Val <= MaxVectorWidth && + "Invalid width metadata"); + Width = Val; + } else if (Hint == "unroll") { + assert(isPowerOf2_32(Val) && Val <= MaxUnrollFactor && + "Invalid unroll metadata"); + Unroll = Val; + } else + DEBUG(dbgs() << "LV: ignoring unknown hint " << Hint); + } +}; + /// The LoopVectorize Pass. struct LoopVectorize : public LoopPass { /// Pass identification, replacement for typeid @@ -717,6 +926,13 @@ struct LoopVectorize : public LoopPass { DEBUG(dbgs() << "LV: Checking a loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); + LoopVectorizeHints Hints(L); + + if (Hints.Width == 1) { + DEBUG(dbgs() << "LV: Not vectorizing.\n"); + return false; + } + // Check if it is legal to vectorize the loop. LoopVectorizationLegality LVL(L, SE, DL, DT, TTI, AA, TLI); if (!LVL.canVectorize()) { @@ -744,10 +960,10 @@ struct LoopVectorize : public LoopPass { // Select the optimal vectorization factor. LoopVectorizationCostModel::VectorizationFactor VF; - VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor); + VF = CM.selectVectorizationFactor(OptForSize, Hints.Width); // Select the unroll factor. - unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll, - VF.Width, VF.Cost); + unsigned UF = CM.selectUnrollFactor(OptForSize, Hints.Unroll, VF.Width, + VF.Cost); if (VF.Width == 1) { DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); @@ -762,6 +978,9 @@ struct LoopVectorize : public LoopPass { InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF); LB.vectorize(&LVL); + // Mark the loop as already vectorized to avoid vectorizing again. 
+ Hints.setAlreadyVectorized(L); + DEBUG(verifyFunction(*L->getHeader()->getParent())); return true; } @@ -794,7 +1013,7 @@ LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE, const SCEV *Sc = SE->getSCEV(Ptr); const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc); assert(AR && "Invalid addrec expression"); - const SCEV *Ex = SE->getExitCount(Lp, Lp->getLoopLatch()); + const SCEV *Ex = SE->getBackedgeTakenCount(Lp); const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE); Pointers.push_back(Ptr); Starts.push_back(AR->getStart()); @@ -825,7 +1044,7 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { return Shuf; } -Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, unsigned StartIdx, +Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx, bool Negate) { assert(Val->getType()->isVectorTy() && "Must be a vector"); assert(Val->getType()->getScalarType()->isIntegerTy() && @@ -838,8 +1057,8 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, unsigned StartIdx, // Create a vector of consecutive numbers from zero to VF. for (int i = 0; i < VLen; ++i) { - int Idx = Negate ? (-i): i; - Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx)); + int64_t Idx = Negate ? (-i) : i; + Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx, Negate)); } // Add the consecutive indices to the vector value. @@ -1229,21 +1448,15 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { BasicBlock *ExitBlock = OrigLoop->getExitBlock(); assert(ExitBlock && "Must have an exit block"); - // Mark the old scalar loop with metadata that tells us not to vectorize this - // loop again if we run into it. - MDNode *MD = MDNode::get(OldBasicBlock->getContext(), ArrayRef<Value*>()); - OldBasicBlock->getTerminator()->setMetadata(AlreadyVectorizedMDName, MD); - // Some loops have a single integer induction variable, while other loops // don't. One example is c++ iterators that often have multiple pointer // induction variables. In the code below we also support a case where we // don't have a single induction variable. OldInduction = Legal->getInduction(); - Type *IdxTy = OldInduction ? OldInduction->getType() : - DL->getIntPtrType(SE->getContext()); + Type *IdxTy = Legal->getWidestInductionType(); // Find the loop boundaries. - const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getLoopLatch()); + const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop); assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); // Get the total trip count from the count by adding 1. @@ -1261,9 +1474,11 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // The loop index does not have to start at Zero. Find the original start // value from the induction PHI node. If we don't have an induction variable // then we know that it starts at zero. - Value *StartIdx = OldInduction ? - OldInduction->getIncomingValueForBlock(BypassBlock): - ConstantInt::get(IdxTy, 0); + Builder.SetInsertPoint(BypassBlock->getTerminator()); + Value *StartIdx = ExtendedIdx = OldInduction ? 
+ Builder.CreateZExt(OldInduction->getIncomingValueForBlock(BypassBlock), + IdxTy): + ConstantInt::get(IdxTy, 0); assert(BypassBlock && "Invalid loop structure"); LoopBypassBlocks.push_back(BypassBlock); @@ -1357,76 +1572,101 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { PHINode *ResumeIndex = 0; LoopVectorizationLegality::InductionList::iterator I, E; LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); + // Set builder to point to last bypass block. + BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator()); for (I = List->begin(), E = List->end(); I != E; ++I) { PHINode *OrigPhi = I->first; LoopVectorizationLegality::InductionInfo II = I->second; - PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val", + + Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType(); + PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val", MiddleBlock->getTerminator()); + // We might have extended the type of the induction variable but we need a + // truncated version for the scalar loop. + PHINode *TruncResumeVal = (OrigPhi == OldInduction) ? + PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val", + MiddleBlock->getTerminator()) : 0; + Value *EndValue = 0; switch (II.IK) { case LoopVectorizationLegality::IK_NoInduction: llvm_unreachable("Unknown induction"); case LoopVectorizationLegality::IK_IntInduction: { - // Handle the integer induction counter: + // Handle the integer induction counter. assert(OrigPhi->getType()->isIntegerTy() && "Invalid type"); - assert(OrigPhi == OldInduction && "Unknown integer PHI"); - // We know what the end value is. - EndValue = IdxEndRoundDown; - // We also know which PHI node holds it. - ResumeIndex = ResumeVal; + + // We have the canonical induction variable. + if (OrigPhi == OldInduction) { + // Create a truncated version of the resume value for the scalar loop, + // we might have promoted the type to a larger width. + EndValue = + BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType()); + // The new PHI merges the original incoming value, in case of a bypass, + // or the value at the end of the vectorized loop. + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); + TruncResumeVal->addIncoming(EndValue, VecBody); + + // We know what the end value is. + EndValue = IdxEndRoundDown; + // We also know which PHI node holds it. + ResumeIndex = ResumeVal; + break; + } + + // Not the canonical induction variable - add the vector loop count to the + // start value. + Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, + II.StartValue->getType(), + "cast.crd"); + EndValue = BypassBuilder.CreateAdd(CRD, II.StartValue , "ind.end"); break; } case LoopVectorizationLegality::IK_ReverseIntInduction: { // Convert the CountRoundDown variable to the PHI size. 
- unsigned CRDSize = CountRoundDown->getType()->getScalarSizeInBits(); - unsigned IISize = II.StartValue->getType()->getScalarSizeInBits(); - Value *CRD = CountRoundDown; - if (CRDSize > IISize) - CRD = CastInst::Create(Instruction::Trunc, CountRoundDown, - II.StartValue->getType(), "tr.crd", - LoopBypassBlocks.back()->getTerminator()); - else if (CRDSize < IISize) - CRD = CastInst::Create(Instruction::SExt, CountRoundDown, - II.StartValue->getType(), - "sext.crd", - LoopBypassBlocks.back()->getTerminator()); - // Handle reverse integer induction counter: - EndValue = - BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end", - LoopBypassBlocks.back()->getTerminator()); + Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, + II.StartValue->getType(), + "cast.crd"); + // Handle reverse integer induction counter. + EndValue = BypassBuilder.CreateSub(II.StartValue, CRD, "rev.ind.end"); break; } case LoopVectorizationLegality::IK_PtrInduction: { // For pointer induction variables, calculate the offset using // the end index. - EndValue = - GetElementPtrInst::Create(II.StartValue, CountRoundDown, "ptr.ind.end", - LoopBypassBlocks.back()->getTerminator()); + EndValue = BypassBuilder.CreateGEP(II.StartValue, CountRoundDown, + "ptr.ind.end"); break; } case LoopVectorizationLegality::IK_ReversePtrInduction: { // The value at the end of the loop for the reverse pointer is calculated // by creating a GEP with a negative index starting from the start value. Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0); - Value *NegIdx = BinaryOperator::CreateSub(Zero, CountRoundDown, - "rev.ind.end", - LoopBypassBlocks.back()->getTerminator()); - EndValue = GetElementPtrInst::Create(II.StartValue, NegIdx, - "rev.ptr.ind.end", - LoopBypassBlocks.back()->getTerminator()); + Value *NegIdx = BypassBuilder.CreateSub(Zero, CountRoundDown, + "rev.ind.end"); + EndValue = BypassBuilder.CreateGEP(II.StartValue, NegIdx, + "rev.ptr.ind.end"); break; } }// end of case // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. - for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) - ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) { + if (OrigPhi == OldInduction) + ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]); + else + ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); + } ResumeVal->addIncoming(EndValue, VecBody); // Fix the scalar body counter (PHI node). unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); - OrigPhi->setIncomingValue(BlockIdx, ResumeVal); + // The old inductions phi node in the scalar body needs the truncated value. + if (OrigPhi == OldInduction) + OrigPhi->setIncomingValue(BlockIdx, TruncResumeVal); + else + OrigPhi->setIncomingValue(BlockIdx, ResumeVal); } // If we are generating a new induction variable then we also need to @@ -1501,8 +1741,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { /// This function returns the identity element (or neutral element) for /// the operation K. 
/// This function returns the identity element (or neutral element) for /// the operation K. Constant* -LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp, - MinMaxReductionKind MinMaxK) { +LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) { switch (K) { case RK_IntegerXor: case RK_IntegerAdd: @@ -1521,24 +1760,6 @@ LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp, case RK_FloatAdd: // Adding zero to a number does not change it. return ConstantFP::get(Tp, 0.0L); - case RK_IntegerMinMax: - switch(MinMaxK) { - default: llvm_unreachable("Unknown min/max predicate"); - case MRK_UIntMin: - return ConstantInt::getAllOnesValue(Tp); - case MRK_UIntMax: - return ConstantInt::get(Tp, 0); - case MRK_SIntMin: { - unsigned BitWidth = Tp->getPrimitiveSizeInBits(); - return ConstantInt::get(Tp->getContext(), - APInt::getSignedMaxValue(BitWidth)); - } - case LoopVectorizationLegality::MRK_SIntMax: { - unsigned BitWidth = Tp->getPrimitiveSizeInBits(); - return ConstantInt::get(Tp->getContext(), - APInt::getSignedMinValue(BitWidth)); - } - } default: llvm_unreachable("Unknown reduction kind"); } @@ -1668,6 +1889,8 @@ getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) { return Instruction::FAdd; case LoopVectorizationLegality::RK_IntegerMinMax: return Instruction::ICmp; + case LoopVectorizationLegality::RK_FloatMinMax: + return Instruction::FCmp; default: llvm_unreachable("Unknown reduction operation"); } @@ -1692,8 +1915,21 @@ Value *createMinMaxOp(IRBuilder<> &Builder, break; case LoopVectorizationLegality::MRK_SIntMax: P = CmpInst::ICMP_SGT; + break; + case LoopVectorizationLegality::MRK_FloatMin: + P = CmpInst::FCMP_OLT; + break; + case LoopVectorizationLegality::MRK_FloatMax: + P = CmpInst::FCMP_OGT; + break; } - Value *Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp"); + + Value *Cmp; + if (RK == LoopVectorizationLegality::MRK_FloatMin || RK == LoopVectorizationLegality::MRK_FloatMax) + Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp"); + else + Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp"); + Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select"); return Select; } @@ -1761,16 +1997,24 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // Find the reduction identity variable. Zero for addition, or, xor, // one for multiplication, -1 for And. - Constant *Iden = - LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind, - VecTy->getScalarType(), - RdxDesc.MinMaxKind); - Constant *Identity = ConstantVector::getSplat(VF, Iden); - - // This vector is the Identity vector where the first element is the - // incoming scalar reduction. - Value *VectorStart = Builder.CreateInsertElement(Identity, - RdxDesc.StartValue, Zero); + Value *Identity; + Value *VectorStart; + if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax || + RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) { + // MinMax reductions have the start value as their identity. + VectorStart = Identity = Builder.CreateVectorSplat(VF, RdxDesc.StartValue, + "minmax.ident"); + } else { + Constant *Iden = + LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind, + VecTy->getScalarType()); + Identity = ConstantVector::getSplat(VF, Iden); + + // This vector is the Identity vector where the first element is the + // incoming scalar reduction. + VectorStart = Builder.CreateInsertElement(Identity, + RdxDesc.StartValue, Zero); + } // Fix the vector-loop phi.
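The "start value as identity" trick used above rests on min(s, s) == s: a vector lane seeded with the start value can never perturb the final result, whereas add, mul, and, or and xor reductions need a true neutral element. A plain C++ walk-through of the idea, with made-up values and four "lanes" standing in for a VF of 4:

#include <algorithm>
#include <cstdio>

int main() {
  int a[8] = {7, 3, 9, 1, 4, 8, 2, 6};
  int start = 5;
  // Each lane is seeded with the start value, mirroring
  // Builder.CreateVectorSplat(VF, RdxDesc.StartValue, "minmax.ident").
  int lane[4] = {start, start, start, start};
  for (int i = 0; i < 8; i += 4)
    for (int l = 0; l < 4; ++l)
      lane[l] = std::min(lane[l], a[i + l]);
  // Horizontal reduction of the lanes.
  int rdx = std::min(std::min(lane[0], lane[1]),
                     std::min(lane[2], lane[3]));
  printf("min = %d\n", rdx); // prints 1, same as the scalar loop
  return 0;
}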
// We created the induction variable so we know that the @@ -1784,7 +2028,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch); VectorParts &Val = getVectorValue(LoopVal); for (unsigned part = 0; part < UF; ++part) { - // Make sure to add the reduction stat value only to the + // Make sure to add the reduction start value only to the // first unroll part. Value *StartVal = (part == 0) ? VectorStart : Identity; cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader); @@ -1814,7 +2058,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { Value *ReducedPartRdx = RdxParts[0]; unsigned Op = getReductionBinOp(RdxDesc.Kind); for (unsigned part = 1; part < UF; ++part) { - if (Op != Instruction::ICmp) + if (Op != Instruction::ICmp && Op != Instruction::FCmp) ReducedPartRdx = Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part], ReducedPartRdx, "bin.rdx"); @@ -1845,7 +2089,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { ConstantVector::get(ShuffleMask), "rdx.shuf"); - if (Op != Instruction::ICmp) + if (Op != Instruction::ICmp && Op != Instruction::FCmp) TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"); else @@ -1982,18 +2226,33 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, // We know that all PHIs in non header blocks are converted into // selects, so we don't have to worry about the insertion order and we // can just use the builder. - // At this point we generate the predication tree. There may be // duplications since this is a simple recursive scan, but future // optimizations will clean it up. - VectorParts Cond = createEdgeMask(P->getIncomingBlock(0), - P->getParent()); - for (unsigned part = 0; part < UF; ++part) { - VectorParts &In0 = getVectorValue(P->getIncomingValue(0)); - VectorParts &In1 = getVectorValue(P->getIncomingValue(1)); - Entry[part] = Builder.CreateSelect(Cond[part], In0[part], In1[part], - "predphi"); + unsigned NumIncoming = P->getNumIncomingValues(); + + // Generate a sequence of selects of the form: + // SELECT(Mask3, In3, + // SELECT(Mask2, In2, + // ( ...))) + for (unsigned In = 0; In < NumIncoming; In++) { + VectorParts Cond = createEdgeMask(P->getIncomingBlock(In), + P->getParent()); + VectorParts &In0 = getVectorValue(P->getIncomingValue(In)); + + for (unsigned part = 0; part < UF; ++part) { + // We might have single edge PHIs (blocks) - use an identity + // 'select' for the first PHI operand. + if (In == 0) + Entry[part] = Builder.CreateSelect(Cond[part], In0[part], + In0[part]); + else + // Select between the current value and the previous incoming edge + // based on the incoming mask. + Entry[part] = Builder.CreateSelect(Cond[part], In0[part], + Entry[part], "predphi"); + } } continue; } @@ -2010,10 +2269,25 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, case LoopVectorizationLegality::IK_NoInduction: llvm_unreachable("Unknown induction"); case LoopVectorizationLegality::IK_IntInduction: { - assert(P == OldInduction && "Unexpected PHI"); - Value *Broadcasted = getBroadcastInstrs(Induction); - // After broadcasting the induction variable we need to make the - // vector consecutive by adding 0, 1, 2 ... + assert(P->getType() == II.StartValue->getType() && "Types must match"); + Type *PhiTy = P->getType(); + Value *Broadcasted; + if (P == OldInduction) { + // Handle the canonical induction variable.
We might have had to + // extend the type. + Broadcasted = Builder.CreateTrunc(Induction, PhiTy); + } else { + // Handle other induction variables that are now based on the + // canonical one. + Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx, + "normalized.idx"); + NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy); + Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx, + "offset.idx"); + } + Broadcasted = getBroadcastInstrs(Broadcasted); + // After broadcasting the induction variable we need to make the vector + // consecutive by adding 0, 1, 2, etc. for (unsigned part = 0; part < UF; ++part) Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false); continue; @@ -2022,16 +2296,7 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, case LoopVectorizationLegality::IK_PtrInduction: case LoopVectorizationLegality::IK_ReversePtrInduction: // Handle reverse integer and pointer inductions. - Value *StartIdx = 0; - // If we have a single integer induction variable then use it. - // Otherwise, start counting at zero. - if (OldInduction) { - LoopVectorizationLegality::InductionInfo OldII = - Legal->getInductionVars()->lookup(OldInduction); - StartIdx = OldII.StartValue; - } else { - StartIdx = ConstantInt::get(Induction->getType(), 0); - } + Value *StartIdx = ExtendedIdx; // This is the normalized GEP that starts counting at zero. Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, "normalized.idx"); @@ -2049,7 +2314,8 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, // After broadcasting the induction variable we need to make the // vector consecutive by adding ... -3, -2, -1, 0. for (unsigned part = 0; part < UF; ++part) - Entry[part] = getConsecutiveVector(Broadcasted, -VF * part, true); + Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part, + true); continue; } @@ -2273,23 +2539,24 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (!isa<BranchInst>(BB->getTerminator())) return false; - // We must have at most two predecessors because we need to convert - // all PHIs to selects. - unsigned Preds = std::distance(pred_begin(BB), pred_end(BB)); - if (Preds > 2) - return false; - // We must be able to predicate all blocks that need to be predicated. if (blockNeedsPredication(BB) && !blockCanBePredicated(BB)) return false; } + // Check that we can actually speculate the hoistable loads. + if (!LoadSpeculation.canHoistAllLoads()) + return false; + // We can if-convert this loop. return true; } bool LoopVectorizationLegality::canVectorize() { - assert(TheLoop->getLoopPreheader() && "No preheader!!"); + // We must have a loop in canonical form. Loops with indirectbr in them cannot + // be canonicalized. + if (!TheLoop->getLoopPreheader()) + return false; // We can only vectorize innermost loops. if (TheLoop->getSubLoopsVector().size()) @@ -2317,7 +2584,7 @@ bool LoopVectorizationLegality::canVectorize() { TheLoop->getHeader()->getName() << "\n"); // ScalarEvolution needs to be able to find the exit count. 
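The change from -VF * part to -(int)VF * part in the reverse-induction path above sidesteps a classic unsigned-arithmetic trap: negating an unsigned value wraps instead of producing a negative number, and the wrapped product only happens to read back as the intended offset if it is later squeezed through a 32-bit signed parameter. A small demonstration of the hazard (hypothetical values):

#include <cstdio>

int main() {
  unsigned VF = 4, part = 2;
  unsigned long long wrong = -VF * part;  // -VF wraps to 4294967292 first
  long long right = -(int)VF * (int)part; // -8, as intended
  printf("%llu %lld\n", wrong, right);    // prints: 4294967288 -8
  return 0;
}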
- const SCEV *ExitCount = SE->getExitCount(TheLoop, Latch); + const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop); if (ExitCount == SE->getCouldNotCompute()) { DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); return false; } @@ -2356,16 +2623,50 @@ bool LoopVectorizationLegality::canVectorize() { return true; } +static Type *convertPointerToIntegerType(DataLayout &DL, Type *Ty) { + if (Ty->isPointerTy()) + return DL.getIntPtrType(Ty->getContext()); + return Ty; +} + +static Type* getWiderType(DataLayout &DL, Type *Ty0, Type *Ty1) { + Ty0 = convertPointerToIntegerType(DL, Ty0); + Ty1 = convertPointerToIntegerType(DL, Ty1); + if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits()) + return Ty0; + return Ty1; +} + +/// \brief Return true if the instruction \p Inst has users outside the loop +/// and is not a recognized reduction variable. +static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, + SmallPtrSet<Value *, 4> &Reductions) { + // Reduction instructions are allowed to have exit users. All other + // instructions must not have external users. + if (!Reductions.count(Inst)) + // Check that all of the users of the instruction are inside the loop. + for (Value::use_iterator I = Inst->use_begin(), E = Inst->use_end(); + I != E; ++I) { + Instruction *U = cast<Instruction>(*I); + // This user may be a reduction exit value. + if (!TheLoop->contains(U)) { + DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); + return true; + } + } + return false; +} + bool LoopVectorizationLegality::canVectorizeInstrs() { BasicBlock *PreHeader = TheLoop->getLoopPreheader(); BasicBlock *Header = TheLoop->getHeader(); - // If we marked the scalar loop as "already vectorized" then no need - // to vectorize it again. - if (Header->getTerminator()->getMetadata(AlreadyVectorizedMDName)) { - DEBUG(dbgs() << "LV: This loop was vectorized before\n"); - return false; - } + // Look for the attribute signaling the absence of NaNs. + Function &F = *Header->getParent(); + if (F.hasFnAttribute("no-nans-fp-math")) + HasFunNoNaNAttr = F.getAttributes().getAttribute( + AttributeSet::FunctionIndex, + "no-nans-fp-math").getValueAsString() == "true"; // For each block in the loop. for (Loop::block_iterator bb = TheLoop->block_begin(), @@ -2376,16 +2677,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { ++it) { if (PHINode *Phi = dyn_cast<PHINode>(it)) { - // This should not happen because the loop should be normalized. - if (Phi->getNumIncomingValues() != 2) { - DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); - return false; - } - + Type *PhiTy = Phi->getType(); // Check that this PHI type is allowed. - if (!Phi->getType()->isIntegerTy() && - !Phi->getType()->isFloatingPointTy() && - !Phi->getType()->isPointerTy()) { + if (!PhiTy->isIntegerTy() && + !PhiTy->isFloatingPointTy() && + !PhiTy->isPointerTy()) { DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); return false; } @@ -2393,8 +2689,19 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // If this PHINode is not in the header block, then we know that we // can convert it to select during if-conversion. No need to check if // the PHIs in this block are induction or reduction variables. - if (*bb != Header) - continue; + if (*bb != Header) { + // Check that this instruction has no outside users or is an + // identified reduction value with an outside user. + if (!hasOutsideLoopUser(TheLoop, it, AllowedExit)) + continue; + return false; + } +
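hasOutsideLoopUser centralizes the rule that only recognized reduction values may be live-out of a vectorizable loop. In source terms it separates shapes like the following (hypothetical function, names purely illustrative):

// 'sum' is a recognizable add reduction, so its final value may escape
// the loop; 'tmp' is not a reduction, so the use after the loop makes
// the loop non-vectorizable.
int liveOut(const int *a, int n, int &last) {
  int sum = 0, tmp = 0;
  for (int i = 0; i < n; ++i) {
    tmp = a[i] * 2;
    sum += tmp;
  }
  last = tmp; // outside user of a non-reduction value
  return sum; // outside user of a reduction: allowed
}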
+ // Header PHIs must have exactly two incoming values; only if-converted + // (non-header) PHIs, handled above, may have more. + if (Phi->getNumIncomingValues() != 2) { + DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); + return false; + } // This is the value coming from the preheader. Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); @@ -2402,13 +2709,19 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { InductionKind IK = isInductionVariable(Phi); if (IK_NoInduction != IK) { + // Get the widest type. + if (!WidestIndTy) + WidestIndTy = convertPointerToIntegerType(*DL, PhiTy); + else + WidestIndTy = getWiderType(*DL, PhiTy, WidestIndTy); + // Int inductions are special because we only allow one IV. if (IK == IK_IntInduction) { - if (Induction) { - DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n"); - return false; - } - Induction = Phi; + // Use the phi node with the widest type as induction. Use the last + // one if there are multiple (no good reason for doing this other + // than it is expedient). + if (!Induction || PhiTy == WidestIndTy) + Induction = Phi; } DEBUG(dbgs() << "LV: Found an induction variable.\n"); @@ -2448,6 +2761,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n"); continue; } + if (AddReductionVar(Phi, RK_FloatMinMax)) { + DEBUG(dbgs() << "LV: Found a float MINMAX reduction PHI."<< *Phi <<"\n"); + continue; + } DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); return false; @@ -2477,24 +2794,17 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Reduction instructions are allowed to have exit users. // All other instructions must not have external users. - if (!AllowedExit.count(it)) - //Check that all of the users of the loop are inside the BB. - for (Value::use_iterator I = it->use_begin(), E = it->use_end(); - I != E; ++I) { - Instruction *U = cast<Instruction>(*I); - // This user may be a reduction exit value. - if (!TheLoop->contains(U)) { - DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); - return false; - } - } + if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) + return false; + } // next instr. } if (!Induction) { DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); - assert(getInductionVars()->size() && "No induction variables"); + if (Inductions.empty()) + return false; } return true; @@ -2523,9 +2833,7 @@ void LoopVectorizationLegality::collectLoopUniforms() { Uniforms.insert(I); // Insert all operands.
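With the no-nans-fp-math attribute read earlier, loops of the following shape can now be recognized as RK_FloatMinMax reductions; without the attribute the fcmp/select pair is rejected, because reassociating min/max in the presence of NaNs can change the result. A hypothetical source shape (clang's fast-math style flags are what typically set the attribute):

// Candidate float-min reduction: an fcmp olt feeding a select, matched
// as MRK_FloatMin once NaNs are assumed absent.
float loopMin(const float *a, int n) {
  float m = a[0];
  for (int i = 1; i < n; ++i)
    m = a[i] < m ? a[i] : m;
  return m;
}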
- for (int i = 0, Op = I->getNumOperands(); i < Op; ++i) { - Worklist.push_back(I->getOperand(i)); - } + Worklist.insert(Worklist.end(), I->op_begin(), I->op_end()); } } @@ -2776,8 +3084,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() { Inst, WriteObjects, MaxByteWidth)) { - DEBUG(dbgs() << "LV: Found a possible write-write reorder:" - << *UI <<"\n"); + DEBUG(dbgs() << "LV: Found a possible write-write reorder:" << **UI + << "\n"); return false; } @@ -2820,8 +3128,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() { Inst, WriteObjects, MaxByteWidth)) { - DEBUG(dbgs() << "LV: Found a possible read-write reorder:" - << *UI <<"\n"); + DEBUG(dbgs() << "LV: Found a possible read-write reorder:" << **UI + << "\n"); return false; } } @@ -2841,6 +3149,26 @@ bool LoopVectorizationLegality::canVectorizeMemory() { return true; } +static bool hasMultipleUsesOf(Instruction *I, + SmallPtrSet<Instruction *, 8> &Insts) { + unsigned NumUses = 0; + for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) { + if (Insts.count(dyn_cast<Instruction>(*Use))) + ++NumUses; + if (NumUses > 1) + return true; + } + + return false; +} + +static bool areAllUsesIn(Instruction *I, SmallPtrSet<Instruction *, 8> &Set) { + for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) + if (!Set.count(dyn_cast<Instruction>(*Use))) + return false; + return true; +} + bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, ReductionKind Kind) { if (Phi->getNumIncomingValues() != 2) @@ -2859,129 +3187,160 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // This includes users of the reduction, variables (which form a cycle // which ends in the phi node). Instruction *ExitInstruction = 0; - // Indicates that we found a binary operation in our scan. - bool FoundBinOp = false; + // Indicates that we found a reduction operation in our scan. + bool FoundReduxOp = false; - // Iter is our iterator. We start with the PHI node and scan for all of the - // users of this instruction. All users must be instructions that can be - // used as reduction variables (such as ADD). We may have a single - // out-of-block user. The cycle must end with the original PHI. - Instruction *Iter = Phi; + // We start with the PHI node and scan for all of the users of this + // instruction. All users must be instructions that can be used as reduction + // variables (such as ADD). We must have a single out-of-block user. The cycle + // must include the original PHI. + bool FoundStartPHI = false; // To recognize min/max patterns formed by a icmp select sequence, we store // the number of instruction we saw from the recognized min/max pattern, - // such that we don't stop when we see the phi has two uses (one by the select - // and one by the icmp) and to make sure we only see exactly the two - // instructions. - unsigned NumICmpSelectPatternInst = 0; + // to make sure we only see exactly the two instructions. + unsigned NumCmpSelectPatternInst = 0; ReductionInstDesc ReduxDesc(false, 0); - // Avoid cycles in the chain. SmallPtrSet<Instruction *, 8> VisitedInsts; - while (VisitedInsts.insert(Iter)) { - // If the instruction has no users then this is a broken - // chain and can't be a reduction variable. - if (Iter->use_empty()) + SmallVector<Instruction *, 8> Worklist; + Worklist.push_back(Phi); + VisitedInsts.insert(Phi); + + // A value in the reduction can be used: + // - By the reduction: + // - Reduction operation: + // - One use of reduction value (safe). 
+ // - Multiple use of reduction value (not safe). + // - PHI: + // - All uses of the PHI must be the reduction (safe). + // - Otherwise, not safe. + // - By one instruction outside of the loop (safe). + // - By further instructions outside of the loop (not safe). + // - By an instruction that is not part of the reduction (not safe). + // This is either: + // * An instruction type other than PHI or the reduction operation. + // * A PHI in the header other than the initial PHI. + while (!Worklist.empty()) { + Instruction *Cur = Worklist.back(); + Worklist.pop_back(); + + // No users. + // If the instruction has no users then this is a broken chain and can't be + // a reduction variable. + if (Cur->use_empty()) return false; - // Did we find a user inside this loop already ? - bool FoundInBlockUser = false; - // Did we reach the initial PHI node already ? - bool FoundStartPHI = false; + bool IsAPhi = isa<PHINode>(Cur); - // Is this a bin op ? - FoundBinOp |= !isa<PHINode>(Iter); + // A header PHI other than the original PHI cannot be part of the cycle. + if (Cur != Phi && IsAPhi && Cur->getParent() == Phi->getParent()) + return false; - // For each of the *users* of iter. - for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end(); - it != e; ++it) { - Instruction *U = cast<Instruction>(*it); - // We already know that the PHI is a user. - if (U == Phi) { - FoundStartPHI = true; - continue; - } + // Reductions of instructions such as Div and Sub are only possible if the + // LHS is the reduction variable. + if (!Cur->isCommutative() && !IsAPhi && !isa<SelectInst>(Cur) && + !isa<ICmpInst>(Cur) && !isa<FCmpInst>(Cur) && + !VisitedInsts.count(dyn_cast<Instruction>(Cur->getOperand(0)))) + return false; + + // Any reduction instruction must be of one of the allowed kinds. + ReduxDesc = isReductionInstr(Cur, Kind, ReduxDesc); + if (!ReduxDesc.IsReduction) + return false; + + // A reduction operation must only have one use of the reduction value. + if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax && + hasMultipleUsesOf(Cur, VisitedInsts)) + return false; + + // All inputs to a PHI node must be reduction values. + if (IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts)) + return false; + + if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(Cur) || + isa<SelectInst>(Cur))) + ++NumCmpSelectPatternInst; + if (Kind == RK_FloatMinMax && (isa<FCmpInst>(Cur) || + isa<SelectInst>(Cur))) + ++NumCmpSelectPatternInst; + + // Check whether we found a reduction operator. + FoundReduxOp |= !IsAPhi; + + // Process users of current instruction. Push non-PHI nodes after PHI nodes + // onto the stack. This way we are going to have seen all inputs to PHI + // nodes once we get to them. + SmallVector<Instruction *, 8> NonPHIs; + SmallVector<Instruction *, 8> PHIs; + for (Value::use_iterator UI = Cur->use_begin(), E = Cur->use_end(); UI != E; + ++UI) { + Instruction *Usr = cast<Instruction>(*UI); // Check if we found the exit user. - BasicBlock *Parent = U->getParent(); + BasicBlock *Parent = Usr->getParent(); if (!TheLoop->contains(Parent)) { // Exit if you find multiple outside users. if (ExitInstruction != 0) return false; - ExitInstruction = Iter; - } - - // We allow in-loop PHINodes which are not the original reduction PHI - // node. If this PHI is the only user of Iter (happens in IF w/ no ELSE - // structure) then don't skip this PHI.
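The switch from the old single-cursor scan to an explicit worklist lets the reduction cycle pass through intermediate, non-header PHIs, which is exactly what if/else bodies produce before if-conversion. A hypothetical source shape the new scan accepts:

// 'sum' flows through two different operations and a merge PHI before
// reaching the header PHI again; the worklist visits the merge PHI
// after both of its inputs. The subtraction is acceptable because the
// reduction value is its left-hand operand.
int signedSum(const int *a, int n) {
  int sum = 0;
  for (int i = 0; i < n; ++i) {
    if (a[i] > 0)
      sum = sum + a[i];
    else
      sum = sum - a[i];
  }
  return sum;
}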
- if (isa<PHINode>(Iter) && isa<PHINode>(U) && - U->getParent() != TheLoop->getHeader() && - TheLoop->contains(U) && - Iter->hasNUsesOrMore(2)) + ExitInstruction = Cur; continue; + } - // We can't have multiple inside users except for a combination of - // icmp/select both using the phi. - if (FoundInBlockUser && !NumICmpSelectPatternInst) - return false; - FoundInBlockUser = true; - - // Any reduction instr must be of one of the allowed kinds. - ReduxDesc = isReductionInstr(U, Kind, ReduxDesc); - if (!ReduxDesc.IsReduction) - return false; + // Process instructions only once (termination). + if (VisitedInsts.insert(Usr)) { + if (isa<PHINode>(Usr)) + PHIs.push_back(Usr); + else + NonPHIs.push_back(Usr); + } + // Remember that we completed the cycle. + if (Usr == Phi) + FoundStartPHI = true; + } + Worklist.append(PHIs.begin(), PHIs.end()); + Worklist.append(NonPHIs.begin(), NonPHIs.end()); + } - if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(U) || - isa<SelectInst>(U))) - ++NumICmpSelectPatternInst; + // This means we have seen one but not the other instruction of the + // pattern or more than just a select and cmp. + if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) && + NumCmpSelectPatternInst != 2) + return false; - // Reductions of instructions such as Div, and Sub is only - // possible if the LHS is the reduction variable. - if (!U->isCommutative() && !isa<PHINode>(U) && !isa<SelectInst>(U) && - !isa<ICmpInst>(U) && U->getOperand(0) != Iter) - return false; + if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction) + return false; - Iter = ReduxDesc.PatternLastInst; - } + // We found a reduction var if we have reached the original phi node and we + // only have a single instruction with out-of-loop users. - // This means we have seen one but not the other instruction of the - // pattern or more than just a select and cmp. - if (Kind == RK_IntegerMinMax && NumICmpSelectPatternInst != 2) - return false; + // This instruction is allowed to have out-of-loop users. + AllowedExit.insert(ExitInstruction); - // We found a reduction var if we have reached the original - // phi node and we only have a single instruction with out-of-loop - // users. - if (FoundStartPHI) { - // This instruction is allowed to have out-of-loop users. - AllowedExit.insert(ExitInstruction); - - // Save the description of this reduction variable. - ReductionDescriptor RD(RdxStart, ExitInstruction, Kind, - ReduxDesc.MinMaxKind); - Reductions[Phi] = RD; - // We've ended the cycle. This is a reduction variable if we have an - // outside user and it has a binary op. - return FoundBinOp && ExitInstruction; - } - } + // Save the description of this reduction variable. + ReductionDescriptor RD(RdxStart, ExitInstruction, Kind, + ReduxDesc.MinMaxKind); + Reductions[Phi] = RD; + // We've ended the cycle. This is a reduction variable if we have an + // outside user and it has a binary op. - return false; + return true; } /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction /// pattern corresponding to a min(X, Y) or max(X, Y). 
LoopVectorizationLegality::ReductionInstDesc -LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I, ReductionInstDesc &Prev) { +LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I, + ReductionInstDesc &Prev) { - assert((isa<ICmpInst>(I) || isa<SelectInst>(I)) && + assert((isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa<SelectInst>(I)) && "Expect a select instruction"); - ICmpInst *Cmp = 0; + Instruction *Cmp = 0; SelectInst *Select = 0; // We must handle the select(cmp()) as a single instruction. Advance to the // select. - if ((Cmp = dyn_cast<ICmpInst>(I))) { + if ((Cmp = dyn_cast<ICmpInst>(I)) || (Cmp = dyn_cast<FCmpInst>(I))) { if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->use_begin()))) return ReductionInstDesc(false, I); return ReductionInstDesc(Select, Prev.MinMaxKind); @@ -2990,13 +3349,14 @@ LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I, ReductionIns // Only handle single use cases for now. if (!(Select = dyn_cast<SelectInst>(I))) return ReductionInstDesc(false, I); - if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0)))) + if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))) && + !(Cmp = dyn_cast<FCmpInst>(I->getOperand(0)))) return ReductionInstDesc(false, I); if (!Cmp->hasOneUse()) return ReductionInstDesc(false, I); - Value *CmpLeft = Cmp->getOperand(0); - Value *CmpRight = Cmp->getOperand(1); + Value *CmpLeft; + Value *CmpRight; // Look for a min/max pattern. if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) @@ -3007,6 +3367,14 @@ LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I, ReductionIns return ReductionInstDesc(Select, MRK_SIntMax); else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) return ReductionInstDesc(Select, MRK_SIntMin); + else if (m_OrdFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_FloatMin); + else if (m_OrdFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_FloatMax); + else if (m_UnordFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_FloatMin); + else if (m_UnordFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_FloatMax); return ReductionInstDesc(false, I); } @@ -3021,7 +3389,8 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I, default: return ReductionInstDesc(false, I); case Instruction::PHI: - if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd)) + if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd && + Kind != RK_FloatMinMax)) return ReductionInstDesc(false, I); return ReductionInstDesc(I, Prev.MinMaxKind); case Instruction::Sub: @@ -3039,9 +3408,11 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I, return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I); case Instruction::FAdd: return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I); + case Instruction::FCmp: case Instruction::ICmp: case Instruction::Select: - if (Kind != RK_IntegerMinMax) + if (Kind != RK_IntegerMinMax && + (!HasFunNoNaNAttr || Kind != RK_FloatMinMax)) return ReductionInstDesc(false, I); return isMinMaxSelectCmpPattern(I, Prev); } @@ -3106,8 +3477,12 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) { for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { - // We don't predicate loads/stores at the moment. 
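The ordered and unordered matchers used above follow the usual PatternMatch idiom, and both orderings map to the same MinMaxReductionKind because, once NaNs are excluded, ordered and unordered compares agree. A minimal sketch of a caller, assuming the 3.3-era header location of PatternMatch.h:

#include "llvm/Support/PatternMatch.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Recognize select(fcmp olt A, B), A, B, i.e. an fmin-style select;
// A and B bind to the compared operands on a successful match.
static bool isOrderedFloatMin(Value *V) {
  Value *A, *B;
  return m_OrdFMin(m_Value(A), m_Value(B)).match(V);
}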
- if (it->mayReadFromMemory() || it->mayWriteToMemory() || it->mayThrow()) + // We might be able to hoist the load. + if (it->mayReadFromMemory() && !LoadSpeculation.isHoistableLoad(it)) + return false; + + // We don't predicate stores at the moment. + if (it->mayWriteToMemory() || it->mayThrow()) return false; // The instructions below can trap. diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index cc30cc9..40e0098 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -20,6 +20,7 @@ #include "VecUtils.h" #include "llvm/Transforms/Vectorize.h" +#include "llvm/ADT/MapVector.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -47,7 +48,7 @@ namespace { /// The SLPVectorizer Pass. struct SLPVectorizer : public FunctionPass { - typedef std::map<Value*, BoUpSLP::StoreList> StoreListMap; + typedef MapVector<Value*, BoUpSLP::StoreList> StoreListMap; /// Pass identification, replacement for typeid static char ID; @@ -77,6 +78,8 @@ struct SLPVectorizer : public FunctionPass { if (!DL) return false; + DEBUG(dbgs()<<"SLP: Analyzing blocks in " << F.getName() << ".\n"); + for (Function::iterator it = F.begin(), e = F.end(); it != e; ++it) { BasicBlock *BB = it; bool BBChanged = false; diff --git a/lib/Transforms/Vectorize/VecUtils.cpp b/lib/Transforms/Vectorize/VecUtils.cpp index 9b94366..21e6cdd 100644 --- a/lib/Transforms/Vectorize/VecUtils.cpp +++ b/lib/Transforms/Vectorize/VecUtils.cpp @@ -46,7 +46,7 @@ namespace llvm { BoUpSLP::BoUpSLP(BasicBlock *Bb, ScalarEvolution *S, DataLayout *Dl, TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp) : - BB(Bb), SE(S), DL(Dl), TTI(Tti), AA(Aa), L(Lp) { + Builder(S->getContext()), BB(Bb), SE(S), DL(Dl), TTI(Tti), AA(Aa), L(Lp) { numberInstructions(); } @@ -121,6 +121,7 @@ bool BoUpSLP::vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold) { DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); if (Cost < CostThreshold) { DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); + Builder.SetInsertPoint(getInsertionPoint(getLastIndex(Operands,VF))); vectorizeTree(Operands, VF); i += VF - 1; Changed = true; @@ -131,7 +132,7 @@ bool BoUpSLP::vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold) { } bool BoUpSLP::vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold) { - ValueSet Heads, Tails; + SetVector<Value*> Heads, Tails; SmallDenseMap<Value*, Value*> ConsecutiveChain; // We may run into multiple chains that merge into a single chain. We mark the @@ -152,7 +153,8 @@ bool BoUpSLP::vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold) { } // For stores that start but don't end a link in the chain: - for (ValueSet::iterator it = Heads.begin(), e = Heads.end();it != e; ++it) { + for (SetVector<Value*>::iterator it = Heads.begin(), e = Heads.end(); + it != e; ++it) { if (Tails.count(*it)) continue; // We found a store instr that starts a chain. 
Now follow the chain and try @@ -224,9 +226,14 @@ Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) { } void BoUpSLP::vectorizeArith(ArrayRef<Value *> Operands) { + int LastIdx = getLastIndex(Operands, Operands.size()); + Instruction *Loc = getInsertionPoint(LastIdx); + Builder.SetInsertPoint(Loc); + + assert(getFirstUserIndex(Operands, Operands.size()) > LastIdx && + "Vectorizing with in-tree users"); + Value *Vec = vectorizeTree(Operands, Operands.size()); - BasicBlock::iterator Loc = cast<Instruction>(Vec); - IRBuilder<> Builder(++Loc); // After vectorizing the operands we need to generate extractelement // instructions and replace all of the uses of the scalar values with // the values that we extracted from the vectorized tree. @@ -243,6 +250,16 @@ int BoUpSLP::getTreeCost(ArrayRef<Value *> VL) { LaneMap.clear(); MultiUserVals.clear(); MustScalarize.clear(); + MustExtract.clear(); + + // Find the location of the last root. + int LastRootIndex = getLastIndex(VL, VL.size()); + int FirstUserIndex = getFirstUserIndex(VL, VL.size()); + + // Don't vectorize if there are users of the tree roots inside the tree + // itself. + if (LastRootIndex > FirstUserIndex) + return max_cost; // Scan the tree and find which value is used by which lane, and which values // must be scalarized. @@ -250,7 +267,7 @@ int BoUpSLP::getTreeCost(ArrayRef<Value *> VL) { // Check that instructions with multiple users can be vectorized. Mark unsafe // instructions. - for (ValueSet::iterator it = MultiUserVals.begin(), + for (SetVector<Value*>::iterator it = MultiUserVals.begin(), e = MultiUserVals.end(); it != e; ++it) { // Check that all of the users of this instr are within the tree // and that they are all from the same lane. @@ -258,15 +275,35 @@ int BoUpSLP::getTreeCost(ArrayRef<Value *> VL) { for (Value::use_iterator I = (*it)->use_begin(), E = (*it)->use_end(); I != E; ++I) { if (LaneMap.find(*I) == LaneMap.end()) { - MustScalarize.insert(*it); - DEBUG(dbgs()<<"SLP: Adding " << **it << - " to MustScalarize because of an out of tree usage.\n"); - break; + DEBUG(dbgs()<<"SLP: Instr " << **it << " has multiple users.\n"); + + // We don't have an ordering problem if the user is not in this basic + // block. + Instruction *Inst = cast<Instruction>(*I); + if (Inst->getParent() != BB) { + MustExtract.insert(*it); + continue; + } + + // We don't have an ordering problem if the user is after the last root. + int Idx = InstrIdx[Inst]; + if (Idx < LastRootIndex) { + MustScalarize.insert(*it); + DEBUG(dbgs()<<"SLP: Adding to MustScalarize " + "because of an unsafe out of tree usage.\n"); + break; + } + + + DEBUG(dbgs()<<"SLP: Adding to MustExtract " + "because of a safe out of tree usage.\n"); + MustExtract.insert(*it); + continue; } if (Lane == -1) Lane = LaneMap[*I]; if (Lane != LaneMap[*I]) { MustScalarize.insert(*it); - DEBUG(dbgs()<<"Adding " << **it << + DEBUG(dbgs()<<"SLP: Adding " << **it << " to MustScalarize because multiple lane use it: " << Lane << " and " << LaneMap[*I] << ".\n"); break; @@ -311,14 +348,6 @@ void BoUpSLP::getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth) { if (!I || Opcode != I->getOpcode()) return; } - // Mark instructions with multiple users. - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - Instruction *I = dyn_cast<Instruction>(VL[i]); - // Remember to check if all of the users of this instr are vectorized - // within our tree. 
- if (I && I->getNumUses() > 1) MultiUserVals.insert(I); - } - for (int i = 0, e = VL.size(); i < e; ++i) { // Check that the instruction is only used within // one lane. @@ -327,6 +356,19 @@ void BoUpSLP::getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth) { LaneMap[VL[i]] = i; } + // Mark instructions with multiple users. + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + Instruction *I = dyn_cast<Instruction>(VL[i]); + // Remember to check if all of the users of this instr are vectorized + // within our tree. At depth zero we have no local users, only external + // users that we don't care about. + if (Depth && I && I->getNumUses() > 1) { + DEBUG(dbgs()<<"SLP: Adding to MultiUserVals " + "because it has multiple users:" << *I << " \n"); + MultiUserVals.insert(I); + } + } + switch (Opcode) { case Instruction::ZExt: case Instruction::SExt: @@ -440,11 +482,9 @@ int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) { // Check if it is safe to sink the loads or the stores. if (Opcode == Instruction::Load || Opcode == Instruction::Store) { - int MaxIdx = InstrIdx[VL0]; - for (unsigned i = 1, e = VL.size(); i < e; ++i ) - MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]); - + int MaxIdx = getLastIndex(VL, VL.size()); Instruction *Last = InstrVec[MaxIdx]; + for (unsigned i = 0, e = VL.size(); i < e; ++i ) { if (VL[i] == Last) continue; Value *Barrier = isUnsafeToSink(cast<Instruction>(VL[i]), Last); @@ -456,6 +496,13 @@ int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) { } } + // Calculate the extract cost. + unsigned ExternalUserExtractCost = 0; + for (unsigned i = 0, e = VL.size(); i < e; ++i) + if (MustExtract.count(VL[i])) + ExternalUserExtractCost += + TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); + switch (Opcode) { case Instruction::ZExt: case Instruction::SExt: @@ -469,7 +516,7 @@ int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) { case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - int Cost = 0; + int Cost = ExternalUserExtractCost; ValueList Operands; Type *SrcTy = VL0->getOperand(0)->getType(); // Prepare the operand vector. @@ -510,7 +557,7 @@ int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) { case Instruction::And: case Instruction::Or: case Instruction::Xor: { - int Cost = 0; + int Cost = ExternalUserExtractCost; // Calculate the cost of all of the operands. for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; @@ -540,7 +587,7 @@ int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) { int ScalarLdCost = VecTy->getNumElements() * TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); - return VecLdCost - ScalarLdCost; + return VecLdCost - ScalarLdCost + ExternalUserExtractCost; } case Instruction::Store: { // We know that we can merge the stores. Calculate the cost. @@ -556,7 +603,7 @@ int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) { } int TotalCost = StoreCost + getTreeCost_rec(Operands, Depth + 1); - return TotalCost; + return TotalCost + ExternalUserExtractCost; } default: // Unable to vectorize unknown instructions. 
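The new ExternalUserExtractCost term prices in the extractelement instructions that out-of-tree users will need, instead of giving up whenever a tree value escapes. The MustScalarize/MustExtract split in source terms (hypothetical shape):

// t0 and t1 form a 2-wide SLP tree rooted at the stores. The extra use
// of t0 comes after the last root, so it can be served by an
// extractelement (MustExtract) whose TTI cost is charged to the tree;
// a use between the roots would still force MustScalarize.
void treeWithEscape(int *a, int *b, int *c, int &other) {
  int t0 = a[0] + b[0];
  int t1 = a[1] + b[1];
  c[0] = t0;
  c[1] = t1;
  other = t0 * 3; // out-of-tree user after the last root
}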
@@ -564,15 +611,40 @@ int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) { } } -Instruction *BoUpSLP::GetLastInstr(ArrayRef<Value *> VL, unsigned VF) { +int BoUpSLP::getLastIndex(ArrayRef<Value *> VL, unsigned VF) { int MaxIdx = InstrIdx[BB->getFirstNonPHI()]; for (unsigned i = 0; i < VF; ++i ) MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]); - return InstrVec[MaxIdx + 1]; + return MaxIdx; +} + +int BoUpSLP::getFirstUserIndex(ArrayRef<Value *> VL, unsigned VF) { + // Find the first user of the values. + int FirstUser = InstrVec.size(); + for (unsigned i = 0; i < VF; ++i) { + for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end(); + U != UE; ++U) { + Instruction *Instr = dyn_cast<Instruction>(*U); + if (!Instr || Instr->getParent() != BB) + continue; + + FirstUser = std::min(FirstUser, InstrIdx[Instr]); + } + } + return FirstUser; +} + +int BoUpSLP::getLastIndex(Instruction *I, Instruction *J) { + assert(I->getParent() == BB && "Invalid parent for instruction I"); + assert(J->getParent() == BB && "Invalid parent for instruction J"); + return std::max(InstrIdx[I], InstrIdx[J]); +} + +Instruction *BoUpSLP::getInsertionPoint(unsigned Index) { + return InstrVec[Index + 1]; } Value *BoUpSLP::Scalarize(ArrayRef<Value *> VL, VectorType *Ty) { - IRBuilder<> Builder(GetLastInstr(VL, Ty->getNumElements())); Value *Vec = UndefValue::get(Ty); for (unsigned i=0; i < Ty->getNumElements(); ++i) { // Generate the 'InsertElement' instruction. @@ -583,15 +655,51 @@ Value *BoUpSLP::Scalarize(ArrayRef<Value *> VL, VectorType *Ty) { GatherInstructions.push_back(Vec); } + for (unsigned i = 0; i < Ty->getNumElements(); ++i) + VectorizedValues[VL[i]] = Vec; + return Vec; } Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int VF) { Value *V = vectorizeTree_rec(VL, VF); + + int LastInstrIdx = getLastIndex(VL, VL.size()); + for (SetVector<Value*>::iterator it = MustExtract.begin(), + e = MustExtract.end(); it != e; ++it) { + Instruction *I = cast<Instruction>(*it); + + // This is a scalarized value, so we can use the original value. + // No need to extract from the vector. + if (!LaneMap.count(I)) + continue; + + Value *Vec = VectorizedValues[I]; + // We decided not to vectorize I because one of its users was not + // vectorized. This is okay. + if (!Vec) + continue; + + Value *Idx = Builder.getInt32(LaneMap[I]); + Value *Extract = Builder.CreateExtractElement(Vec, Idx); + bool Replaced = false; + for (Value::use_iterator U = I->use_begin(), UE = I->use_end(); U != UE; + ++U) { + Instruction *UI = cast<Instruction>(*U); + if (UI->getParent() != I->getParent() || InstrIdx[UI] > LastInstrIdx) { + UI->replaceUsesOfWith(I, Extract); + Replaced = true; + } + } + assert(Replaced && "Must replace at least one outside user"); + (void)Replaced; + } + // We moved some instructions around. We have to number them again // before we can do any analysis. numberInstructions(); MustScalarize.clear(); + MustExtract.clear(); + VectorizedValues.clear(); return V; } @@ -614,19 +722,27 @@ Value *BoUpSLP::vectorizeTree_rec(ArrayRef<Value *> VL, int VF) { } // Check that this is a simple vector constant. - if (AllConst || AllSameScalar) return Scalarize(VL, VecTy); + if (AllConst || AllSameScalar) + return Scalarize(VL, VecTy); // Scalarize unknown structures.
Instruction *VL0 = dyn_cast<Instruction>(VL[0]); - if (!VL0) return Scalarize(VL, VecTy); + if (!VL0) + return Scalarize(VL, VecTy); - if (VectorizedValues.count(VL0)) return VectorizedValues[VL0]; + if (VectorizedValues.count(VL0)) { + Value * Vec = VectorizedValues[VL0]; + for (int i = 0; i < VF; ++i) + VectorizedValues[VL[i]] = Vec; + return Vec; + } unsigned Opcode = VL0->getOpcode(); for (unsigned i = 0, e = VF; i < e; ++i) { Instruction *I = dyn_cast<Instruction>(VL[i]); // If not all of the instructions are identical then we have to scalarize. - if (!I || Opcode != I->getOpcode()) return Scalarize(VL, VecTy); + if (!I || Opcode != I->getOpcode()) + return Scalarize(VL, VecTy); } switch (Opcode) { @@ -646,10 +762,12 @@ Value *BoUpSLP::vectorizeTree_rec(ArrayRef<Value *> VL, int VF) { for (int i = 0; i < VF; ++i) INVL.push_back(cast<Instruction>(VL[i])->getOperand(0)); Value *InVec = vectorizeTree_rec(INVL, VF); - IRBuilder<> Builder(GetLastInstr(VL, VF)); CastInst *CI = dyn_cast<CastInst>(VL0); Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); - VectorizedValues[VL0] = V; + + for (int i = 0; i < VF; ++i) + VectorizedValues[VL[i]] = V; + return V; } case Instruction::Add: @@ -672,16 +790,18 @@ Value *BoUpSLP::vectorizeTree_rec(ArrayRef<Value *> VL, int VF) { case Instruction::Xor: { ValueList LHSVL, RHSVL; for (int i = 0; i < VF; ++i) { - RHSVL.push_back(cast<Instruction>(VL[i])->getOperand(0)); - LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1)); + LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(0)); + RHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1)); } - Value *RHS = vectorizeTree_rec(RHSVL, VF); Value *LHS = vectorizeTree_rec(LHSVL, VF); - IRBuilder<> Builder(GetLastInstr(VL, VF)); + Value *RHS = vectorizeTree_rec(RHSVL, VF); BinaryOperator *BinOp = cast<BinaryOperator>(VL0); - Value *V = Builder.CreateBinOp(BinOp->getOpcode(), RHS,LHS); - VectorizedValues[VL0] = V; + Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS,RHS); + + for (int i = 0; i < VF; ++i) + VectorizedValues[VL[i]] = V; + return V; } case Instruction::Load: { @@ -693,12 +813,18 @@ Value *BoUpSLP::vectorizeTree_rec(ArrayRef<Value *> VL, int VF) { if (!isConsecutiveAccess(VL[i-1], VL[i])) return Scalarize(VL, VecTy); - IRBuilder<> Builder(GetLastInstr(VL, VF)); - Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(), - VecTy->getPointerTo()); - LI = Builder.CreateLoad(VecPtr); + // Loads are inserted at the head of the tree because we don't want to sink + // them all the way down past store instructions. 
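The comment above is the key scheduling constraint for vectorized loads; a sketch of why sinking them is unsafe (hypothetical shape, assuming the pointers may alias):

// If the <2 x i32> load of a[0..1] were emitted at the bottom of the
// tree, it would be sunk past stores that may alias it and could read
// the wrong values.
void loadBeforeStores(int *a, int *b) {
  int t0 = a[0];
  int t1 = a[1];
  b[0] = 0; // may alias a[0..1]
  b[1] = 1;
  a[2] = t0 + t1;
}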
+ Instruction *Loc = getInsertionPoint(getLastIndex(VL, VL.size())); + IRBuilder<> LoadBuilder(Loc); + Value *VecPtr = LoadBuilder.CreateBitCast(LI->getPointerOperand(), + VecTy->getPointerTo()); + LI = LoadBuilder.CreateLoad(VecPtr); LI->setAlignment(Alignment); - VectorizedValues[VL0] = LI; + + for (int i = 0; i < VF; ++i) + VectorizedValues[VL[i]] = LI; + return LI; } case Instruction::Store: { ValueOp.push_back(cast<StoreInst>(VL[i])->getValueOperand()); Value *VecValue = vectorizeTree_rec(ValueOp, VF); - - IRBuilder<> Builder(GetLastInstr(VL, VF)); Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo()); Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment); @@ -721,9 +845,7 @@ Value *BoUpSLP::vectorizeTree_rec(ArrayRef<Value *> VL, int VF) { return 0; } default: - Value *S = Scalarize(VL, VecTy); - VectorizedValues[VL0] = S; - return S; + return Scalarize(VL, VecTy); } } diff --git a/lib/Transforms/Vectorize/VecUtils.h b/lib/Transforms/Vectorize/VecUtils.h index 5456c6c..d41d2ed 100644 --- a/lib/Transforms/Vectorize/VecUtils.h +++ b/lib/Transforms/Vectorize/VecUtils.h @@ -16,9 +16,11 @@ #define LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/IR/IRBuilder.h" #include <vector> namespace llvm { @@ -107,9 +109,19 @@ private: /// \returns the pointer to the barrier instruction if we can't sink. Value *isUnsafeToSink(Instruction *Src, Instruction *Dst); - /// \returns the instruction that appears last in the BB from \p VL. + /// \returns the index of the last instruction in the BB from \p VL. /// Only consider the first \p VF elements. - Instruction *GetLastInstr(ArrayRef<Value *> VL, unsigned VF); + int getLastIndex(ArrayRef<Value *> VL, unsigned VF); + + /// \returns the index of the first user of \p VL. + /// Only consider the first \p VF elements. + int getFirstUserIndex(ArrayRef<Value *> VL, unsigned VF); + + /// \returns the index of the instruction \p I or \p J that appears last in the BB. + int getLastIndex(Instruction *I, Instruction *J); + + /// \returns the insertion point for \p Index. + Instruction *getInsertionPoint(unsigned Index); /// \returns a vector from a collection of scalars in \p VL. Value *Scalarize(ArrayRef<Value *> VL, VectorType *Ty); @@ -127,15 +139,20 @@ private: /// NOTICE: The vectorization methods also use this set. ValueSet MustScalarize; + /// Contains values that have users outside of the vectorized graph. + /// We need to generate extract instructions for these values. + /// NOTICE: The vectorization methods also use this set. + SetVector<Value*> MustExtract; + /// Contains a list of values that are used outside the current tree. This /// set must be reset between runs. - ValueSet MultiUserVals; + SetVector<Value*> MultiUserVals; /// Maps values in the tree to the vector lanes that uses them. This map must /// be reset between runs of getCost. std::map<Value*, int> LaneMap; /// A list of instructions to ignore while sinking /// memory instructions. This map must be reset between runs of getCost. - SmallPtrSet<Value *, 8> MemBarrierIgnoreList; + ValueSet MemBarrierIgnoreList; // -- Containers that are used during vectorizeTree -- // @@ -150,6 +167,9 @@ private: /// Iterating over this list is faster than calling LICM.
ValueList GatherInstructions; + /// Instruction builder to construct the vectorized tree. + IRBuilder<> Builder; + // Analysis and block reference. BasicBlock *BB; ScalarEvolution *SE;
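getFirstUserIndex, declared above, implements the getTreeCost guard seen earlier: if some user of a tree root sits before the last root, fusing the roots into a single vector instruction at the insertion point would place that definition after its use. In source terms (hypothetical shape):

// The use of t0 sits between the two roots, so FirstUserIndex is less
// than LastRootIndex and getTreeCost bails out with max_cost.
void inTreeUser(int *a, int *b, int *c) {
  int t0 = a[0] + b[0];
  c[2] = t0 * 2; // user of t0 before the second root
  int t1 = a[1] + b[1];
  c[0] = t0;
  c[1] = t1;
}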
