52 files changed, 4803 insertions, 1806 deletions
diff --git a/lib/Transforms/IPO/ConstantMerge.cpp b/lib/Transforms/IPO/ConstantMerge.cpp
index 8336d3a..b63495b 100644
--- a/lib/Transforms/IPO/ConstantMerge.cpp
+++ b/lib/Transforms/IPO/ConstantMerge.cpp
@@ -66,9 +66,8 @@ ModulePass *llvm::createConstantMergePass() { return new ConstantMerge(); }
 static void FindUsedValues(GlobalVariable *LLVMUsed,
                            SmallPtrSet<const GlobalValue*, 8> &UsedValues) {
   if (LLVMUsed == 0) return;
-  ConstantArray *Inits = dyn_cast<ConstantArray>(LLVMUsed->getInitializer());
-  if (Inits == 0) return;
-  
+  ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer());
+
   for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i)
     if (GlobalValue *GV = 
         dyn_cast<GlobalValue>(Inits->getOperand(i)->stripPointerCasts()))
diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp
index a75212a..bc5109b 100644
--- a/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -1,4 +1,4 @@
-//===- FunctionAttrs.cpp - Pass which marks functions readnone or readonly ===//
+//===- FunctionAttrs.cpp - Pass which marks functions attributes ----------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -14,6 +14,8 @@
 // to the function does not create any copies of the pointer value that
 // outlive the call.  This more or less means that the pointer is only
 // dereferenced, and not returned from the function or stored in a global.
+// Finally, well-known library call declarations are marked with all
+// attributes that are consistent with the function's standard definition.
 // This pass is implemented as a bottom-up traversal of the call-graph.
 //
 //===----------------------------------------------------------------------===//
@@ -32,12 +34,14 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/Support/InstIterator.h"
+#include "llvm/Target/TargetLibraryInfo.h"
 using namespace llvm;
 
 STATISTIC(NumReadNone, "Number of functions marked readnone");
 STATISTIC(NumReadOnly, "Number of functions marked readonly");
 STATISTIC(NumNoCapture, "Number of arguments marked nocapture");
 STATISTIC(NumNoAlias, "Number of function returns marked noalias");
+STATISTIC(NumAnnotated, "Number of attributes added to library functions");
 
 namespace {
   struct FunctionAttrs : public CallGraphSCCPass {
@@ -62,14 +66,63 @@ namespace {
     // AddNoAliasAttrs - Deduce noalias attributes for the SCC.
     bool AddNoAliasAttrs(const CallGraphSCC &SCC);
 
+    // Utility methods used by inferPrototypeAttributes to add attributes
+    // and maintain annotation statistics.
+
+    void setDoesNotAccessMemory(Function &F) {
+      if (!F.doesNotAccessMemory()) {
+	F.setDoesNotAccessMemory();
+	++NumAnnotated;
+      }
+    }
+
+    void setOnlyReadsMemory(Function &F) {
+      if (!F.onlyReadsMemory()) {
+	F.setOnlyReadsMemory();
+	++NumAnnotated;
+      }
+    }
+
+    void setDoesNotThrow(Function &F) {
+      if (!F.doesNotThrow()) {
+	F.setDoesNotThrow();
+	++NumAnnotated;
+      }
+    }
+
+    void setDoesNotCapture(Function &F, unsigned n) {
+      if (!F.doesNotCapture(n)) {
+	F.setDoesNotCapture(n);
+	++NumAnnotated;
+      }
+    }
+
+    void setDoesNotAlias(Function &F, unsigned n) {
+      if (!F.doesNotAlias(n)) {
+	F.setDoesNotAlias(n);
+	++NumAnnotated;
+      }
+    }
+
+    // inferPrototypeAttributes - Analyze the name and prototype of the
+    // given function and set any applicable attributes.  Returns true
+    // if any attributes were set and false otherwise.
+    bool inferPrototypeAttributes(Function &F);
+
+    // annotateLibraryCalls - Adds attributes to well-known standard library
+    // call declarations.
+    bool annotateLibraryCalls(const CallGraphSCC &SCC);
+
     virtual void getAnalysisUsage(AnalysisUsage &AU) const {
       AU.setPreservesCFG();
       AU.addRequired<AliasAnalysis>();
+      AU.addRequired<TargetLibraryInfo>();
       CallGraphSCCPass::getAnalysisUsage(AU);
     }
 
   private:
     AliasAnalysis *AA;
+    TargetLibraryInfo *TLI;
   };
 }
 
@@ -77,6 +130,7 @@ char FunctionAttrs::ID = 0;
 INITIALIZE_PASS_BEGIN(FunctionAttrs, "functionattrs",
                 "Deduce function attributes", false, false)
 INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
 INITIALIZE_PASS_END(FunctionAttrs, "functionattrs",
                 "Deduce function attributes", false, false)
 
@@ -598,10 +652,693 @@ bool FunctionAttrs::AddNoAliasAttrs(const CallGraphSCC &SCC) {
   return MadeChange;
 }
 
+/// inferPrototypeAttributes - Analyze the name and prototype of the
+/// given function and set any applicable attributes.  Returns true
+/// if any attributes were set and false otherwise.
+bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
+  FunctionType *FTy = F.getFunctionType();
+  LibFunc::Func TheLibFunc;
+  if (!(TLI->getLibFunc(F.getName(), TheLibFunc) && TLI->has(TheLibFunc)))
+    return false;
+
+  switch (TheLibFunc) {
+  case LibFunc::strlen:
+    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setOnlyReadsMemory(F);
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::strchr:
+  case LibFunc::strrchr:
+    if (FTy->getNumParams() != 2 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isIntegerTy())
+      return false;
+    setOnlyReadsMemory(F);
+    setDoesNotThrow(F);
+    break;
+  case LibFunc::strcpy:
+  case LibFunc::stpcpy:
+  case LibFunc::strcat:
+  case LibFunc::strtol:
+  case LibFunc::strtod:
+  case LibFunc::strtof:
+  case LibFunc::strtoul:
+  case LibFunc::strtoll:
+  case LibFunc::strtold:
+  case LibFunc::strncat:
+  case LibFunc::strncpy:
+  case LibFunc::stpncpy:
+  case LibFunc::strtoull:
+    if (FTy->getNumParams() < 2 ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::strxfrm:
+    if (FTy->getNumParams() != 3 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::strcmp:
+  case LibFunc::strspn:
+  case LibFunc::strncmp:
+  case LibFunc::strcspn:
+  case LibFunc::strcoll:
+  case LibFunc::strcasecmp:
+  case LibFunc::strncasecmp:
+    if (FTy->getNumParams() < 2 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setOnlyReadsMemory(F);
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::strstr:
+  case LibFunc::strpbrk:
+    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setOnlyReadsMemory(F);
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::strtok:
+  case LibFunc::strtok_r:
+    if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::scanf:
+  case LibFunc::setbuf:
+  case LibFunc::setvbuf:
+    if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::strdup:
+  case LibFunc::strndup:
+    if (FTy->getNumParams() < 1 || !FTy->getReturnType()->isPointerTy() ||
+        !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotAlias(F, 0);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::stat:
+  case LibFunc::sscanf:
+  case LibFunc::sprintf:
+  case LibFunc::statvfs:
+    if (FTy->getNumParams() < 2 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::snprintf:
+    if (FTy->getNumParams() != 3 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(2)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 3);
+    break;
+  case LibFunc::setitimer:
+    if (FTy->getNumParams() != 3 ||
+        !FTy->getParamType(1)->isPointerTy() ||
+        !FTy->getParamType(2)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 2);
+    setDoesNotCapture(F, 3);
+    break;
+  case LibFunc::system:
+    if (FTy->getNumParams() != 1 ||
+        !FTy->getParamType(0)->isPointerTy())
+      return false;
+    // May throw; "system" is a valid pthread cancellation point.
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::malloc:
+    if (FTy->getNumParams() != 1 ||
+        !FTy->getReturnType()->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotAlias(F, 0);
+    break;
+  case LibFunc::memcmp:
+    if (FTy->getNumParams() != 3 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setOnlyReadsMemory(F);
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::memchr:
+  case LibFunc::memrchr:
+    if (FTy->getNumParams() != 3)
+      return false;
+    setOnlyReadsMemory(F);
+    setDoesNotThrow(F);
+    break;
+  case LibFunc::modf:
+  case LibFunc::modff:
+  case LibFunc::modfl:
+  case LibFunc::memcpy:
+  case LibFunc::memccpy:
+  case LibFunc::memmove:
+    if (FTy->getNumParams() < 2 ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::memalign:
+    if (!FTy->getReturnType()->isPointerTy())
+      return false;
+    setDoesNotAlias(F, 0);
+    break;
+  case LibFunc::mkdir:
+  case LibFunc::mktime:
+    if (FTy->getNumParams() == 0 ||
+        !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::realloc:
+    if (FTy->getNumParams() != 2 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getReturnType()->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotAlias(F, 0);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::read:
+    if (FTy->getNumParams() != 3 ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    // May throw; "read" is a valid pthread cancellation point.
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::rmdir:
+  case LibFunc::rewind:
+  case LibFunc::remove:
+  case LibFunc::realpath:
+    if (FTy->getNumParams() < 1 ||
+        !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::rename:
+  case LibFunc::readlink:
+    if (FTy->getNumParams() < 2 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::write:
+    if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy())
+      return false;
+    // May throw; "write" is a valid pthread cancellation point.
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::bcopy:
+    if (FTy->getNumParams() != 3 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::bcmp:
+    if (FTy->getNumParams() != 3 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setOnlyReadsMemory(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::bzero:
+    if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::calloc:
+    if (FTy->getNumParams() != 2 ||
+        !FTy->getReturnType()->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotAlias(F, 0);
+    break;
+  case LibFunc::chmod:
+  case LibFunc::chown:
+  case LibFunc::ctermid:
+  case LibFunc::clearerr:
+  case LibFunc::closedir:
+    if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::atoi:
+  case LibFunc::atol:
+  case LibFunc::atof:
+  case LibFunc::atoll:
+    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setOnlyReadsMemory(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::access:
+    if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::fopen:
+    if (FTy->getNumParams() != 2 ||
+        !FTy->getReturnType()->isPointerTy() ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotAlias(F, 0);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::fdopen:
+    if (FTy->getNumParams() != 2 ||
+        !FTy->getReturnType()->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotAlias(F, 0);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::feof:
+  case LibFunc::free:
+  case LibFunc::fseek:
+  case LibFunc::ftell:
+  case LibFunc::fgetc:
+  case LibFunc::fseeko:
+  case LibFunc::ftello:
+  case LibFunc::fileno:
+  case LibFunc::fflush:
+  case LibFunc::fclose:
+  case LibFunc::fsetpos:
+  case LibFunc::flockfile:
+  case LibFunc::funlockfile:
+  case LibFunc::ftrylockfile:
+    if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::ferror:
+    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F);
+    break;
+  case LibFunc::fputc:
+  case LibFunc::fstat:
+  case LibFunc::frexp:
+  case LibFunc::frexpf:
+  case LibFunc::frexpl:
+  case LibFunc::fstatvfs:
+    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::fgets:
+    if (FTy->getNumParams() != 3 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(2)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 3);
+  case LibFunc::fread:
+  case LibFunc::fwrite:
+    if (FTy->getNumParams() != 4 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(3)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 4);
+  case LibFunc::fputs:
+  case LibFunc::fscanf:
+  case LibFunc::fprintf:
+  case LibFunc::fgetpos:
+    if (FTy->getNumParams() < 2 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::getc:
+  case LibFunc::getlogin_r:
+  case LibFunc::getc_unlocked:
+    if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::getenv:
+    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setOnlyReadsMemory(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::gets:
+  case LibFunc::getchar:
+    setDoesNotThrow(F);
+    break;
+  case LibFunc::getitimer:
+    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::getpwnam:
+    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::ungetc:
+    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::uname:
+  case LibFunc::unlink:
+  case LibFunc::unsetenv:
+    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::utime:
+  case LibFunc::utimes:
+    if (FTy->getNumParams() != 2 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::putc:
+    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::puts:
+  case LibFunc::printf:
+  case LibFunc::perror:
+    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::pread:
+  case LibFunc::pwrite:
+    if (FTy->getNumParams() != 4 || !FTy->getParamType(1)->isPointerTy())
+      return false;
+    // May throw; these are valid pthread cancellation points.
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::putchar:
+    setDoesNotThrow(F);
+    break;
+  case LibFunc::popen:
+    if (FTy->getNumParams() != 2 ||
+        !FTy->getReturnType()->isPointerTy() ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotAlias(F, 0);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::pclose:
+    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::vscanf:
+    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::vsscanf:
+  case LibFunc::vfscanf:
+    if (FTy->getNumParams() != 3 ||
+        !FTy->getParamType(1)->isPointerTy() ||
+        !FTy->getParamType(2)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::valloc:
+    if (!FTy->getReturnType()->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotAlias(F, 0);
+    break;
+  case LibFunc::vprintf:
+    if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::vfprintf:
+  case LibFunc::vsprintf:
+    if (FTy->getNumParams() != 3 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::vsnprintf:
+    if (FTy->getNumParams() != 4 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(2)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 3);
+    break;
+  case LibFunc::open:
+    if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    // May throw; "open" is a valid pthread cancellation point.
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::opendir:
+    if (FTy->getNumParams() != 1 ||
+        !FTy->getReturnType()->isPointerTy() ||
+        !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotAlias(F, 0);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::tmpfile:
+    if (!FTy->getReturnType()->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotAlias(F, 0);
+    break;
+  case LibFunc::times:
+    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::htonl:
+  case LibFunc::htons:
+  case LibFunc::ntohl:
+  case LibFunc::ntohs:
+    setDoesNotThrow(F);
+    setDoesNotAccessMemory(F);
+    break;
+  case LibFunc::lstat:
+    if (FTy->getNumParams() != 2 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::lchown:
+    if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::qsort:
+    if (FTy->getNumParams() != 4 || !FTy->getParamType(3)->isPointerTy())
+      return false;
+    // May throw; places call through function pointer.
+    setDoesNotCapture(F, 4);
+    break;
+  case LibFunc::dunder_strdup:
+  case LibFunc::dunder_strndup:
+    if (FTy->getNumParams() < 1 ||
+        !FTy->getReturnType()->isPointerTy() ||
+        !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotAlias(F, 0);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::dunder_strtok_r:
+    if (FTy->getNumParams() != 3 ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::under_IO_getc:
+    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::under_IO_putc:
+    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::dunder_isoc99_scanf:
+    if (FTy->getNumParams() < 1 ||
+        !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::stat64:
+  case LibFunc::lstat64:
+  case LibFunc::statvfs64:
+  case LibFunc::dunder_isoc99_sscanf:
+    if (FTy->getNumParams() < 1 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::fopen64:
+    if (FTy->getNumParams() != 2 ||
+        !FTy->getReturnType()->isPointerTy() ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotAlias(F, 0);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::fseeko64:
+  case LibFunc::ftello64:
+    if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::tmpfile64:
+    if (!FTy->getReturnType()->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotAlias(F, 0);
+    break;
+  case LibFunc::fstat64:
+  case LibFunc::fstatvfs64:
+    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::open64:
+    if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    // May throw; "open" is a valid pthread cancellation point.
+    setDoesNotCapture(F, 1);
+    break;
+  default:
+    // Didn't mark any attributes.
+    return false;
+  }
+
+  return true;
+}
+
+/// annotateLibraryCalls - Adds attributes to well-known standard library
+/// call declarations.
+bool FunctionAttrs::annotateLibraryCalls(const CallGraphSCC &SCC) {
+  bool MadeChange = false;
+
+  // Check each function in turn annotating well-known library function
+  // declarations with attributes.
+  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
+    Function *F = (*I)->getFunction();
+
+    if (F != 0 && F->isDeclaration())
+      MadeChange |= inferPrototypeAttributes(*F);
+  }
+
+  return MadeChange;
+}
+
 bool FunctionAttrs::runOnSCC(CallGraphSCC &SCC) {
   AA = &getAnalysis<AliasAnalysis>();
+  TLI = &getAnalysis<TargetLibraryInfo>();
 
-  bool Changed = AddReadAttrs(SCC);
+  bool Changed = annotateLibraryCalls(SCC);
+  Changed |= AddReadAttrs(SCC);
   Changed |= AddNoCaptureAttrs(SCC);
   Changed |= AddNoAliasAttrs(SCC);
   return Changed;
diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp
index dc99492..201f320 100644
--- a/lib/Transforms/IPO/GlobalDCE.cpp
+++ b/lib/Transforms/IPO/GlobalDCE.cpp
@@ -42,6 +42,7 @@ namespace {
 
   private:
     SmallPtrSet<GlobalValue*, 32> AliveGlobals;
+    SmallPtrSet<Constant *, 8> SeenConstants;
 
     /// GlobalIsNeeded - mark the specific global value as needed, and
     /// recursively mark anything that it uses as also needed.
@@ -151,6 +152,7 @@ bool GlobalDCE::runOnModule(Module &M) {
 
   // Make sure that all memory is released
   AliveGlobals.clear();
+  SeenConstants.clear();
 
   return Changed;
 }
@@ -190,12 +192,15 @@ void GlobalDCE::GlobalIsNeeded(GlobalValue *G) {
 void GlobalDCE::MarkUsedGlobalsAsNeeded(Constant *C) {
   if (GlobalValue *GV = dyn_cast<GlobalValue>(C))
     return GlobalIsNeeded(GV);
-  
+
   // Loop over all of the operands of the constant, adding any globals they
   // use to the list of needed globals.
-  for (User::op_iterator I = C->op_begin(), E = C->op_end(); I != E; ++I)
-    if (Constant *OpC = dyn_cast<Constant>(*I))
-      MarkUsedGlobalsAsNeeded(OpC);
+  for (User::op_iterator I = C->op_begin(), E = C->op_end(); I != E; ++I) {
+    // If we've already processed this constant there's no need to do it again.
+    Constant *Op = dyn_cast<Constant>(*I);
+    if (Op && SeenConstants.insert(Op))
+      MarkUsedGlobalsAsNeeded(Op);
+  }
 }
 
 // RemoveUnusedGlobalValue - Loop over all of the uses of the specified
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index 2b9d667..b035a82 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -470,8 +470,9 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV,
 static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
                                        DataLayout *TD, TargetLibraryInfo *TLI) {
   bool Changed = false;
-  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;) {
-    User *U = *UI++;
+  SmallVector<User*, 8> WorkList(V->use_begin(), V->use_end());
+  while (!WorkList.empty()) {
+    User *U = WorkList.pop_back_val();
 
     if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
       if (Init) {
@@ -534,7 +535,6 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
       // us, and if they are all dead, nuke them without remorse.
       if (SafeToDestroyConstant(C)) {
         C->destroyConstant();
-        // This could have invalidated UI, start over from scratch.
         CleanupConstantGlobalUsers(V, Init, TD, TLI);
         return true;
       }
diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp
index 892100f..4ce749c 100644
--- a/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/lib/Transforms/IPO/MergeFunctions.cpp
@@ -72,6 +72,15 @@ STATISTIC(NumThunksWritten, "Number of thunks generated");
 STATISTIC(NumAliasesWritten, "Number of aliases generated");
 STATISTIC(NumDoubleWeak, "Number of new functions created");
 
+/// Returns the type id for a type to be hashed. We turn pointer types into
+/// integers here because the actual compare logic below considers pointers and
+/// integers of the same size as equal.
+static Type::TypeID getTypeIDForHash(Type *Ty) {
+  if (Ty->isPointerTy())
+    return Type::IntegerTyID;
+  return Ty->getTypeID();
+}
+
 /// Creates a hash-code for the function which is the same for any two
 /// functions that will compare equal, without looking at the instructions
 /// inside the function.
@@ -83,9 +92,9 @@ static unsigned profileFunction(const Function *F) {
   ID.AddInteger(F->getCallingConv());
   ID.AddBoolean(F->hasGC());
   ID.AddBoolean(FTy->isVarArg());
-  ID.AddInteger(FTy->getReturnType()->getTypeID());
+  ID.AddInteger(getTypeIDForHash(FTy->getReturnType()));
   for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i)
-    ID.AddInteger(FTy->getParamType(i)->getTypeID());
+    ID.AddInteger(getTypeIDForHash(FTy->getParamType(i)));
   return ID.ComputeHash();
 }
 
@@ -200,8 +209,7 @@ private:
 
 // Any two pointers in the same address space are equivalent, intptr_t and
 // pointers are equivalent. Otherwise, standard type equivalence rules apply.
-bool FunctionComparator::isEquivalentType(Type *Ty1,
-                                          Type *Ty2) const {
+bool FunctionComparator::isEquivalentType(Type *Ty1, Type *Ty2) const {
   if (Ty1 == Ty2)
     return true;
   if (Ty1->getTypeID() != Ty2->getTypeID()) {
@@ -740,7 +748,13 @@ void MergeFunctions::writeThunk(Function *F, Function *G) {
   if (NewG->getReturnType()->isVoidTy()) {
     Builder.CreateRetVoid();
   } else {
-    Builder.CreateRet(Builder.CreateBitCast(CI, NewG->getReturnType()));
+    Type *RetTy = NewG->getReturnType();
+    if (CI->getType()->isIntegerTy() && RetTy->isPointerTy())
+      Builder.CreateRet(Builder.CreateIntToPtr(CI, RetTy));
+    else if (CI->getType()->isPointerTy() && RetTy->isIntegerTy())
+      Builder.CreateRet(Builder.CreatePtrToInt(CI, RetTy));
+    else
+      Builder.CreateRet(Builder.CreateBitCast(CI, RetTy));
   }
 
   NewG->copyAttributesFrom(G);
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 027a9f2..986c0b8 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -33,7 +33,12 @@ RunLoopVectorization("vectorize-loops",
                      cl::desc("Run the Loop vectorization passes"));
 
 static cl::opt<bool>
-RunBBVectorization("vectorize", cl::desc("Run the BB vectorization passes"));
+RunSLPVectorization("vectorize-slp",
+                    cl::desc("Run the SLP vectorization passes"));
+
+static cl::opt<bool>
+RunBBVectorization("vectorize-slp-aggressive",
+                    cl::desc("Run the BB vectorization passes"));
 
 static cl::opt<bool>
 UseGVNAfterVectorization("use-gvn-after-vectorization",
@@ -52,7 +57,8 @@ PassManagerBuilder::PassManagerBuilder() {
     DisableSimplifyLibCalls = false;
     DisableUnitAtATime = false;
     DisableUnrollLoops = false;
-    Vectorize = RunBBVectorization;
+    BBVectorize = RunBBVectorization;
+    SLPVectorize = RunSLPVectorization;
     LoopVectorize = RunLoopVectorization;
 }
 
@@ -207,7 +213,10 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
 
   addExtensionsToPM(EP_ScalarOptimizerLate, MPM);
 
-  if (Vectorize) {
+  if (SLPVectorize)
+    MPM.add(createSLPVectorizerPass());     // Vectorize parallel scalar chains.
+
+  if (BBVectorize) {
     MPM.add(createBBVectorizePass());
     MPM.add(createInstructionCombiningPass());
     if (OptLevel > 1 && UseGVNAfterVectorization)
@@ -321,6 +330,14 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM,
   PM.add(createGlobalDCEPass());
 }
 
+inline PassManagerBuilder *unwrap(LLVMPassManagerBuilderRef P) {
+    return reinterpret_cast<PassManagerBuilder*>(P);
+}
+
+inline LLVMPassManagerBuilderRef wrap(PassManagerBuilder *P) {
+  return reinterpret_cast<LLVMPassManagerBuilderRef>(P);
+}
+
 LLVMPassManagerBuilderRef LLVMPassManagerBuilderCreate() {
   PassManagerBuilder *PMB = new PassManagerBuilder();
   return wrap(PMB);
diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp
index 5f8681f..3396f79 100644
--- a/lib/Transforms/IPO/StripSymbols.cpp
+++ b/lib/Transforms/IPO/StripSymbols.cpp
@@ -195,10 +195,9 @@ static void findUsedValues(GlobalVariable *LLVMUsed,
                            SmallPtrSet<const GlobalValue*, 8> &UsedValues) {
   if (LLVMUsed == 0) return;
   UsedValues.insert(LLVMUsed);
-  
-  ConstantArray *Inits = dyn_cast<ConstantArray>(LLVMUsed->getInitializer());
-  if (Inits == 0) return;
-  
+
+  ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer());
+
   for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i)
     if (GlobalValue *GV = 
           dyn_cast<GlobalValue>(Inits->getOperand(i)->stripPointerCasts()))
diff --git a/lib/Transforms/InstCombine/CMakeLists.txt b/lib/Transforms/InstCombine/CMakeLists.txt
index 72cfe2c..a25696e 100644
--- a/lib/Transforms/InstCombine/CMakeLists.txt
+++ b/lib/Transforms/InstCombine/CMakeLists.txt
@@ -9,7 +9,7 @@ add_llvm_library(LLVMInstCombine
   InstCombineMulDivRem.cpp
   InstCombinePHI.cpp
   InstCombineSelect.cpp
-  InstCombineShifts.cpp 
+  InstCombineShifts.cpp
   InstCombineSimplifyDemanded.cpp
   InstCombineVectorOps.cpp
   )
diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h
index 1f6a3a5e..2a36074 100644
--- a/lib/Transforms/InstCombine/InstCombine.h
+++ b/lib/Transforms/InstCombine/InstCombine.h
@@ -233,6 +233,7 @@ private:
   Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI);
   bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS);
   Value *EmitGEPOffset(User *GEP);
+  Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN);
 
 public:
   // InsertNewInstBefore - insert an instruction New before instruction Old
diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 3c5781c..b96eb51 100644
--- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -24,9 +24,9 @@ namespace {
   /// Class representing coefficient of floating-point addend.
   /// This class needs to be highly efficient, which is especially true for
   /// the constructor. As of I write this comment, the cost of the default
-  /// constructor is merely 4-byte-store-zero (Assuming compiler is able to 
+  /// constructor is merely 4-byte-store-zero (Assuming compiler is able to
   /// perform write-merging).
-  /// 
+  ///
   class FAddendCoef {
   public:
     // The constructor has to initialize a APFloat, which is uncessary for
@@ -37,31 +37,31 @@ namespace {
     //
     FAddendCoef() : IsFp(false), BufHasFpVal(false), IntVal(0) {}
     ~FAddendCoef();
-  
+
     void set(short C) {
       assert(!insaneIntVal(C) && "Insane coefficient");
       IsFp = false; IntVal = C;
     }
-  
+
     void set(const APFloat& C);
-  
+
     void negate();
-  
+
     bool isZero() const { return isInt() ? !IntVal : getFpVal().isZero(); }
     Value *getValue(Type *) const;
-  
+
     // If possible, don't define operator+/operator- etc because these
     // operators inevitably call FAddendCoef's constructor which is not cheap.
     void operator=(const FAddendCoef &A);
     void operator+=(const FAddendCoef &A);
     void operator-=(const FAddendCoef &A);
     void operator*=(const FAddendCoef &S);
-  
+
     bool isOne() const { return isInt() && IntVal == 1; }
     bool isTwo() const { return isInt() && IntVal == 2; }
     bool isMinusOne() const { return isInt() && IntVal == -1; }
     bool isMinusTwo() const { return isInt() && IntVal == -2; }
-  
+
   private:
     bool insaneIntVal(int V) { return V > 4 || V < -4; }
     APFloat *getFpValPtr(void)
@@ -74,18 +74,28 @@ namespace {
       return *getFpValPtr();
     }
 
-    APFloat &getFpVal(void)
-      { assert(IsFp && BufHasFpVal && "Incorret state"); return *getFpValPtr(); }
-  
+    APFloat &getFpVal(void) {
+      assert(IsFp && BufHasFpVal && "Incorret state");
+      return *getFpValPtr();
+    }
+
     bool isInt() const { return !IsFp; }
 
+    // If the coefficient is represented by an integer, promote it to a
+    // floating point.
+    void convertToFpType(const fltSemantics &Sem);
+
+    // Construct an APFloat from a signed integer.
+    // TODO: We should get rid of this function when APFloat can be constructed
+    //       from an *SIGNED* integer.
+    APFloat createAPFloatFromInt(const fltSemantics &Sem, int Val);
   private:
 
     bool IsFp;
-  
+
     // True iff FpValBuf contains an instance of APFloat.
     bool BufHasFpVal;
-  
+
     // The integer coefficient of an individual addend is either 1 or -1,
     // and we try to simplify at most 4 addends from neighboring at most
     // two instructions. So the range of <IntVal> falls in [-4, 4]. APInt
@@ -94,7 +104,7 @@ namespace {
 
     AlignedCharArrayUnion<APFloat> FpValBuf;
   };
-  
+
   /// FAddend is used to represent floating-point addend. An addend is
   /// represented as <C, V>, where the V is a symbolic value, and C is a
   /// constant coefficient. A constant addend is represented as <C, 0>.
@@ -102,10 +112,10 @@ namespace {
   class FAddend {
   public:
     FAddend() { Val = 0; }
-  
+
     Value *getSymVal (void) const { return Val; }
     const FAddendCoef &getCoef(void) const { return Coeff; }
-  
+
     bool isConstant() const { return Val == 0; }
     bool isZero() const { return Coeff.isZero(); }
 
@@ -114,17 +124,17 @@ namespace {
       { Coeff.set(Coefficient); Val = V; }
     void set(const ConstantFP* Coefficient, Value *V)
       { Coeff.set(Coefficient->getValueAPF()); Val = V; }
-  
+
     void negate() { Coeff.negate(); }
-  
+
     /// Drill down the U-D chain one step to find the definition of V, and
     /// try to break the definition into one or two addends.
     static unsigned drillValueDownOneStep(Value* V, FAddend &A0, FAddend &A1);
-  
+
     /// Similar to FAddend::drillDownOneStep() except that the value being
     /// splitted is the addend itself.
     unsigned drillAddendDownOneStep(FAddend &Addend0, FAddend &Addend1) const;
-  
+
     void operator+=(const FAddend &T) {
       assert((Val == T.Val) && "Symbolic-values disagree");
       Coeff += T.Coeff;
@@ -132,12 +142,12 @@ namespace {
 
   private:
     void Scale(const FAddendCoef& ScaleAmt) { Coeff *= ScaleAmt; }
-  
+
     // This addend has the value of "Coeff * Val".
     Value *Val;
     FAddendCoef Coeff;
   };
-  
+
   /// FAddCombine is the class for optimizing an unsafe fadd/fsub along
   /// with its neighboring at most two instructions.
   ///
@@ -145,17 +155,17 @@ namespace {
   public:
     FAddCombine(InstCombiner::BuilderTy *B) : Builder(B), Instr(0) {}
     Value *simplify(Instruction *FAdd);
-  
+
   private:
     typedef SmallVector<const FAddend*, 4> AddendVect;
-  
+
     Value *simplifyFAdd(AddendVect& V, unsigned InstrQuota);
 
     Value *performFactorization(Instruction *I);
 
     /// Convert given addend to a Value
     Value *createAddendVal(const FAddend &A, bool& NeedNeg);
-    
+
     /// Return the number of instructions needed to emit the N-ary addition.
     unsigned calcInstrNumber(const AddendVect& Vect);
     Value *createFSub(Value *Opnd0, Value *Opnd1);
@@ -165,10 +175,10 @@ namespace {
     Value *createFNeg(Value *V);
     Value *createNaryFAdd(const AddendVect& Opnds, unsigned InstrQuota);
     void createInstPostProc(Instruction *NewInst);
-  
+
     InstCombiner::BuilderTy *Builder;
     Instruction *Instr;
-  
+
   private:
      // Debugging stuff are clustered here.
     #ifndef NDEBUG
@@ -180,7 +190,7 @@ namespace {
       void incCreateInstNum() {}
     #endif
   };
-} 
+}
 
 //===----------------------------------------------------------------------===//
 //
@@ -203,10 +213,34 @@ void FAddendCoef::set(const APFloat& C) {
   } else
     *P = C;
 
-  IsFp = BufHasFpVal = true; 
+  IsFp = BufHasFpVal = true;
 }
 
-void FAddendCoef::operator=(const FAddendCoef& That) {
+void FAddendCoef::convertToFpType(const fltSemantics &Sem) {
+  if (!isInt())
+    return;
+
+  APFloat *P = getFpValPtr();
+  if (IntVal > 0)
+    new(P) APFloat(Sem, IntVal);
+  else {
+    new(P) APFloat(Sem, 0 - IntVal);
+    P->changeSign();
+  }
+  IsFp = BufHasFpVal = true;
+}
+
+APFloat FAddendCoef::createAPFloatFromInt(const fltSemantics &Sem, int Val) {
+  if (Val >= 0)
+    return APFloat(Sem, Val);
+
+  APFloat T(Sem, 0 - Val);
+  T.changeSign();
+
+  return T;
+}
+
+void FAddendCoef::operator=(const FAddendCoef &That) {
   if (That.isInt())
     set(That.IntVal);
   else
@@ -222,16 +256,16 @@ void FAddendCoef::operator+=(const FAddendCoef &That) {
       getFpVal().add(That.getFpVal(), RndMode);
     return;
   }
-  
+
   if (isInt()) {
     const APFloat &T = That.getFpVal();
-    set(T);
-    getFpVal().add(APFloat(T.getSemantics(), IntVal), RndMode);
+    convertToFpType(T.getSemantics());
+    getFpVal().add(T, RndMode);
     return;
   }
-  
+
   APFloat &T = getFpVal();
-  T.add(APFloat(T.getSemantics(), That.IntVal), RndMode);
+  T.add(createAPFloatFromInt(T.getSemantics(), That.IntVal), RndMode);
 }
 
 void FAddendCoef::operator-=(const FAddendCoef &That) {
@@ -243,16 +277,16 @@ void FAddendCoef::operator-=(const FAddendCoef &That) {
       getFpVal().subtract(That.getFpVal(), RndMode);
     return;
   }
-  
+
   if (isInt()) {
     const APFloat &T = That.getFpVal();
-    set(T);
-    getFpVal().subtract(APFloat(T.getSemantics(), IntVal), RndMode);
+    convertToFpType(T.getSemantics());
+    getFpVal().subtract(T, RndMode);
     return;
   }
 
   APFloat &T = getFpVal();
-  T.subtract(APFloat(T.getSemantics(), IntVal), RndMode);
+  T.subtract(createAPFloatFromInt(T.getSemantics(), IntVal), RndMode);
 }
 
 void FAddendCoef::operator*=(const FAddendCoef &That) {
@@ -271,15 +305,16 @@ void FAddendCoef::operator*=(const FAddendCoef &That) {
     return;
   }
 
-  const fltSemantics &Semantic = 
+  const fltSemantics &Semantic =
     isInt() ? That.getFpVal().getSemantics() : getFpVal().getSemantics();
 
   if (isInt())
-    set(APFloat(Semantic, IntVal));
+    convertToFpType(Semantic);
   APFloat &F0 = getFpVal();
 
   if (That.isInt())
-    F0.multiply(APFloat(Semantic, That.IntVal), APFloat::rmNearestTiesToEven);
+    F0.multiply(createAPFloatFromInt(Semantic, That.IntVal),
+                APFloat::rmNearestTiesToEven);
   else
     F0.multiply(That.getFpVal(), APFloat::rmNearestTiesToEven);
 
@@ -305,11 +340,11 @@ Value *FAddendCoef::getValue(Type *Ty) const {
 //  A - B                     <1, A>, <1,B>
 //  0 - B                     <-1, B>
 //  C * A,                    <C, A>
-//  A + C                     <1, A> <C, NULL> 
+//  A + C                     <1, A> <C, NULL>
 //  0 +/- 0                   <0, NULL> (corner case)
 //
 // Legend: A and B are not constant, C is constant
-// 
+//
 unsigned FAddend::drillValueDownOneStep
   (Value *Val, FAddend &Addend0, FAddend &Addend1) {
   Instruction *I = 0;
@@ -380,7 +415,7 @@ unsigned FAddend::drillAddendDownOneStep
     return 0;
 
   unsigned BreakNum = FAddend::drillValueDownOneStep(Val, Addend0, Addend1);
-  if (!BreakNum || Coeff.isOne()) 
+  if (!BreakNum || Coeff.isOne())
     return BreakNum;
 
   Addend0.Scale(Coeff);
@@ -402,10 +437,10 @@ unsigned FAddend::drillAddendDownOneStep
 Value *FAddCombine::performFactorization(Instruction *I) {
   assert((I->getOpcode() == Instruction::FAdd ||
           I->getOpcode() == Instruction::FSub) && "Expect add/sub");
-  
+
   Instruction *I0 = dyn_cast<Instruction>(I->getOperand(0));
   Instruction *I1 = dyn_cast<Instruction>(I->getOperand(1));
-  
+
   if (!I0 || !I1 || I0->getOpcode() != I1->getOpcode())
     return 0;
 
@@ -420,14 +455,14 @@ Value *FAddCombine::performFactorization(Instruction *I) {
   Value *Opnd1_0 = I1->getOperand(0);
   Value *Opnd1_1 = I1->getOperand(1);
 
-  //  Input Instr I       Factor   AddSub0  AddSub1 
+  //  Input Instr I       Factor   AddSub0  AddSub1
   //  ----------------------------------------------
   // (x*y) +/- (x*z)        x        y         z
   // (y/x) +/- (z/x)        x        y         z
   //
   Value *Factor = 0;
   Value *AddSub0 = 0, *AddSub1 = 0;
-  
+
   if (isMpy) {
     if (Opnd0_0 == Opnd1_0 || Opnd0_0 == Opnd1_1)
       Factor = Opnd0_0;
@@ -459,7 +494,7 @@ Value *FAddCombine::performFactorization(Instruction *I) {
 
   if (isMpy)
     return createFMul(Factor, NewAddSub);
- 
+
   return createFDiv(NewAddSub, Factor);
 }
 
@@ -473,7 +508,7 @@ Value *FAddCombine::simplify(Instruction *I) {
   assert((I->getOpcode() == Instruction::FAdd ||
           I->getOpcode() == Instruction::FSub) && "Expect add/sub");
 
-  // Save the instruction before calling other member-functions. 
+  // Save the instruction before calling other member-functions.
   Instr = I;
 
   FAddend Opnd0, Opnd1, Opnd0_0, Opnd0_1, Opnd1_0, Opnd1_1;
@@ -484,7 +519,7 @@ Value *FAddCombine::simplify(Instruction *I) {
   unsigned Opnd0_ExpNum = 0;
   unsigned Opnd1_ExpNum = 0;
 
-  if (!Opnd0.isConstant()) 
+  if (!Opnd0.isConstant())
     Opnd0_ExpNum = Opnd0.drillAddendDownOneStep(Opnd0_0, Opnd0_1);
 
   // Step 2: Expand the 2nd addend into Opnd1_0 and Opnd1_1.
@@ -506,7 +541,7 @@ Value *FAddCombine::simplify(Instruction *I) {
 
     Value *V0 = I->getOperand(0);
     Value *V1 = I->getOperand(1);
-    InstQuota = ((!isa<Constant>(V0) && V0->hasOneUse()) &&  
+    InstQuota = ((!isa<Constant>(V0) && V0->hasOneUse()) &&
                  (!isa<Constant>(V1) && V1->hasOneUse())) ? 2 : 1;
 
     if (Value *R = simplifyFAdd(AllOpnds, InstQuota))
@@ -546,7 +581,7 @@ Value *FAddCombine::simplify(Instruction *I) {
       return R;
   }
 
-  // step 6: Try factorization as the last resort, 
+  // step 6: Try factorization as the last resort,
   return performFactorization(I);
 }
 
@@ -555,7 +590,7 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) {
   unsigned AddendNum = Addends.size();
   assert(AddendNum <= 4 && "Too many addends");
 
-  // For saving intermediate results; 
+  // For saving intermediate results;
   unsigned NextTmpIdx = 0;
   FAddend TmpResult[3];
 
@@ -571,7 +606,7 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) {
   AddendVect SimpVect;
 
   // The outer loop works on one symbolic-value at a time. Suppose the input
-  // addends are : <a1, x>, <b1, y>, <a2, x>, <c1, z>, <b2, y>, ... 
+  // addends are : <a1, x>, <b1, y>, <a2, x>, <c1, z>, <b2, y>, ...
   // The symbolic-values will be processed in this order: x, y, z.
   //
   for (unsigned SymIdx = 0; SymIdx < AddendNum; SymIdx++) {
@@ -598,7 +633,7 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) {
       if (T && T->getSymVal() == Val) {
         // Set null such that next iteration of the outer loop will not process
         // this addend again.
-        Addends[SameSymIdx] = 0; 
+        Addends[SameSymIdx] = 0;
         SimpVect.push_back(T);
       }
     }
@@ -611,7 +646,7 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) {
         R += *SimpVect[Idx];
 
       // Pop all addends being folded and push the resulting folded addend.
-      SimpVect.resize(StartIdx); 
+      SimpVect.resize(StartIdx);
       if (Val != 0) {
         if (!R.isZero()) {
           SimpVect.push_back(&R);
@@ -624,7 +659,7 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) {
     }
   }
 
-  assert((NextTmpIdx <= sizeof(TmpResult)/sizeof(TmpResult[0]) + 1) && 
+  assert((NextTmpIdx <= sizeof(TmpResult)/sizeof(TmpResult[0]) + 1) &&
          "out-of-bound access");
 
   if (ConstAdd)
@@ -646,7 +681,7 @@ Value *FAddCombine::createNaryFAdd
   assert(!Opnds.empty() && "Expect at least one addend");
 
   // Step 1: Check if the # of instructions needed exceeds the quota.
-  // 
+  //
   unsigned InstrNeeded = calcInstrNumber(Opnds);
   if (InstrNeeded > InstrQuota)
     return 0;
@@ -667,7 +702,7 @@ Value *FAddCombine::createNaryFAdd
   // Iterate the addends, creating fadd/fsub using adjacent two addends.
   for (AddendVect::const_iterator I = Opnds.begin(), E = Opnds.end();
        I != E; I++) {
-    bool NeedNeg; 
+    bool NeedNeg;
     Value *V = createAddendVal(**I, NeedNeg);
     if (!LastVal) {
       LastVal = V;
@@ -693,7 +728,7 @@ Value *FAddCombine::createNaryFAdd
   }
 
   #ifndef NDEBUG
-    assert(CreateInstrNum == InstrNeeded && 
+    assert(CreateInstrNum == InstrNeeded &&
            "Inconsistent in instruction numbers");
   #endif
 
@@ -751,8 +786,8 @@ unsigned FAddCombine::calcInstrNumber(const AddendVect &Opnds) {
   unsigned OpndNum = Opnds.size();
   unsigned InstrNeeded = OpndNum - 1;
 
-  // The number of addends in the form of "(-1)*x". 
-  unsigned NegOpndNum = 0; 
+  // The number of addends in the form of "(-1)*x".
+  unsigned NegOpndNum = 0;
 
   // Adjust the number of instructions needed to emit the N-ary add.
   for (AddendVect::const_iterator I = Opnds.begin(), E = Opnds.end();
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 990cbc3..ec75dd2 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -266,9 +266,8 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op,
   return 0;
 }
 
-
-/// InsertRangeTest - Emit a computation of: (V >= Lo && V < Hi) if Inside is
-/// true, otherwise (V < Lo || V >= Hi).  In practice, we emit the more efficient
+/// Emit a computation of: (V >= Lo && V < Hi) if Inside is true, otherwise
+/// (V < Lo || V >= Hi).  In practice, we emit the more efficient
 /// (V-Lo) \<u Hi-Lo.  This method expects that Lo <= Hi. isSigned indicates
 /// whether to treat the V, Lo and HI as signed or not. IB is the location to
 /// insert new instructions.
@@ -935,6 +934,9 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
 Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
   if (LHS->getPredicate() == FCmpInst::FCMP_ORD &&
       RHS->getPredicate() == FCmpInst::FCMP_ORD) {
+    if (LHS->getOperand(0)->getType() != RHS->getOperand(0)->getType())
+      return 0;
+
     // (fcmp ord x, c) & (fcmp ord y, c)  -> (fcmp ord x, y)
     if (ConstantFP *LHSC = dyn_cast<ConstantFP>(LHS->getOperand(1)))
       if (ConstantFP *RHSC = dyn_cast<ConstantFP>(RHS->getOperand(1))) {
@@ -1545,14 +1547,6 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
     switch (RHSCC) {
     default: llvm_unreachable("Unknown integer condition code!");
     case ICmpInst::ICMP_EQ:
-      if (LHSCst == SubOne(RHSCst)) {
-        // (X == 13 | X == 14) -> X-13 <u 2
-        Constant *AddCST = ConstantExpr::getNeg(LHSCst);
-        Value *Add = Builder->CreateAdd(Val, AddCST, Val->getName()+".off");
-        AddCST = ConstantExpr::getSub(AddOne(RHSCst), LHSCst);
-        return Builder->CreateICmpULT(Add, AddCST);
-      }
-
       if (LHS->getOperand(0) == RHS->getOperand(0)) {
         // if LHSCst and RHSCst differ only by one bit:
         // (A == C1 || A == C2) -> (A & ~(C1 ^ C2)) == C1
@@ -1566,6 +1560,14 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
         }
       }
 
+      if (LHSCst == SubOne(RHSCst)) {
+        // (X == 13 | X == 14) -> X-13 <u 2
+        Constant *AddCST = ConstantExpr::getNeg(LHSCst);
+        Value *Add = Builder->CreateAdd(Val, AddCST, Val->getName()+".off");
+        AddCST = ConstantExpr::getSub(AddOne(RHSCst), LHSCst);
+        return Builder->CreateICmpULT(Add, AddCST);
+      }
+
       break;                         // (X == 13 | X == 15) -> no change
     case ICmpInst::ICMP_UGT:         // (X == 13 | X u> 14) -> no change
     case ICmpInst::ICMP_SGT:         // (X == 13 | X s> 14) -> no change
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 64cd1bd..78b4a2c 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1372,7 +1372,8 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
         NestF->getType() == PointerType::getUnqual(NewFTy) ?
         NestF : ConstantExpr::getBitCast(NestF,
                                          PointerType::getUnqual(NewFTy));
-      const AttributeSet &NewPAL = AttributeSet::get(FTy->getContext(), NewAttrs);
+      const AttributeSet &NewPAL =
+          AttributeSet::get(FTy->getContext(), NewAttrs);
 
       Instruction *NewCaller;
       if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index d162223..2ee1278 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1610,6 +1610,9 @@ static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI,
 /// OptimizeIntToFloatBitCast - See if we can optimize an integer->float/double
 /// bitcast.  The various long double bitcasts can't get in here.
 static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){
+  // We need to know the target byte order to perform this optimization.
+  if (!IC.getDataLayout()) return 0;
+
   Value *Src = CI.getOperand(0);
   Type *DestTy = CI.getType();
 
@@ -1631,7 +1634,10 @@ static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){
         VecInput = IC.Builder->CreateBitCast(VecInput, VecTy);
       }
 
-      return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(0));
+      unsigned Elt = 0;
+      if (IC.getDataLayout()->isBigEndian())
+        Elt = VecTy->getPrimitiveSizeInBits() / DestWidth - 1;
+      return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt));
     }
   }
 
@@ -1653,6 +1659,8 @@ static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){
       }
 
       unsigned Elt = ShAmt->getZExtValue() / DestWidth;
+      if (IC.getDataLayout()->isBigEndian())
+        Elt = VecTy->getPrimitiveSizeInBits() / DestWidth - 1 - Elt;
       return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt));
     }
   }
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 32fdb9b..4c252c0 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -139,6 +139,31 @@ static bool isSignBitCheck(ICmpInst::Predicate pred, ConstantInt *RHS,
   }
 }
 
+/// Returns true if the exploded icmp can be expressed as a signed comparison
+/// to zero and updates the predicate accordingly.
+/// The signedness of the comparison is preserved.
+static bool isSignTest(ICmpInst::Predicate &pred, const ConstantInt *RHS) {
+  if (!ICmpInst::isSigned(pred))
+    return false;
+
+  if (RHS->isZero())
+    return ICmpInst::isRelational(pred);
+
+  if (RHS->isOne()) {
+    if (pred == ICmpInst::ICMP_SLT) {
+      pred = ICmpInst::ICMP_SLE;
+      return true;
+    }
+  } else if (RHS->isAllOnesValue()) {
+    if (pred == ICmpInst::ICMP_SGT) {
+      pred = ICmpInst::ICMP_SGE;
+      return true;
+    }
+  }
+
+  return false;
+}
+
 // isHighOnes - Return true if the constant is of the form 1+0+.
 // This is the same as lowones(~X).
 static bool isHighOnes(const ConstantInt *CI) {
@@ -207,7 +232,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
   Constant *Init = GV->getInitializer();
   if (!isa<ConstantArray>(Init) && !isa<ConstantDataArray>(Init))
     return 0;
-  
+
   uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
   if (ArrayElementCount > 1024) return 0;  // Don't blow up on huge arrays.
 
@@ -443,20 +468,29 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
   }
 
 
-  // If a 32-bit or 64-bit magic bitvector captures the entire comparison state
+  // If a magic bitvector captures the entire comparison state
   // of this load, replace it with computation that does:
   //   ((magic_cst >> i) & 1) != 0
-  if (ArrayElementCount <= 32 ||
-      (TD && ArrayElementCount <= 64 && TD->isLegalInteger(64))) {
-    Type *Ty;
-    if (ArrayElementCount <= 32)
+  {
+    Type *Ty = 0;
+
+    // Look for an appropriate type:
+    // - The type of Idx if the magic fits
+    // - The smallest fitting legal type if we have a DataLayout
+    // - Default to i32
+    if (ArrayElementCount <= Idx->getType()->getIntegerBitWidth())
+      Ty = Idx->getType();
+    else if (TD)
+      Ty = TD->getSmallestLegalIntType(Init->getContext(), ArrayElementCount);
+    else if (ArrayElementCount <= 32)
       Ty = Type::getInt32Ty(Init->getContext());
-    else
-      Ty = Type::getInt64Ty(Init->getContext());
-    Value *V = Builder->CreateIntCast(Idx, Ty, false);
-    V = Builder->CreateLShr(ConstantInt::get(Ty, MagicBitvector), V);
-    V = Builder->CreateAnd(ConstantInt::get(Ty, 1), V);
-    return new ICmpInst(ICmpInst::ICMP_NE, V, ConstantInt::get(Ty, 0));
+
+    if (Ty != 0) {
+      Value *V = Builder->CreateIntCast(Idx, Ty, false);
+      V = Builder->CreateLShr(ConstantInt::get(Ty, MagicBitvector), V);
+      V = Builder->CreateAnd(ConstantInt::get(Ty, 1), V);
+      return new ICmpInst(ICmpInst::ICMP_NE, V, ConstantInt::get(Ty, 0));
+    }
   }
 
   return 0;
@@ -1273,6 +1307,23 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
     break;
   }
 
+  case Instruction::Mul: {       // (icmp pred (mul X, Val), CI)
+    ConstantInt *Val = dyn_cast<ConstantInt>(LHSI->getOperand(1));
+    if (!Val) break;
+
+    // If this is a signed comparison to 0 and the mul is sign preserving,
+    // use the mul LHS operand instead.
+    ICmpInst::Predicate pred = ICI.getPredicate();
+    if (isSignTest(pred, RHS) && !Val->isZero() &&
+        cast<BinaryOperator>(LHSI)->hasNoSignedWrap())
+      return new ICmpInst(Val->isNegative() ?
+                          ICmpInst::getSwappedPredicate(pred) : pred,
+                          LHSI->getOperand(0),
+                          Constant::getNullValue(RHS->getType()));
+
+    break;
+  }
+
   case Instruction::Shl: {       // (icmp pred (shl X, ShAmt), CI)
     ConstantInt *ShAmt = dyn_cast<ConstantInt>(LHSI->getOperand(1));
     if (!ShAmt) break;
@@ -1304,6 +1355,12 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
         return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0),
                             ConstantExpr::getLShr(RHS, ShAmt));
 
+      // If the shift is NSW and we compare to 0, then it is just shifting out
+      // sign bits, no need for an AND either.
+      if (cast<BinaryOperator>(LHSI)->hasNoSignedWrap() && RHSV == 0)
+        return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0),
+                            ConstantExpr::getLShr(RHS, ShAmt));
+
       if (LHSI->hasOneUse()) {
         // Otherwise strength reduce the shift into an and.
         uint32_t ShAmtVal = (uint32_t)ShAmt->getLimitedValue(TypeBits);
@@ -1318,6 +1375,15 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
       }
     }
 
+    // If this is a signed comparison to 0 and the shift is sign preserving,
+    // use the shift LHS operand instead.
+    ICmpInst::Predicate pred = ICI.getPredicate();
+    if (isSignTest(pred, RHS) &&
+        cast<BinaryOperator>(LHSI)->hasNoSignedWrap())
+      return new ICmpInst(pred,
+                          LHSI->getOperand(0),
+                          Constant::getNullValue(RHS->getType()));
+
     // Otherwise, if this is a comparison of the sign bit, simplify to and/test.
     bool TrueIfSigned = false;
     if (LHSI->hasOneUse() &&
@@ -1532,6 +1598,19 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
             return new ICmpInst(pred, X, NegX);
           }
         }
+        break;
+      case Instruction::Mul:
+        if (RHSV == 0 && BO->hasNoSignedWrap()) {
+          if (ConstantInt *BOC = dyn_cast<ConstantInt>(BO->getOperand(1))) {
+            // The trivial case (mul X, 0) is handled by InstSimplify
+            // General case : (mul X, C) != 0 iff X != 0
+            //                (mul X, C) == 0 iff X == 0
+            if (!BOC->isZero())
+              return new ICmpInst(ICI.getPredicate(), BO->getOperand(0),
+                                  Constant::getNullValue(RHS->getType()));
+          }
+        }
+        break;
       default: break;
       }
     } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(LHSI)) {
@@ -2408,6 +2487,55 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
       return new ICmpInst(Pred, Y, Z);
     }
 
+    // icmp slt (X + -1), Y -> icmp sle X, Y
+    if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SLT &&
+        match(B, m_AllOnes()))
+      return new ICmpInst(CmpInst::ICMP_SLE, A, Op1);
+
+    // icmp sge (X + -1), Y -> icmp sgt X, Y
+    if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SGE &&
+        match(B, m_AllOnes()))
+      return new ICmpInst(CmpInst::ICMP_SGT, A, Op1);
+
+    // icmp sle (X + 1), Y -> icmp slt X, Y
+    if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SLE &&
+        match(B, m_One()))
+      return new ICmpInst(CmpInst::ICMP_SLT, A, Op1);
+
+    // icmp sgt (X + 1), Y -> icmp sge X, Y
+    if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SGT &&
+        match(B, m_One()))
+      return new ICmpInst(CmpInst::ICMP_SGE, A, Op1);
+
+    // if C1 has greater magnitude than C2:
+    //  icmp (X + C1), (Y + C2) -> icmp (X + C3), Y
+    //  s.t. C3 = C1 - C2
+    //
+    // if C2 has greater magnitude than C1:
+    //  icmp (X + C1), (Y + C2) -> icmp X, (Y + C3)
+    //  s.t. C3 = C2 - C1
+    if (A && C && NoOp0WrapProblem && NoOp1WrapProblem &&
+        (BO0->hasOneUse() || BO1->hasOneUse()) && !I.isUnsigned())
+      if (ConstantInt *C1 = dyn_cast<ConstantInt>(B))
+        if (ConstantInt *C2 = dyn_cast<ConstantInt>(D)) {
+          const APInt &AP1 = C1->getValue();
+          const APInt &AP2 = C2->getValue();
+          if (AP1.isNegative() == AP2.isNegative()) {
+            APInt AP1Abs = C1->getValue().abs();
+            APInt AP2Abs = C2->getValue().abs();
+            if (AP1Abs.uge(AP2Abs)) {
+              ConstantInt *C3 = Builder->getInt(AP1 - AP2);
+              Value *NewAdd = Builder->CreateNSWAdd(A, C3);
+              return new ICmpInst(Pred, NewAdd, C);
+            } else {
+              ConstantInt *C3 = Builder->getInt(AP2 - AP1);
+              Value *NewAdd = Builder->CreateNSWAdd(C, C3);
+              return new ICmpInst(Pred, A, NewAdd);
+            }
+          }
+        }
+
+
     // Analyze the case when either Op0 or Op1 is a sub instruction.
     // Op0 = A - B (or A and B are null); Op1 = C - D (or C and D are null).
     A = 0; B = 0; C = 0; D = 0;
@@ -2541,6 +2669,15 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
   }
 
   { Value *A, *B;
+    // Transform (A & ~B) == 0 --> (A & B) != 0
+    // and       (A & ~B) != 0 --> (A & B) == 0
+    // if A is a power of 2.
+    if (match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) &&
+        match(Op1, m_Zero()) && isKnownToBeAPowerOfTwo(A) && I.isEquality())
+      return new ICmpInst(I.getInversePredicate(),
+                          Builder->CreateAnd(A, B),
+                          Op1);
+
     // ~x < ~y --> y < x
     // ~x < cst --> ~cst < x
     if (match(Op0, m_Not(m_Value(A)))) {
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 337cfe3..e2d7966 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -69,8 +69,8 @@ isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy,
     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
       // If the GEP has all zero indices, it doesn't offset the pointer.  If it
       // doesn't, it does.
-      if (!isOnlyCopiedFromConstantGlobal(GEP, TheCopy, ToDelete,
-                                          IsOffset || !GEP->hasAllZeroIndices()))
+      if (!isOnlyCopiedFromConstantGlobal(
+              GEP, TheCopy, ToDelete, IsOffset || !GEP->hasAllZeroIndices()))
         return false;
       continue;
     }
@@ -166,7 +166,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
   // Convert: alloca Ty, C - where C is a constant != 1 into: alloca [C x Ty], 1
   if (AI.isArrayAllocation()) {  // Check C != 1
     if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) {
-      Type *NewTy = 
+      Type *NewTy =
         ArrayType::get(AI.getAllocatedType(), C->getZExtValue());
       AllocaInst *New = Builder->CreateAlloca(NewTy, 0, AI.getName());
       New->setAlignment(AI.getAlignment());
@@ -294,7 +294,7 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI,
 
     Type *SrcPTy = SrcTy->getElementType();
 
-    if (DestPTy->isIntegerTy() || DestPTy->isPointerTy() || 
+    if (DestPTy->isIntegerTy() || DestPTy->isPointerTy() ||
          DestPTy->isVectorTy()) {
       // If the source is an array, the code below will not succeed.  Check to
       // see if a trivial 'gep P, 0, 0' will help matters.  Only do this for
@@ -311,7 +311,7 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI,
           }
 
       if (IC.getDataLayout() &&
-          (SrcPTy->isIntegerTy() || SrcPTy->isPointerTy() || 
+          (SrcPTy->isIntegerTy() || SrcPTy->isPointerTy() ||
             SrcPTy->isVectorTy()) &&
           // Do not allow turning this into a load of an integer, which is then
           // casted to a pointer, this pessimizes pointer analysis a lot.
@@ -322,7 +322,7 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI,
         // Okay, we are casting from one integer or pointer type to another of
         // the same size.  Instead of casting the pointer before the load, cast
         // the result of the loaded value.
-        LoadInst *NewLoad = 
+        LoadInst *NewLoad =
           IC.Builder->CreateLoad(CastOp, LI.isVolatile(), CI->getName());
         NewLoad->setAlignment(LI.getAlignment());
         NewLoad->setAtomic(LI.getOrdering(), LI.getSynchScope());
@@ -359,7 +359,7 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
   // None of the following transforms are legal for volatile/atomic loads.
   // FIXME: Some of it is okay for atomic loads; needs refactoring.
   if (!LI.isSimple()) return 0;
-  
+
   // Do really simple store-to-load forwarding and load CSE, to catch cases
   // where there are several consecutive memory accesses to the same location,
   // separated by a few arithmetic operations.
@@ -380,7 +380,7 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
                     Constant::getNullValue(Op->getType()), &LI);
       return ReplaceInstUsesWith(LI, UndefValue::get(LI.getType()));
     }
-  } 
+  }
 
   // load null/undef -> unreachable
   // TODO: Consider a target hook for valid address spaces for this xform.
@@ -399,7 +399,7 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
     if (CE->isCast())
       if (Instruction *Res = InstCombineLoadCast(*this, LI, TD))
         return Res;
-  
+
   if (Op->hasOneUse()) {
     // Change select and PHI nodes to select values instead of addresses: this
     // helps alias analysis out a lot, allows many others simplifications, and
@@ -453,18 +453,18 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) {
   Type *DestPTy = cast<PointerType>(CI->getType())->getElementType();
   PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType());
   if (SrcTy == 0) return 0;
-  
+
   Type *SrcPTy = SrcTy->getElementType();
 
   if (!DestPTy->isIntegerTy() && !DestPTy->isPointerTy())
     return 0;
-  
+
   /// NewGEPIndices - If SrcPTy is an aggregate type, we can emit a "noop gep"
   /// to its first element.  This allows us to handle things like:
   ///   store i32 xxx, (bitcast {foo*, float}* %P to i32*)
   /// on 32-bit hosts.
   SmallVector<Value*, 4> NewGEPIndices;
-  
+
   // If the source is an array, the code below will not succeed.  Check to
   // see if a trivial 'gep P, 0, 0' will help matters.  Only do this for
   // constants.
@@ -472,7 +472,7 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) {
     // Index through pointer.
     Constant *Zero = Constant::getNullValue(Type::getInt32Ty(SI.getContext()));
     NewGEPIndices.push_back(Zero);
-    
+
     while (1) {
       if (StructType *STy = dyn_cast<StructType>(SrcPTy)) {
         if (!STy->getNumElements()) /* Struct can be empty {} */
@@ -486,24 +486,24 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) {
         break;
       }
     }
-    
+
     SrcTy = PointerType::get(SrcPTy, SrcTy->getAddressSpace());
   }
 
   if (!SrcPTy->isIntegerTy() && !SrcPTy->isPointerTy())
     return 0;
-  
+
   // If the pointers point into different address spaces or if they point to
   // values with different sizes, we can't do the transformation.
   if (!IC.getDataLayout() ||
-      SrcTy->getAddressSpace() != 
+      SrcTy->getAddressSpace() !=
         cast<PointerType>(CI->getType())->getAddressSpace() ||
       IC.getDataLayout()->getTypeSizeInBits(SrcPTy) !=
       IC.getDataLayout()->getTypeSizeInBits(DestPTy))
     return 0;
 
   // Okay, we are casting from one integer or pointer type to another of
-  // the same size.  Instead of casting the pointer before 
+  // the same size.  Instead of casting the pointer before
   // the store, cast the value to be stored.
   Value *NewCast;
   Value *SIOp0 = SI.getOperand(0);
@@ -517,12 +517,12 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) {
     if (SIOp0->getType()->isPointerTy())
       opcode = Instruction::PtrToInt;
   }
-  
+
   // SIOp0 is a pointer to aggregate and this is a store to the first field,
   // emit a GEP to index into its first field.
   if (!NewGEPIndices.empty())
     CastOp = IC.Builder->CreateInBoundsGEP(CastOp, NewGEPIndices);
-  
+
   NewCast = IC.Builder->CreateCast(opcode, SIOp0, CastDstTy,
                                    SIOp0->getName()+".c");
   SI.setOperand(0, NewCast);
@@ -541,7 +541,7 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) {
 static bool equivalentAddressValues(Value *A, Value *B) {
   // Test if the values are trivially equivalent.
   if (A == B) return true;
-  
+
   // Test if the values come form identical arithmetic instructions.
   // This uses isIdenticalToWhenDefined instead of isIdenticalTo because
   // its only used to compare two uses within the same basic block, which
@@ -554,7 +554,7 @@ static bool equivalentAddressValues(Value *A, Value *B) {
     if (Instruction *BI = dyn_cast<Instruction>(B))
       if (cast<Instruction>(A)->isIdenticalToWhenDefined(BI))
         return true;
-  
+
   // Otherwise they may not be equivalent.
   return false;
 }
@@ -585,7 +585,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
   // If the RHS is an alloca with a single use, zapify the store, making the
   // alloca dead.
   if (Ptr->hasOneUse()) {
-    if (isa<AllocaInst>(Ptr)) 
+    if (isa<AllocaInst>(Ptr))
       return EraseInstFromFunction(SI);
     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
       if (isa<AllocaInst>(GEP->getOperand(0))) {
@@ -608,8 +608,8 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
         (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) {
       ScanInsts++;
       continue;
-    }    
-    
+    }
+
     if (StoreInst *PrevSI = dyn_cast<StoreInst>(BBI)) {
       // Prev store isn't volatile, and stores to the same location?
       if (PrevSI->isSimple() && equivalentAddressValues(PrevSI->getOperand(1),
@@ -621,7 +621,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
       }
       break;
     }
-    
+
     // If this is a load, we have to stop.  However, if the loaded value is from
     // the pointer we're loading and is producing the pointer we're storing,
     // then *this* store is dead (X = load P; store X -> P).
@@ -629,12 +629,12 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
       if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr) &&
           LI->isSimple())
         return EraseInstFromFunction(SI);
-      
+
       // Otherwise, this is a load from some other location.  Stores before it
       // may not be dead.
       break;
     }
-    
+
     // Don't skip over loads or things that can modify memory.
     if (BBI->mayWriteToMemory() || BBI->mayReadFromMemory())
       break;
@@ -664,11 +664,11 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
       if (Instruction *Res = InstCombineStoreToCast(*this, SI))
         return Res;
 
-  
+
   // If this store is the last instruction in the basic block (possibly
   // excepting debug info instructions), and if the block ends with an
   // unconditional branch, try to move it to the successor block.
-  BBI = &SI; 
+  BBI = &SI;
   do {
     ++BBI;
   } while (isa<DbgInfoIntrinsic>(BBI) ||
@@ -677,7 +677,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
     if (BI->isUnconditional())
       if (SimplifyStoreAtEndOfBlock(SI))
         return 0;  // xform done!
-  
+
   return 0;
 }
 
@@ -691,12 +691,12 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
 ///
 bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
   BasicBlock *StoreBB = SI.getParent();
-  
+
   // Check to see if the successor block has exactly two incoming edges.  If
   // so, see if the other predecessor contains a store to the same location.
   // if so, insert a PHI node (if needed) and move the stores down.
   BasicBlock *DestBB = StoreBB->getTerminator()->getSuccessor(0);
-  
+
   // Determine whether Dest has exactly two predecessors and, if so, compute
   // the other predecessor.
   pred_iterator PI = pred_begin(DestBB);
@@ -708,7 +708,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
 
   if (++PI == pred_end(DestBB))
     return false;
-  
+
   P = *PI;
   if (P != StoreBB) {
     if (OtherBB)
@@ -728,7 +728,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
   BranchInst *OtherBr = dyn_cast<BranchInst>(BBI);
   if (!OtherBr || BBI == OtherBB->begin())
     return false;
-  
+
   // If the other block ends in an unconditional branch, check for the 'if then
   // else' case.  there is an instruction before the branch.
   StoreInst *OtherStore = 0;
@@ -750,10 +750,10 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
   } else {
     // Otherwise, the other block ended with a conditional branch. If one of the
     // destinations is StoreBB, then we have the if/then case.
-    if (OtherBr->getSuccessor(0) != StoreBB && 
+    if (OtherBr->getSuccessor(0) != StoreBB &&
         OtherBr->getSuccessor(1) != StoreBB)
       return false;
-    
+
     // Okay, we know that OtherBr now goes to Dest and StoreBB, so this is an
     // if/then triangle.  See if there is a store to the same ptr as SI that
     // lives in OtherBB.
@@ -771,7 +771,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
           BBI == OtherBB->begin())
         return false;
     }
-    
+
     // In order to eliminate the store in OtherBr, we have to
     // make sure nothing reads or overwrites the stored value in
     // StoreBB.
@@ -781,7 +781,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
         return false;
     }
   }
-  
+
   // Insert a PHI node now if we need it.
   Value *MergedVal = OtherStore->getOperand(0);
   if (MergedVal != SI.getOperand(0)) {
@@ -790,7 +790,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
     PN->addIncoming(OtherStore->getOperand(0), OtherBB);
     MergedVal = InsertNewInstBefore(PN, DestBB->front());
   }
-  
+
   // Advance to a place where it is safe to insert the new store and
   // insert it.
   BBI = DestBB->getFirstInsertionPt();
@@ -800,7 +800,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
                                    SI.getOrdering(),
                                    SI.getSynchScope());
   InsertNewInstBefore(NewSI, *BBI);
-  NewSI->setDebugLoc(OtherStore->getDebugLoc()); 
+  NewSI->setDebugLoc(OtherStore->getDebugLoc());
 
   // If the two stores had the same TBAA tag, preserve it.
   if (MDNode *TBAATag = SI.getMetadata(LLVMContext::MD_tbaa))
@@ -808,7 +808,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
                                OtherStore->getMetadata(LLVMContext::MD_tbaa))))
       NewSI->setMetadata(LLVMContext::MD_tbaa, TBAATag);
 
-  
+
   // Nuke the old stores.
   EraseInstFromFunction(SI);
   EraseInstFromFunction(*OtherStore);
diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 173f2bf..df73906 100644
--- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -28,7 +28,7 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC) {
   // if this is safe.  For example, the use could be in dynamically unreached
   // code.
   if (!V->hasOneUse()) return 0;
-  
+
   bool MadeChange = false;
 
   // ((1 << A) >>u B) --> (1 << (A-B))
@@ -41,7 +41,7 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC) {
     A = IC.Builder->CreateSub(A, B);
     return IC.Builder->CreateShl(PowerOf2, A);
   }
-  
+
   // (PowerOfTwo >>u B) --> isExact since shifting out the result would make it
   // inexact.  Similarly for <<.
   if (BinaryOperator *I = dyn_cast<BinaryOperator>(V))
@@ -52,12 +52,12 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC) {
         I->setOperand(0, V2);
         MadeChange = true;
       }
-      
+
       if (I->getOpcode() == Instruction::LShr && !I->isExact()) {
         I->setIsExact();
         MadeChange = true;
       }
-      
+
       if (I->getOpcode() == Instruction::Shl && !I->hasNoUnsignedWrap()) {
         I->setHasNoUnsignedWrap();
         MadeChange = true;
@@ -67,7 +67,7 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC) {
   // TODO: Lots more we could do here:
   //    If V is a phi node, we can call this on each of its operands.
   //    "select cond, X, 0" can simplify to "X".
-  
+
   return MadeChange ? V : 0;
 }
 
@@ -84,12 +84,12 @@ static bool MultiplyOverflows(ConstantInt *C1, ConstantInt *C2, bool sign) {
     LHSExt = LHSExt.zext(W * 2);
     RHSExt = RHSExt.zext(W * 2);
   }
-  
+
   APInt MulExt = LHSExt * RHSExt;
-  
+
   if (!sign)
     return MulExt.ugt(APInt::getLowBitsSet(W * 2, W));
-  
+
   APInt Min = APInt::getSignedMinValue(W).sext(W * 2);
   APInt Max = APInt::getSignedMaxValue(W).sext(W * 2);
   return MulExt.slt(Min) || MulExt.sgt(Max);
@@ -107,16 +107,16 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
 
   if (match(Op1, m_AllOnes()))  // X * -1 == 0 - X
     return BinaryOperator::CreateNeg(Op0, I.getName());
-  
+
   if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
-    
+
     // ((X << C1)*C2) == (X * (C2 << C1))
     if (BinaryOperator *SI = dyn_cast<BinaryOperator>(Op0))
       if (SI->getOpcode() == Instruction::Shl)
         if (Constant *ShOp = dyn_cast<Constant>(SI->getOperand(1)))
           return BinaryOperator::CreateMul(SI->getOperand(0),
                                            ConstantExpr::getShl(CI, ShOp));
-    
+
     const APInt &Val = CI->getValue();
     if (Val.isPowerOf2()) {          // Replace X*(2^C) with X << C
       Constant *NewCst = ConstantInt::get(Op0->getType(), Val.logBase2());
@@ -125,7 +125,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
       if (I.hasNoUnsignedWrap()) Shl->setHasNoUnsignedWrap();
       return Shl;
     }
-    
+
     // Canonicalize (X+C1)*CI -> X*CI+C1*CI.
     { Value *X; ConstantInt *C1;
       if (Op0->hasOneUse() &&
@@ -158,9 +158,9 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
       }
     }
   }
-  
+
   // Simplify mul instructions with a constant RHS.
-  if (isa<Constant>(Op1)) {    
+  if (isa<Constant>(Op1)) {
     // Try to fold constant mul into select arguments.
     if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
       if (Instruction *R = FoldOpIntoSelect(I, SI))
@@ -181,7 +181,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
     Value *Op1C = Op1;
     BinaryOperator *BO = dyn_cast<BinaryOperator>(Op0);
     if (!BO ||
-        (BO->getOpcode() != Instruction::UDiv && 
+        (BO->getOpcode() != Instruction::UDiv &&
          BO->getOpcode() != Instruction::SDiv)) {
       Op1C = Op0;
       BO = dyn_cast<BinaryOperator>(Op1);
@@ -227,14 +227,14 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
     if (match(Op1, m_Shl(m_One(), m_Value(Y))))
       return BinaryOperator::CreateShl(Op0, Y);
   }
-  
+
   // If one of the operands of the multiply is a cast from a boolean value, then
   // we know the bool is either zero or one, so this is a 'masking' multiply.
   //   X * Y (where Y is 0 or 1) -> X & (0-Y)
   if (!I.getType()->isVectorTy()) {
     // -2 is "-1 << 1" so it is all bits set except the low one.
     APInt Negative2(I.getType()->getPrimitiveSizeInBits(), (uint64_t)-2, true);
-    
+
     Value *BoolCast = 0, *OtherOp = 0;
     if (MaskedValueIsZero(Op0, Negative2))
       BoolCast = Op0, OtherOp = Op1;
@@ -280,7 +280,7 @@ static void detectLog2OfHalf(Value *&Op, Value *&Y, IntrinsicInst *&Log2) {
      return;
    if (I->getOpcode() != Instruction::FMul || !I->hasUnsafeAlgebra())
      return;
-              
+
    ConstantFP *CFP = dyn_cast<ConstantFP>(I->getOperand(0));
    if (CFP && CFP->isExactlyValue(0.5)) {
      Y = I->getOperand(1);
@@ -289,14 +289,14 @@ static void detectLog2OfHalf(Value *&Op, Value *&Y, IntrinsicInst *&Log2) {
    CFP = dyn_cast<ConstantFP>(I->getOperand(1));
    if (CFP && CFP->isExactlyValue(0.5))
      Y = I->getOperand(0);
-} 
+}
 
 /// Helper function of InstCombiner::visitFMul(BinaryOperator(). It returns
 /// true iff the given value is FMul or FDiv with one and only one operand
 /// being a normal constant (i.e. not Zero/NaN/Infinity).
 static bool isFMulOrFDivWithConstant(Value *V) {
   Instruction *I = dyn_cast<Instruction>(V);
-  if (!I || (I->getOpcode() != Instruction::FMul && 
+  if (!I || (I->getOpcode() != Instruction::FMul &&
              I->getOpcode() != Instruction::FDiv))
     return false;
 
@@ -318,10 +318,10 @@ static bool isNormalFp(const ConstantFP *C) {
 /// foldFMulConst() is a helper routine of InstCombiner::visitFMul().
 /// The input \p FMulOrDiv is a FMul/FDiv with one and only one operand
 /// being a constant (i.e. isFMulOrFDivWithConstant(FMulOrDiv) == true).
-/// This function is to simplify "FMulOrDiv * C" and returns the 
+/// This function is to simplify "FMulOrDiv * C" and returns the
 /// resulting expression. Note that this function could return NULL in
 /// case the constants cannot be folded into a normal floating-point.
-/// 
+///
 Value *InstCombiner::foldFMulConst(Instruction *FMulOrDiv, ConstantFP *C,
                                    Instruction *InsertBefore) {
   assert(isFMulOrFDivWithConstant(FMulOrDiv) && "V is invalid");
@@ -351,7 +351,7 @@ Value *InstCombiner::foldFMulConst(Instruction *FMulOrDiv, ConstantFP *C,
       if (isNormalFp(F)) {
         R = BinaryOperator::CreateFMul(Opnd0, F);
       } else {
-        // (X / C1) * C => X / (C1/C) 
+        // (X / C1) * C => X / (C1/C)
         Constant *F = ConstantExpr::getFDiv(C1, C);
         if (isNormalFp(cast<ConstantFP>(F)))
           R = BinaryOperator::CreateFDiv(Opnd0, F);
@@ -415,13 +415,13 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
         if (C0) {
           std::swap(C0, C1);
           std::swap(Opnd0, Opnd1);
-          Swap = true; 
+          Swap = true;
         }
 
         if (C1 && C1->getValueAPF().isNormal() &&
             isFMulOrFDivWithConstant(Opnd0)) {
           Value *M1 = ConstantExpr::getFMul(C1, C);
-          Value *M0 = isNormalFp(cast<ConstantFP>(M1)) ? 
+          Value *M0 = isNormalFp(cast<ConstantFP>(M1)) ?
                       foldFMulConst(cast<Instruction>(Opnd0), C, &I) :
                       0;
           if (M0 && M1) {
@@ -495,7 +495,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
     }
 
     // (X*Y) * X => (X*X) * Y where Y != X
-    //  The purpose is two-fold: 
+    //  The purpose is two-fold:
     //   1) to form a power expression (of X).
     //   2) potentially shorten the critical path: After transformation, the
     //  latency of the instruction Y is amortized by the expression of X*X,
@@ -537,7 +537,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
 /// instruction.
 bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) {
   SelectInst *SI = cast<SelectInst>(I.getOperand(1));
-  
+
   // div/rem X, (Cond ? 0 : Y) -> div/rem X, Y
   int NonNullOperand = -1;
   if (Constant *ST = dyn_cast<Constant>(SI->getOperand(1)))
@@ -547,36 +547,36 @@ bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) {
   if (Constant *ST = dyn_cast<Constant>(SI->getOperand(2)))
     if (ST->isNullValue())
       NonNullOperand = 1;
-  
+
   if (NonNullOperand == -1)
     return false;
-  
+
   Value *SelectCond = SI->getOperand(0);
-  
+
   // Change the div/rem to use 'Y' instead of the select.
   I.setOperand(1, SI->getOperand(NonNullOperand));
-  
+
   // Okay, we know we replace the operand of the div/rem with 'Y' with no
   // problem.  However, the select, or the condition of the select may have
   // multiple uses.  Based on our knowledge that the operand must be non-zero,
   // propagate the known value for the select into other uses of it, and
   // propagate a known value of the condition into its other users.
-  
+
   // If the select and condition only have a single use, don't bother with this,
   // early exit.
   if (SI->use_empty() && SelectCond->hasOneUse())
     return true;
-  
+
   // Scan the current block backward, looking for other uses of SI.
   BasicBlock::iterator BBI = &I, BBFront = I.getParent()->begin();
-  
+
   while (BBI != BBFront) {
     --BBI;
     // If we found a call to a function, we can't assume it will return, so
     // information from below it cannot be propagated above it.
     if (isa<CallInst>(BBI) && !isa<IntrinsicInst>(BBI))
       break;
-    
+
     // Replace uses of the select or its condition with the known values.
     for (Instruction::op_iterator I = BBI->op_begin(), E = BBI->op_end();
          I != E; ++I) {
@@ -589,17 +589,17 @@ bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) {
         Worklist.Add(BBI);
       }
     }
-    
+
     // If we past the instruction, quit looking for it.
     if (&*BBI == SI)
       SI = 0;
     if (&*BBI == SelectCond)
       SelectCond = 0;
-    
+
     // If we ran out of things to eliminate, break out of the loop.
     if (SelectCond == 0 && SI == 0)
       break;
-    
+
   }
   return true;
 }
@@ -617,7 +617,7 @@ Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) {
     I.setOperand(1, V);
     return &I;
   }
-  
+
   // Handle cases involving: [su]div X, (select Cond, Y, Z)
   // This does not apply for fdiv.
   if (isa<SelectInst>(Op1) && SimplifyDivRemOfSelect(I))
@@ -683,16 +683,16 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
   // Handle the integer div common cases
   if (Instruction *Common = commonIDivTransforms(I))
     return Common;
-  
-  { 
+
+  {
     // X udiv 2^C -> X >> C
     // Check to see if this is an unsigned division with an exact power of 2,
     // if so, convert to a right shift.
     const APInt *C;
     if (match(Op1, m_Power2(C))) {
       BinaryOperator *LShr =
-      BinaryOperator::CreateLShr(Op0, 
-                                 ConstantInt::get(Op0->getType(), 
+      BinaryOperator::CreateLShr(Op0,
+                                 ConstantInt::get(Op0->getType(),
                                                   C->logBase2()));
       if (I.isExact()) LShr->setIsExact();
       return LShr;
@@ -732,7 +732,7 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
       return BinaryOperator::CreateLShr(Op0, N);
     }
   }
-  
+
   // udiv X, (Select Cond, C1, C2) --> Select Cond, (shr X, C1), (shr X, C2)
   // where C1&C2 are powers of two.
   { Value *Cond; const APInt *C1, *C2;
@@ -740,11 +740,11 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
       // Construct the "on true" case of the select
       Value *TSI = Builder->CreateLShr(Op0, C1->logBase2(), Op1->getName()+".t",
                                        I.isExact());
-  
+
       // Construct the "on false" case of the select
       Value *FSI = Builder->CreateLShr(Op0, C2->logBase2(), Op1->getName()+".f",
                                        I.isExact());
-      
+
       // construct the select instruction and return it.
       return SelectInst::Create(Cond, TSI, FSI);
     }
@@ -799,7 +799,7 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) {
         // X sdiv Y -> X udiv Y, iff X and Y don't have sign bit set
         return BinaryOperator::CreateUDiv(Op0, Op1, I.getName());
       }
-      
+
       if (match(Op1, m_Shl(m_Power2(), m_Value()))) {
         // X sdiv (1 << Y) -> X udiv (1 << Y) ( -> X u>> Y)
         // Safe because the only negative value (1 << Y) can take on is
@@ -809,13 +809,13 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) {
       }
     }
   }
-  
+
   return 0;
 }
 
 /// CvtFDivConstToReciprocal tries to convert X/C into X*1/C if C not a special
 /// FP value and:
-///    1) 1/C is exact, or 
+///    1) 1/C is exact, or
 ///    2) reciprocal is allowed.
 /// If the convertion was successful, the simplified expression "X * 1/C" is
 /// returned; otherwise, NULL is returned.
@@ -826,7 +826,7 @@ static Instruction *CvtFDivConstToReciprocal(Value *Dividend,
   const APFloat &FpVal = Divisor->getValueAPF();
   APFloat Reciprocal(FpVal.getSemantics());
   bool Cvt = FpVal.getExactInverse(&Reciprocal);
-    
+
   if (!Cvt && AllowReciprocal && FpVal.isNormal()) {
     Reciprocal = APFloat(FpVal.getSemantics(), 1.0f);
     (void)Reciprocal.divide(FpVal, APFloat::rmNearestTiesToEven);
@@ -870,10 +870,10 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
         Constant *C = ConstantExpr::getFMul(C1, C2);
         const APFloat &F = cast<ConstantFP>(C)->getValueAPF();
         if (F.isNormal() && !F.isDenormal()) {
-          Res = CvtFDivConstToReciprocal(X, cast<ConstantFP>(C), 
+          Res = CvtFDivConstToReciprocal(X, cast<ConstantFP>(C),
                                          AllowReciprocal);
           if (!Res)
-            Res = BinaryOperator::CreateFDiv(X, C); 
+            Res = BinaryOperator::CreateFDiv(X, C);
         }
       }
 
@@ -911,7 +911,7 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
     if (Fold) {
       const APFloat &FoldC = cast<ConstantFP>(Fold)->getValueAPF();
       if (FoldC.isNormal() && !FoldC.isDenormal()) {
-        Instruction *R = CreateDiv ? 
+        Instruction *R = CreateDiv ?
                          BinaryOperator::CreateFDiv(Fold, X) :
                          BinaryOperator::CreateFMul(X, Fold);
         R->setFastMathFlags(I.getFastMathFlags());
@@ -997,7 +997,7 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) {
 
   if (Instruction *common = commonIRemTransforms(I))
     return common;
-  
+
   // X urem C^2 -> X and C-1
   { const APInt *C;
     if (match(Op1, m_Power2(C)))
@@ -1005,7 +1005,7 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) {
                                        ConstantInt::get(I.getType(), *C-1));
   }
 
-  // Turn A % (C << N), where C is 2^k, into A & ((C << N)-1)  
+  // Turn A % (C << N), where C is 2^k, into A & ((C << N)-1)
   if (match(Op1, m_Shl(m_Power2(), m_Value()))) {
     Constant *N1 = Constant::getAllOnesValue(I.getType());
     Value *Add = Builder->CreateAdd(Op1, N1);
@@ -1041,7 +1041,7 @@ Instruction *InstCombiner::visitSRem(BinaryOperator &I) {
   // Handle the integer rem common cases
   if (Instruction *Common = commonIRemTransforms(I))
     return Common;
-  
+
   if (Value *RHSNeg = dyn_castNegVal(Op1))
     if (!isa<Constant>(RHSNeg) ||
         (isa<ConstantInt>(RHSNeg) &&
diff --git a/lib/Transforms/InstCombine/InstCombinePHI.cpp b/lib/Transforms/InstCombine/InstCombinePHI.cpp
index b0a998c..bd14e81 100644
--- a/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -27,10 +27,10 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) {
   unsigned Opc = FirstInst->getOpcode();
   Value *LHSVal = FirstInst->getOperand(0);
   Value *RHSVal = FirstInst->getOperand(1);
-    
+
   Type *LHSType = LHSVal->getType();
   Type *RHSType = RHSVal->getType();
-  
+
   bool isNUW = false, isNSW = false, isExact = false;
   if (OverflowingBinaryOperator *BO =
         dyn_cast<OverflowingBinaryOperator>(FirstInst)) {
@@ -39,7 +39,7 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) {
   } else if (PossiblyExactOperator *PEO =
                dyn_cast<PossiblyExactOperator>(FirstInst))
     isExact = PEO->isExact();
-  
+
   // Scan to see if all operands are the same opcode, and all have one use.
   for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) {
     Instruction *I = dyn_cast<Instruction>(PN.getIncomingValue(i));
@@ -54,14 +54,14 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) {
     if (CmpInst *CI = dyn_cast<CmpInst>(I))
       if (CI->getPredicate() != cast<CmpInst>(FirstInst)->getPredicate())
         return 0;
-    
+
     if (isNUW)
       isNUW = cast<OverflowingBinaryOperator>(I)->hasNoUnsignedWrap();
     if (isNSW)
       isNSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
     if (isExact)
       isExact = cast<PossiblyExactOperator>(I)->isExact();
-    
+
     // Keep track of which operand needs a phi node.
     if (I->getOperand(0) != LHSVal) LHSVal = 0;
     if (I->getOperand(1) != RHSVal) RHSVal = 0;
@@ -73,9 +73,9 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) {
   // bad when the PHIs are in the header of a loop.
   if (!LHSVal && !RHSVal)
     return 0;
-  
+
   // Otherwise, this is safe to transform!
-  
+
   Value *InLHS = FirstInst->getOperand(0);
   Value *InRHS = FirstInst->getOperand(1);
   PHINode *NewLHS = 0, *NewRHS = 0;
@@ -86,7 +86,7 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) {
     InsertNewInstBefore(NewLHS, PN);
     LHSVal = NewLHS;
   }
-  
+
   if (RHSVal == 0) {
     NewRHS = PHINode::Create(RHSType, PN.getNumIncomingValues(),
                              FirstInst->getOperand(1)->getName() + ".pn");
@@ -94,7 +94,7 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) {
     InsertNewInstBefore(NewRHS, PN);
     RHSVal = NewRHS;
   }
-  
+
   // Add all operands to the new PHIs.
   if (NewLHS || NewRHS) {
     for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
@@ -109,7 +109,7 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) {
       }
     }
   }
-    
+
   if (CmpInst *CIOp = dyn_cast<CmpInst>(FirstInst)) {
     CmpInst *NewCI = CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(),
                                      LHSVal, RHSVal);
@@ -129,8 +129,8 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) {
 
 Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) {
   GetElementPtrInst *FirstInst =cast<GetElementPtrInst>(PN.getIncomingValue(0));
-  
-  SmallVector<Value*, 16> FixedOperands(FirstInst->op_begin(), 
+
+  SmallVector<Value*, 16> FixedOperands(FirstInst->op_begin(),
                                         FirstInst->op_end());
   // This is true if all GEP bases are allocas and if all indices into them are
   // constants.
@@ -140,9 +140,9 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) {
   // more than one phi, which leads to higher register pressure. This is
   // especially bad when the PHIs are in the header of a loop.
   bool NeededPhi = false;
-  
+
   bool AllInBounds = true;
-  
+
   // Scan to see if all operands are the same opcode, and all have one use.
   for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) {
     GetElementPtrInst *GEP= dyn_cast<GetElementPtrInst>(PN.getIncomingValue(i));
@@ -151,18 +151,18 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) {
       return 0;
 
     AllInBounds &= GEP->isInBounds();
-    
+
     // Keep track of whether or not all GEPs are of alloca pointers.
     if (AllBasePointersAreAllocas &&
         (!isa<AllocaInst>(GEP->getOperand(0)) ||
          !GEP->hasAllConstantIndices()))
       AllBasePointersAreAllocas = false;
-    
+
     // Compare the operand lists.
     for (unsigned op = 0, e = FirstInst->getNumOperands(); op != e; ++op) {
       if (FirstInst->getOperand(op) == GEP->getOperand(op))
         continue;
-      
+
       // Don't merge two GEPs when two operands differ (introducing phi nodes)
       // if one of the PHIs has a constant for the index.  The index may be
       // substantially cheaper to compute for the constants, so making it a
@@ -171,7 +171,7 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) {
       if (isa<ConstantInt>(FirstInst->getOperand(op)) ||
           isa<ConstantInt>(GEP->getOperand(op)))
         return 0;
-      
+
       if (FirstInst->getOperand(op)->getType() !=GEP->getOperand(op)->getType())
         return 0;
 
@@ -186,7 +186,7 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) {
       NeededPhi = true;
     }
   }
-  
+
   // If all of the base pointers of the PHI'd GEPs are from allocas, don't
   // bother doing this transformation.  At best, this will just save a bit of
   // offset calculation, but all the predecessors will have to materialize the
@@ -195,11 +195,11 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) {
   // which can usually all be folded into the load.
   if (AllBasePointersAreAllocas)
     return 0;
-  
+
   // Otherwise, this is safe to transform.  Insert PHI nodes for each operand
   // that is variable.
   SmallVector<PHINode*, 16> OperandPhis(FixedOperands.size());
-  
+
   bool HasAnyPHIs = false;
   for (unsigned i = 0, e = FixedOperands.size(); i != e; ++i) {
     if (FixedOperands[i]) continue;  // operand doesn't need a phi.
@@ -207,28 +207,28 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) {
     PHINode *NewPN = PHINode::Create(FirstOp->getType(), e,
                                      FirstOp->getName()+".pn");
     InsertNewInstBefore(NewPN, PN);
-    
+
     NewPN->addIncoming(FirstOp, PN.getIncomingBlock(0));
     OperandPhis[i] = NewPN;
     FixedOperands[i] = NewPN;
     HasAnyPHIs = true;
   }
 
-  
+
   // Add all operands to the new PHIs.
   if (HasAnyPHIs) {
     for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
       GetElementPtrInst *InGEP =cast<GetElementPtrInst>(PN.getIncomingValue(i));
       BasicBlock *InBB = PN.getIncomingBlock(i);
-      
+
       for (unsigned op = 0, e = OperandPhis.size(); op != e; ++op)
         if (PHINode *OpPhi = OperandPhis[op])
           OpPhi->addIncoming(InGEP->getOperand(op), InBB);
     }
   }
-  
+
   Value *Base = FixedOperands[0];
-  GetElementPtrInst *NewGEP = 
+  GetElementPtrInst *NewGEP =
     GetElementPtrInst::Create(Base, makeArrayRef(FixedOperands).slice(1));
   if (AllInBounds) NewGEP->setIsInBounds();
   NewGEP->setDebugLoc(FirstInst->getDebugLoc());
@@ -246,11 +246,11 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) {
 /// to a register.
 static bool isSafeAndProfitableToSinkLoad(LoadInst *L) {
   BasicBlock::iterator BBI = L, E = L->getParent()->end();
-  
+
   for (++BBI; BBI != E; ++BBI)
     if (BBI->mayWriteToMemory())
       return false;
-  
+
   // Check for non-address taken alloca.  If not address-taken already, it isn't
   // profitable to do this xform.
   if (AllocaInst *AI = dyn_cast<AllocaInst>(L->getOperand(0))) {
@@ -266,11 +266,11 @@ static bool isSafeAndProfitableToSinkLoad(LoadInst *L) {
       isAddressTaken = true;
       break;
     }
-    
+
     if (!isAddressTaken && AI->isStaticAlloca())
       return false;
   }
-  
+
   // If this load is a load from a GEP with a constant offset from an alloca,
   // then we don't want to sink it.  In its present form, it will be
   // load [constant stack offset].  Sinking it will cause us to have to
@@ -280,7 +280,7 @@ static bool isSafeAndProfitableToSinkLoad(LoadInst *L) {
     if (AllocaInst *AI = dyn_cast<AllocaInst>(GEP->getOperand(0)))
       if (AI->isStaticAlloca() && GEP->hasAllConstantIndices())
         return false;
-  
+
   return true;
 }
 
@@ -300,41 +300,41 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) {
   bool isVolatile = FirstLI->isVolatile();
   unsigned LoadAlignment = FirstLI->getAlignment();
   unsigned LoadAddrSpace = FirstLI->getPointerAddressSpace();
-  
+
   // We can't sink the load if the loaded value could be modified between the
   // load and the PHI.
   if (FirstLI->getParent() != PN.getIncomingBlock(0) ||
       !isSafeAndProfitableToSinkLoad(FirstLI))
     return 0;
-  
+
   // If the PHI is of volatile loads and the load block has multiple
   // successors, sinking it would remove a load of the volatile value from
   // the path through the other successor.
-  if (isVolatile && 
+  if (isVolatile &&
       FirstLI->getParent()->getTerminator()->getNumSuccessors() != 1)
     return 0;
-  
+
   // Check to see if all arguments are the same operation.
   for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
     LoadInst *LI = dyn_cast<LoadInst>(PN.getIncomingValue(i));
     if (!LI || !LI->hasOneUse())
       return 0;
-    
-    // We can't sink the load if the loaded value could be modified between 
+
+    // We can't sink the load if the loaded value could be modified between
     // the load and the PHI.
     if (LI->isVolatile() != isVolatile ||
         LI->getParent() != PN.getIncomingBlock(i) ||
         LI->getPointerAddressSpace() != LoadAddrSpace ||
         !isSafeAndProfitableToSinkLoad(LI))
       return 0;
-      
+
     // If some of the loads have an alignment specified but not all of them,
     // we can't do the transformation.
     if ((LoadAlignment != 0) != (LI->getAlignment() != 0))
       return 0;
-    
+
     LoadAlignment = std::min(LoadAlignment, LI->getAlignment());
-    
+
     // If the PHI is of volatile loads and the load block has multiple
     // successors, sinking it would remove a load of the volatile value from
     // the path through the other successor.
@@ -342,16 +342,16 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) {
         LI->getParent()->getTerminator()->getNumSuccessors() != 1)
       return 0;
   }
-  
+
   // Okay, they are all the same operation.  Create a new PHI node of the
   // correct type, and PHI together all of the LHS's of the instructions.
   PHINode *NewPN = PHINode::Create(FirstLI->getOperand(0)->getType(),
                                    PN.getNumIncomingValues(),
                                    PN.getName()+".in");
-  
+
   Value *InVal = FirstLI->getOperand(0);
   NewPN->addIncoming(InVal, PN.getIncomingBlock(0));
-  
+
   // Add all operands to the new PHI.
   for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
     Value *NewInVal = cast<LoadInst>(PN.getIncomingValue(i))->getOperand(0);
@@ -359,7 +359,7 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) {
       InVal = 0;
     NewPN->addIncoming(NewInVal, PN.getIncomingBlock(i));
   }
-  
+
   Value *PhiVal;
   if (InVal) {
     // The new PHI unions all of the same values together.  This is really
@@ -370,14 +370,14 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) {
     InsertNewInstBefore(NewPN, PN);
     PhiVal = NewPN;
   }
-  
+
   // If this was a volatile load that we are merging, make sure to loop through
   // and mark all the input loads as non-volatile.  If we don't do this, we will
   // insert a new volatile load and the old ones will not be deletable.
   if (isVolatile)
     for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
       cast<LoadInst>(PN.getIncomingValue(i))->setVolatile(false);
-  
+
   LoadInst *NewLI = new LoadInst(PhiVal, "", isVolatile, LoadAlignment);
   NewLI->setDebugLoc(FirstLI->getDebugLoc());
   return NewLI;
@@ -395,7 +395,7 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
     return FoldPHIArgGEPIntoPHI(PN);
   if (isa<LoadInst>(FirstInst))
     return FoldPHIArgLoadIntoPHI(PN);
-  
+
   // Scan the instruction, looking for input operations that can be folded away.
   // If all input operands to the phi are the same instruction (e.g. a cast from
   // the same type or "+42") we can pull the operation through the PHI, reducing
@@ -403,7 +403,7 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
   Constant *ConstantOp = 0;
   Type *CastSrcTy = 0;
   bool isNUW = false, isNSW = false, isExact = false;
-  
+
   if (isa<CastInst>(FirstInst)) {
     CastSrcTy = FirstInst->getOperand(0)->getType();
 
@@ -414,12 +414,12 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
         return 0;
     }
   } else if (isa<BinaryOperator>(FirstInst) || isa<CmpInst>(FirstInst)) {
-    // Can fold binop, compare or shift here if the RHS is a constant, 
+    // Can fold binop, compare or shift here if the RHS is a constant,
     // otherwise call FoldPHIArgBinOpIntoPHI.
     ConstantOp = dyn_cast<Constant>(FirstInst->getOperand(1));
     if (ConstantOp == 0)
       return FoldPHIArgBinOpIntoPHI(PN);
-    
+
     if (OverflowingBinaryOperator *BO =
         dyn_cast<OverflowingBinaryOperator>(FirstInst)) {
       isNUW = BO->hasNoUnsignedWrap();
@@ -442,7 +442,7 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
     } else if (I->getOperand(1) != ConstantOp) {
       return 0;
     }
-    
+
     if (isNUW)
       isNUW = cast<OverflowingBinaryOperator>(I)->hasNoUnsignedWrap();
     if (isNSW)
@@ -486,7 +486,7 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
     NewCI->setDebugLoc(FirstInst->getDebugLoc());
     return NewCI;
   }
-  
+
   if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(FirstInst)) {
     BinOp = BinaryOperator::Create(BinOp->getOpcode(), PhiVal, ConstantOp);
     if (isNUW) BinOp->setHasNoUnsignedWrap();
@@ -495,7 +495,7 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
     BinOp->setDebugLoc(FirstInst->getDebugLoc());
     return BinOp;
   }
-  
+
   CmpInst *CIOp = cast<CmpInst>(FirstInst);
   CmpInst *NewCI = CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(),
                                    PhiVal, ConstantOp);
@@ -513,7 +513,7 @@ static bool DeadPHICycle(PHINode *PN,
   // Remember this node, and if we find the cycle, return.
   if (!PotentiallyDeadPHIs.insert(PN))
     return true;
-  
+
   // Don't scan crazily complex things.
   if (PotentiallyDeadPHIs.size() == 16)
     return false;
@@ -527,16 +527,16 @@ static bool DeadPHICycle(PHINode *PN,
 /// PHIsEqualValue - Return true if this phi node is always equal to
 /// NonPhiInVal.  This happens with mutually cyclic phi nodes like:
 ///   z = some value; x = phi (y, z); y = phi (x, z)
-static bool PHIsEqualValue(PHINode *PN, Value *NonPhiInVal, 
+static bool PHIsEqualValue(PHINode *PN, Value *NonPhiInVal,
                            SmallPtrSet<PHINode*, 16> &ValueEqualPHIs) {
   // See if we already saw this PHI node.
   if (!ValueEqualPHIs.insert(PN))
     return true;
-  
+
   // Don't scan crazily complex things.
   if (ValueEqualPHIs.size() == 16)
     return false;
- 
+
   // Scan the operands to see if they are either phi nodes or are equal to
   // the value.
   for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
@@ -547,7 +547,7 @@ static bool PHIsEqualValue(PHINode *PN, Value *NonPhiInVal,
     } else if (Op != NonPhiInVal)
       return false;
   }
-  
+
   return true;
 }
 
@@ -557,10 +557,10 @@ struct PHIUsageRecord {
   unsigned PHIId;     // The ID # of the PHI (something determinstic to sort on)
   unsigned Shift;     // The amount shifted.
   Instruction *Inst;  // The trunc instruction.
-  
+
   PHIUsageRecord(unsigned pn, unsigned Sh, Instruction *User)
     : PHIId(pn), Shift(Sh), Inst(User) {}
-  
+
   bool operator<(const PHIUsageRecord &RHS) const {
     if (PHIId < RHS.PHIId) return true;
     if (PHIId > RHS.PHIId) return false;
@@ -570,15 +570,15 @@ struct PHIUsageRecord {
            RHS.Inst->getType()->getPrimitiveSizeInBits();
   }
 };
-  
+
 struct LoweredPHIRecord {
   PHINode *PN;        // The PHI that was lowered.
   unsigned Shift;     // The amount shifted.
   unsigned Width;     // The width extracted.
-  
+
   LoweredPHIRecord(PHINode *pn, unsigned Sh, Type *Ty)
     : PN(pn), Shift(Sh), Width(Ty->getPrimitiveSizeInBits()) {}
-  
+
   // Ctor form used by DenseMap.
   LoweredPHIRecord(PHINode *pn, unsigned Sh)
     : PN(pn), Shift(Sh), Width(0) {}
@@ -621,20 +621,20 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
   // PHIUsers - Keep track of all of the truncated values extracted from a set
   // of PHIs, along with their offset.  These are the things we want to rewrite.
   SmallVector<PHIUsageRecord, 16> PHIUsers;
-  
+
   // PHIs are often mutually cyclic, so we keep track of a whole set of PHI
   // nodes which are extracted from. PHIsToSlice is a set we use to avoid
   // revisiting PHIs, PHIsInspected is a ordered list of PHIs that we need to
   // check the uses of (to ensure they are all extracts).
   SmallVector<PHINode*, 8> PHIsToSlice;
   SmallPtrSet<PHINode*, 8> PHIsInspected;
-  
+
   PHIsToSlice.push_back(&FirstPhi);
   PHIsInspected.insert(&FirstPhi);
-  
+
   for (unsigned PHIId = 0; PHIId != PHIsToSlice.size(); ++PHIId) {
     PHINode *PN = PHIsToSlice[PHIId];
-    
+
     // Scan the input list of the PHI.  If any input is an invoke, and if the
     // input is defined in the predecessor, then we won't be split the critical
     // edge which is required to insert a truncate.  Because of this, we have to
@@ -644,85 +644,85 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
       if (II == 0) continue;
       if (II->getParent() != PN->getIncomingBlock(i))
         continue;
-     
+
       // If we have a phi, and if it's directly in the predecessor, then we have
       // a critical edge where we need to put the truncate.  Since we can't
       // split the edge in instcombine, we have to bail out.
       return 0;
     }
-      
-    
+
+
     for (Value::use_iterator UI = PN->use_begin(), E = PN->use_end();
          UI != E; ++UI) {
       Instruction *User = cast<Instruction>(*UI);
-      
+
       // If the user is a PHI, inspect its uses recursively.
       if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
         if (PHIsInspected.insert(UserPN))
           PHIsToSlice.push_back(UserPN);
         continue;
       }
-      
+
       // Truncates are always ok.
       if (isa<TruncInst>(User)) {
         PHIUsers.push_back(PHIUsageRecord(PHIId, 0, User));
         continue;
       }
-      
+
       // Otherwise it must be a lshr which can only be used by one trunc.
       if (User->getOpcode() != Instruction::LShr ||
           !User->hasOneUse() || !isa<TruncInst>(User->use_back()) ||
           !isa<ConstantInt>(User->getOperand(1)))
         return 0;
-      
+
       unsigned Shift = cast<ConstantInt>(User->getOperand(1))->getZExtValue();
       PHIUsers.push_back(PHIUsageRecord(PHIId, Shift, User->use_back()));
     }
   }
-  
+
   // If we have no users, they must be all self uses, just nuke the PHI.
   if (PHIUsers.empty())
     return ReplaceInstUsesWith(FirstPhi, UndefValue::get(FirstPhi.getType()));
-  
+
   // If this phi node is transformable, create new PHIs for all the pieces
   // extracted out of it.  First, sort the users by their offset and size.
   array_pod_sort(PHIUsers.begin(), PHIUsers.end());
-  
+
   DEBUG(errs() << "SLICING UP PHI: " << FirstPhi << '\n';
             for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i)
               errs() << "AND USER PHI #" << i << ": " << *PHIsToSlice[i] <<'\n';
         );
-  
+
   // PredValues - This is a temporary used when rewriting PHI nodes.  It is
   // hoisted out here to avoid construction/destruction thrashing.
   DenseMap<BasicBlock*, Value*> PredValues;
-  
+
   // ExtractedVals - Each new PHI we introduce is saved here so we don't
   // introduce redundant PHIs.
   DenseMap<LoweredPHIRecord, PHINode*> ExtractedVals;
-  
+
   for (unsigned UserI = 0, UserE = PHIUsers.size(); UserI != UserE; ++UserI) {
     unsigned PHIId = PHIUsers[UserI].PHIId;
     PHINode *PN = PHIsToSlice[PHIId];
     unsigned Offset = PHIUsers[UserI].Shift;
     Type *Ty = PHIUsers[UserI].Inst->getType();
-    
+
     PHINode *EltPHI;
-    
+
     // If we've already lowered a user like this, reuse the previously lowered
     // value.
     if ((EltPHI = ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)]) == 0) {
-      
+
       // Otherwise, Create the new PHI node for this user.
       EltPHI = PHINode::Create(Ty, PN->getNumIncomingValues(),
                                PN->getName()+".off"+Twine(Offset), PN);
       assert(EltPHI->getType() != PN->getType() &&
              "Truncate didn't shrink phi?");
-    
+
       for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
         BasicBlock *Pred = PN->getIncomingBlock(i);
         Value *&PredVal = PredValues[Pred];
-        
+
         // If we already have a value for this predecessor, reuse it.
         if (PredVal) {
           EltPHI->addIncoming(PredVal, Pred);
@@ -736,7 +736,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
           EltPHI->addIncoming(PredVal, Pred);
           continue;
         }
-        
+
         if (PHINode *InPHI = dyn_cast<PHINode>(PN)) {
           // If the incoming value was a PHI, and if it was one of the PHIs we
           // already rewrote it, just use the lowered value.
@@ -746,7 +746,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
             continue;
           }
         }
-        
+
         // Otherwise, do an extract in the predecessor.
         Builder->SetInsertPoint(Pred, Pred->getTerminator());
         Value *Res = InVal;
@@ -756,7 +756,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
         Res = Builder->CreateTrunc(Res, Ty, "extract.t");
         PredVal = Res;
         EltPHI->addIncoming(Res, Pred);
-        
+
         // If the incoming value was a PHI, and if it was one of the PHIs we are
         // rewriting, we will ultimately delete the code we inserted.  This
         // means we need to revisit that PHI to make sure we extract out the
@@ -765,22 +765,22 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
           if (PHIsInspected.count(OldInVal)) {
             unsigned RefPHIId = std::find(PHIsToSlice.begin(),PHIsToSlice.end(),
                                           OldInVal)-PHIsToSlice.begin();
-            PHIUsers.push_back(PHIUsageRecord(RefPHIId, Offset, 
+            PHIUsers.push_back(PHIUsageRecord(RefPHIId, Offset,
                                               cast<Instruction>(Res)));
             ++UserE;
           }
       }
       PredValues.clear();
-      
+
       DEBUG(errs() << "  Made element PHI for offset " << Offset << ": "
                    << *EltPHI << '\n');
       ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)] = EltPHI;
     }
-    
+
     // Replace the use of this piece with the PHI node.
     ReplaceInstUsesWith(*PHIUsers[UserI].Inst, EltPHI);
   }
-  
+
   // Replace all the remaining uses of the PHI nodes (self uses and the lshrs)
   // with undefs.
   Value *Undef = UndefValue::get(FirstPhi.getType());
@@ -818,7 +818,7 @@ Instruction *InstCombiner::visitPHINode(PHINode &PN) {
       if (DeadPHICycle(PU, PotentiallyDeadPHIs))
         return ReplaceInstUsesWith(PN, UndefValue::get(PN.getType()));
     }
-   
+
     // If this phi has a single use, and if that use just computes a value for
     // the next iteration of a loop, delete the phi.  This occurs with unused
     // induction variables, e.g. "for (int j = 0; ; ++j);".  Detecting this
@@ -847,7 +847,7 @@ Instruction *InstCombiner::visitPHINode(PHINode &PN) {
 
     if (InValNo != NumIncomingVals) {
       Value *NonPhiInVal = PN.getIncomingValue(InValNo);
-      
+
       // Scan the rest of the operands to see if there are any conflicts, if so
       // there is no need to recursively scan other phis.
       for (++InValNo; InValNo != NumIncomingVals; ++InValNo) {
@@ -855,7 +855,7 @@ Instruction *InstCombiner::visitPHINode(PHINode &PN) {
         if (OpVal != NonPhiInVal && !isa<PHINode>(OpVal))
           break;
       }
-      
+
       // If we scanned over all operands, then we have one unique value plus
       // phi values.  Scan PHI nodes to see if they all merge in each other or
       // the value.
@@ -899,6 +899,6 @@ Instruction *InstCombiner::visitPHINode(PHINode &PN) {
       !TD->isLegalInteger(PN.getType()->getPrimitiveSizeInBits()))
     if (Instruction *Res = SliceUpIllegalIntegerPHI(PN))
       return Res;
-  
+
   return 0;
 }
diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp
index a262d71..2defe63 100644
--- a/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -127,13 +127,14 @@ Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI,
     // If this is a non-volatile load or a cast from the same type,
     // merge.
     if (TI->isCast()) {
-      if (TI->getOperand(0)->getType() != FI->getOperand(0)->getType())
+      Type *FIOpndTy = FI->getOperand(0)->getType();
+      if (TI->getOperand(0)->getType() != FIOpndTy)
         return 0;
       // The select condition may be a vector. We may only change the operand
       // type if the vector width remains the same (and matches the condition).
       Type *CondTy = SI.getCondition()->getType();
-      if (CondTy->isVectorTy() && CondTy->getVectorNumElements() !=
-          FI->getOperand(0)->getType()->getVectorNumElements())
+      if (CondTy->isVectorTy() && (!FIOpndTy->isVectorTy() ||
+          CondTy->getVectorNumElements() != FIOpndTy->getVectorNumElements()))
         return 0;
     } else {
       return 0;  // unknown unary op.
@@ -349,6 +350,68 @@ static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
   return 0;
 }
 
+/// foldSelectICmpAndOr - We want to turn:
+///   (select (icmp eq (and X, C1), 0), Y, (or Y, C2))
+/// into:
+///   (or (shl (and X, C1), C3), y)
+/// iff:
+///   C1 and C2 are both powers of 2
+/// where:
+///   C3 = Log(C2) - Log(C1)
+///
+/// This transform handles cases where:
+/// 1. The icmp predicate is inverted
+/// 2. The select operands are reversed
+/// 3. The magnitude of C2 and C1 are flipped
+static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal,
+                                  Value *FalseVal,
+                                  InstCombiner::BuilderTy *Builder) {
+  const ICmpInst *IC = dyn_cast<ICmpInst>(SI.getCondition());
+  if (!IC || !IC->isEquality())
+    return 0;
+
+  Value *CmpLHS = IC->getOperand(0);
+  Value *CmpRHS = IC->getOperand(1);
+
+  if (!match(CmpRHS, m_Zero()))
+    return 0;
+
+  Value *X;
+  const APInt *C1;
+  if (!match(CmpLHS, m_And(m_Value(X), m_Power2(C1))))
+    return 0;
+
+  const APInt *C2;
+  bool OrOnTrueVal = false;
+  bool OrOnFalseVal = match(FalseVal, m_Or(m_Specific(TrueVal), m_Power2(C2)));
+  if (!OrOnFalseVal)
+    OrOnTrueVal = match(TrueVal, m_Or(m_Specific(FalseVal), m_Power2(C2)));
+
+  if (!OrOnFalseVal && !OrOnTrueVal)
+    return 0;
+
+  Value *V = CmpLHS;
+  Value *Y = OrOnFalseVal ? TrueVal : FalseVal;
+
+  unsigned C1Log = C1->logBase2();
+  unsigned C2Log = C2->logBase2();
+  if (C2Log > C1Log) {
+    V = Builder->CreateZExtOrTrunc(V, Y->getType());
+    V = Builder->CreateShl(V, C2Log - C1Log);
+  } else if (C1Log > C2Log) {
+    V = Builder->CreateLShr(V, C1Log - C2Log);
+    V = Builder->CreateZExtOrTrunc(V, Y->getType());
+  } else
+    V = Builder->CreateZExtOrTrunc(V, Y->getType());
+
+  ICmpInst::Predicate Pred = IC->getPredicate();
+  if ((Pred == ICmpInst::ICMP_NE && OrOnFalseVal) ||
+      (Pred == ICmpInst::ICMP_EQ && OrOnTrueVal))
+    V = Builder->CreateXor(V, *C2);
+
+  return Builder->CreateOr(V, Y);
+}
+
 /// visitSelectInstWithICmp - Visit a SelectInst that has an
 /// ICmpInst as its first operand.
 ///
@@ -520,6 +583,9 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI,
     }
   }
 
+  if (Value *V = foldSelectICmpAndOr(SI, TrueVal, FalseVal, Builder))
+    return ReplaceInstUsesWith(SI, V);
+
   return Changed ? &SI : 0;
 }
 
@@ -675,7 +741,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
       // Change: A = select B, false, C --> A = and !B, C
       Value *NotCond = Builder->CreateNot(CondVal, "not."+CondVal->getName());
       return BinaryOperator::CreateAnd(NotCond, FalseVal);
-    } else if (ConstantInt *C = dyn_cast<ConstantInt>(FalseVal)) {
+    }
+    if (ConstantInt *C = dyn_cast<ConstantInt>(FalseVal)) {
       if (C->getZExtValue() == false) {
         // Change: A = select B, C, false --> A = and B, C
         return BinaryOperator::CreateAnd(CondVal, TrueVal);
@@ -689,14 +756,14 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
     // select a, a, b  -> a|b
     if (CondVal == TrueVal)
       return BinaryOperator::CreateOr(CondVal, FalseVal);
-    else if (CondVal == FalseVal)
+    if (CondVal == FalseVal)
       return BinaryOperator::CreateAnd(CondVal, TrueVal);
 
     // select a, ~a, b -> (~a)&b
     // select a, b, ~a -> (~a)|b
     if (match(TrueVal, m_Not(m_Specific(CondVal))))
       return BinaryOperator::CreateAnd(TrueVal, FalseVal);
-    else if (match(FalseVal, m_Not(m_Specific(CondVal))))
+    if (match(FalseVal, m_Not(m_Specific(CondVal))))
       return BinaryOperator::CreateOr(TrueVal, FalseVal);
   }
 
@@ -837,7 +904,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
             Value *NewFalseOp = NegVal;
             if (AddOp != TI)
               std::swap(NewTrueOp, NewFalseOp);
-            Value *NewSel = 
+            Value *NewSel =
               Builder->CreateSelect(CondVal, NewTrueOp,
                                     NewFalseOp, SI.getName() + ".p");
 
@@ -861,7 +928,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
     Value *LHS, *RHS, *LHS2, *RHS2;
     if (SelectPatternFlavor SPF = MatchSelectPattern(&SI, LHS, RHS)) {
       if (SelectPatternFlavor SPF2 = MatchSelectPattern(LHS, LHS2, RHS2))
-        if (Instruction *R = FoldSPFofSPF(cast<Instruction>(LHS),SPF2,LHS2,RHS2, 
+        if (Instruction *R = FoldSPFofSPF(cast<Instruction>(LHS),SPF2,LHS2,RHS2,
                                           SI, SPF, RHS))
           return R;
       if (SelectPatternFlavor SPF2 = MatchSelectPattern(RHS, LHS2, RHS2))
diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 4f71db1..de8a3ac 100644
--- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -105,6 +105,75 @@ static Value *FindScalarElement(Value *V, unsigned EltNo) {
   return 0;
 }
 
+// If we have a PHI node with a vector type that has only 2 uses: feed
+// itself and be an operand of extractelemnt at a constant location,
+// try to replace the PHI of the vector type with a PHI of a scalar type
+Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) {
+  // Verify that the PHI node has exactly 2 uses. Otherwise return NULL.
+  if (!PN->hasNUses(2))
+    return NULL;
+
+  // If so, it's known at this point that one operand is PHI and the other is
+  // an extractelement node. Find the PHI user that is not the extractelement
+  // node.
+  Value::use_iterator iu = PN->use_begin();
+  Instruction *PHIUser = dyn_cast<Instruction>(*iu);
+  if (PHIUser == cast<Instruction>(&EI))
+    PHIUser = cast<Instruction>(*(++iu));
+
+  // Verify that this PHI user has one use, which is the PHI itself,
+  // and that it is a binary operation which is cheap to scalarize.
+  // otherwise return NULL.
+  if (!PHIUser->hasOneUse() || !(PHIUser->use_back() == PN) ||
+    !(isa<BinaryOperator>(PHIUser)) ||
+    !CheapToScalarize(PHIUser, true))
+    return NULL;
+
+  // Create a scalar PHI node that will replace the vector PHI node
+  // just before the current PHI node.
+  PHINode * scalarPHI = cast<PHINode>(
+    InsertNewInstWith(PHINode::Create(EI.getType(),
+    PN->getNumIncomingValues(), ""), *PN));
+  // Scalarize each PHI operand.
+  for (unsigned i=0; i < PN->getNumIncomingValues(); i++) {
+    Value *PHIInVal = PN->getIncomingValue(i);
+    BasicBlock *inBB = PN->getIncomingBlock(i);
+    Value *Elt = EI.getIndexOperand();
+    // If the operand is the PHI induction variable:
+    if (PHIInVal == PHIUser) {
+      // Scalarize the binary operation. Its first operand is the
+      // scalar PHI and the second operand is extracted from the other
+      // vector operand.
+      BinaryOperator *B0 = cast<BinaryOperator>(PHIUser);
+      unsigned opId = (B0->getOperand(0) == PN) ? 1: 0;
+      Value *Op = Builder->CreateExtractElement(
+        B0->getOperand(opId), Elt, B0->getOperand(opId)->getName()+".Elt");
+      Value *newPHIUser = InsertNewInstWith(
+        BinaryOperator::Create(B0->getOpcode(), scalarPHI,Op),
+        *B0);
+      scalarPHI->addIncoming(newPHIUser, inBB);
+    } else {
+      // Scalarize PHI input:
+      Instruction *newEI =
+        ExtractElementInst::Create(PHIInVal, Elt, "");
+      // Insert the new instruction into the predecessor basic block.
+      Instruction *pos = dyn_cast<Instruction>(PHIInVal);
+      BasicBlock::iterator InsertPos;
+      if (pos && !isa<PHINode>(pos)) {
+        InsertPos = pos;
+        ++InsertPos;
+      } else {
+        InsertPos = inBB->getFirstInsertionPt();
+      }
+
+      InsertNewInstWith(newEI, *InsertPos);
+
+      scalarPHI->addIncoming(newEI, inBB);
+    }
+  }
+  return ReplaceInstUsesWith(EI, scalarPHI);
+}
+
 Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
   // If vector val is constant with all elements the same, replace EI with
   // that element.  We handle a known element # below.
@@ -149,6 +218,14 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
           if (Value *Elt = FindScalarElement(BCI->getOperand(0), IndexVal))
             return new BitCastInst(Elt, EI.getType());
     }
+
+    // If there's a vector PHI feeding a scalar use through this extractelement
+    // instruction, try to scalarize the PHI.
+    if (PHINode *PN = dyn_cast<PHINode>(EI.getOperand(0))) {
+    	Instruction *scalarPHI = scalarizePHI(EI, PN);
+    	if (scalarPHI)
+    		return (scalarPHI);
+    }
   }
 
   if (Instruction *I = dyn_cast<Instruction>(EI.getOperand(0))) {
@@ -201,10 +278,10 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
     } else if (CastInst *CI = dyn_cast<CastInst>(I)) {
       // Canonicalize extractelement(cast) -> cast(extractelement)
       // bitcasts can change the number of vector elements and they cost nothing
-      if (CI->hasOneUse() && EI.hasOneUse() &&
-          (CI->getOpcode() != Instruction::BitCast)) {
+      if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)) {
         Value *EE = Builder->CreateExtractElement(CI->getOperand(0),
                                                   EI.getIndexOperand());
+        Worklist.AddValue(EE);
         return CastInst::Create(CI->getOpcode(), EE, EI.getType());
       }
     }
@@ -336,6 +413,10 @@ static Value *CollectShuffleElements(Value *V, SmallVectorImpl<Constant*> &Mask,
 
         if (VecOp == RHS) {
           Value *V = CollectShuffleElements(EI->getOperand(0), Mask, RHS);
+          // Update Mask to reflect that `ScalarOp' has been inserted at
+          // position `InsertedIdx' within the vector returned by IEI.
+          Mask[InsertedIdx % NumElts] = Mask[ExtractedIdx];
+
           // Everything but the extracted element is replaced with the RHS.
           for (unsigned i = 0; i != NumElts; ++i) {
             if (i != InsertedIdx)
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 92b42ee..623c470 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -71,7 +71,7 @@ static const char *kAsanRegisterGlobalsName = "__asan_register_globals";
 static const char *kAsanUnregisterGlobalsName = "__asan_unregister_globals";
 static const char *kAsanPoisonGlobalsName = "__asan_before_dynamic_init";
 static const char *kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init";
-static const char *kAsanInitName = "__asan_init_v2";
+static const char *kAsanInitName = "__asan_init_v3";
 static const char *kAsanHandleNoReturnName = "__asan_handle_no_return";
 static const char *kAsanMappingOffsetName = "__asan_mapping_offset";
 static const char *kAsanMappingScaleName = "__asan_mapping_scale";
@@ -274,8 +274,6 @@ struct AddressSanitizer : public FunctionPass {
                                    Instruction *InsertBefore, bool IsWrite);
   Value *memToShadow(Value *Shadow, IRBuilder<> &IRB);
   bool runOnFunction(Function &F);
-  void createInitializerPoisonCalls(Module &M,
-                                    Value *FirstAddr, Value *LastAddr);
   bool maybeInsertAsanInitAtFunctionEntry(Function &F);
   void emitShadowMapping(Module &M, IRBuilder<> &IRB) const;
   virtual bool doInitialization(Module &M);
@@ -333,8 +331,7 @@ class AddressSanitizerModule : public ModulePass {
   void initializeCallbacks(Module &M);
 
   bool ShouldInstrumentGlobal(GlobalVariable *G);
-  void createInitializerPoisonCalls(Module &M, Value *FirstAddr,
-                                    Value *LastAddr);
+  void createInitializerPoisonCalls(Module &M, GlobalValue *ModuleName);
   size_t RedzoneSize() const {
     return RedzoneSizeForScale(Mapping.Scale);
   }
@@ -753,7 +750,7 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
 }
 
 void AddressSanitizerModule::createInitializerPoisonCalls(
-    Module &M, Value *FirstAddr, Value *LastAddr) {
+    Module &M, GlobalValue *ModuleName) {
   // We do all of our poisoning and unpoisoning within _GLOBAL__I_a.
   Function *GlobalInit = M.getFunction("_GLOBAL__I_a");
   // If that function is not present, this TU contains no globals, or they have
@@ -765,7 +762,8 @@ void AddressSanitizerModule::createInitializerPoisonCalls(
   IRBuilder<> IRB(GlobalInit->begin()->getFirstInsertionPt());
 
   // Add a call to poison all external globals before the given function starts.
-  IRB.CreateCall2(AsanPoisonGlobals, FirstAddr, LastAddr);
+  Value *ModuleNameAddr = ConstantExpr::getPointerCast(ModuleName, IntptrTy);
+  IRB.CreateCall(AsanPoisonGlobals, ModuleNameAddr);
 
   // Add calls to unpoison all globals before each return instruction.
   for (Function::iterator I = GlobalInit->begin(), E = GlobalInit->end();
@@ -839,7 +837,7 @@ void AddressSanitizerModule::initializeCallbacks(Module &M) {
   IRBuilder<> IRB(*C);
   // Declare our poisoning and unpoisoning functions.
   AsanPoisonGlobals = checkInterfaceFunction(M.getOrInsertFunction(
-      kAsanPoisonGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
+      kAsanPoisonGlobalsName, IRB.getVoidTy(), IntptrTy, NULL));
   AsanPoisonGlobals->setLinkage(Function::ExternalLinkage);
   AsanUnpoisonGlobals = checkInterfaceFunction(M.getOrInsertFunction(
       kAsanUnpoisonGlobalsName, IRB.getVoidTy(), NULL));
@@ -901,12 +899,13 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
   assert(CtorFunc);
   IRBuilder<> IRB(CtorFunc->getEntryBlock().getTerminator());
 
-  // The addresses of the first and last dynamically initialized globals in
-  // this TU.  Used in initialization order checking.
-  Value *FirstDynamic = 0, *LastDynamic = 0;
+  bool HasDynamicallyInitializedGlobals = false;
 
   GlobalVariable *ModuleName = createPrivateGlobalForString(
       M, M.getModuleIdentifier());
+  // We shouldn't merge same module names, as this string serves as unique
+  // module ID in runtime.
+  ModuleName->setUnnamedAddr(false);
 
   for (size_t i = 0; i < n; i++) {
     static const uint64_t kMaxGlobalRedzone = 1 << 18;
@@ -966,11 +965,8 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
         NULL);
 
     // Populate the first and last globals declared in this TU.
-    if (CheckInitOrder && GlobalHasDynamicInitializer) {
-      LastDynamic = ConstantExpr::getPointerCast(NewGlobal, IntptrTy);
-      if (FirstDynamic == 0)
-        FirstDynamic = LastDynamic;
-    }
+    if (CheckInitOrder && GlobalHasDynamicInitializer)
+      HasDynamicallyInitializedGlobals = true;
 
     DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n");
   }
@@ -981,8 +977,8 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
       ConstantArray::get(ArrayOfGlobalStructTy, Initializers), "");
 
   // Create calls for poisoning before initializers run and unpoisoning after.
-  if (CheckInitOrder && FirstDynamic && LastDynamic)
-    createInitializerPoisonCalls(M, FirstDynamic, LastDynamic);
+  if (CheckInitOrder && HasDynamicallyInitializedGlobals)
+    createInitializerPoisonCalls(M, ModuleName);
   IRB.CreateCall2(AsanRegisterGlobals,
                   IRB.CreatePointerCast(AllGlobals, IntptrTy),
                   ConstantInt::get(IntptrTy, n));
@@ -1317,10 +1313,10 @@ void FunctionStackPoisoner::poisonStack() {
         ConstantInt::get(IntptrTy, LocalStackSize), OrigStackBase);
   }
 
-  // This string will be parsed by the run-time (DescribeStackAddress).
+  // This string will be parsed by the run-time (DescribeAddressIfStack).
   SmallString<2048> StackDescriptionStorage;
   raw_svector_ostream StackDescription(StackDescriptionStorage);
-  StackDescription << F.getName() << " " << AllocaVec.size() << " ";
+  StackDescription << AllocaVec.size() << " ";
 
   // Insert poison calls for lifetime intrinsics for alloca.
   bool HavePoisonedAllocas = false;
@@ -1353,19 +1349,26 @@ void FunctionStackPoisoner::poisonStack() {
   }
   assert(Pos == LocalStackSize);
 
-  // Write the Magic value and the frame description constant to the redzone.
+  // The left-most redzone has enough space for at least 4 pointers.
+  // Write the Magic value to redzone[0].
   Value *BasePlus0 = IRB.CreateIntToPtr(LocalStackBase, IntptrPtrTy);
   IRB.CreateStore(ConstantInt::get(IntptrTy, kCurrentStackFrameMagic),
                   BasePlus0);
-  Value *BasePlus1 = IRB.CreateAdd(LocalStackBase,
-                                   ConstantInt::get(IntptrTy,
-                                                    ASan.LongSize/8));
-  BasePlus1 = IRB.CreateIntToPtr(BasePlus1, IntptrPtrTy);
+  // Write the frame description constant to redzone[1].
+  Value *BasePlus1 = IRB.CreateIntToPtr(
+    IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, ASan.LongSize/8)),
+    IntptrPtrTy);
   GlobalVariable *StackDescriptionGlobal =
       createPrivateGlobalForString(*F.getParent(), StackDescription.str());
   Value *Description = IRB.CreatePointerCast(StackDescriptionGlobal,
                                              IntptrTy);
   IRB.CreateStore(Description, BasePlus1);
+  // Write the PC to redzone[2].
+  Value *BasePlus2 = IRB.CreateIntToPtr(
+    IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy,
+                                                   2 * ASan.LongSize/8)),
+    IntptrPtrTy);
+  IRB.CreateStore(IRB.CreatePointerCast(&F, IntptrTy), BasePlus2);
 
   // Poison the stack redzones at the entry.
   Value *ShadowBase = ASan.memToShadow(LocalStackBase, IRB);
diff --git a/lib/Transforms/Instrumentation/BlackList.cpp b/lib/Transforms/Instrumentation/BlackList.cpp
index 927982d..39de4b0 100644
--- a/lib/Transforms/Instrumentation/BlackList.cpp
+++ b/lib/Transforms/Instrumentation/BlackList.cpp
@@ -110,7 +110,8 @@ static StringRef GetGVTypeString(const GlobalVariable &G) {
 bool BlackList::isInInit(const GlobalVariable &G) const {
   return (isIn(*G.getParent()) ||
           inSection("global-init", G.getName()) ||
-          inSection("global-init-type", GetGVTypeString(G)));
+          inSection("global-init-type", GetGVTypeString(G)) ||
+          inSection("global-init-src", G.getParent()->getModuleIdentifier()));
 }
 
 bool BlackList::inSection(const StringRef Section,
diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index e8d4ac8..2edd151 100644
--- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -32,6 +32,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/DebugLoc.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/InstIterator.h"
 #include "llvm/Support/PathV2.h"
 #include "llvm/Support/raw_ostream.h"
@@ -101,6 +102,7 @@ namespace {
     Constant *getIncrementIndirectCounterFunc();
     Constant *getEmitFunctionFunc();
     Constant *getEmitArcsFunc();
+    Constant *getDeleteWriteoutFunctionListFunc();
     Constant *getDeleteFlushFunctionListFunc();
     Constant *getEndFileFunc();
 
@@ -142,6 +144,12 @@ ModulePass *llvm::createGCOVProfilerPass(const GCOVOptions &Options) {
   return new GCOVProfiler(Options);
 }
 
+static std::string getFunctionName(DISubprogram SP) {
+  if (!SP.getLinkageName().empty())
+    return SP.getLinkageName();
+  return SP.getName();
+}
+
 namespace {
   class GCOVRecord {
    protected:
@@ -290,7 +298,7 @@ namespace {
       ReturnBlock = new GCOVBlock(i++, os);
 
       writeBytes(FunctionTag, 4);
-      uint32_t BlockLen = 1 + 1 + 1 + lengthOfGCOVString(SP.getName()) +
+      uint32_t BlockLen = 1 + 1 + 1 + lengthOfGCOVString(getFunctionName(SP)) +
           1 + lengthOfGCOVString(SP.getFilename()) + 1;
       if (UseCfgChecksum)
         ++BlockLen;
@@ -299,7 +307,7 @@ namespace {
       write(0);  // lineno checksum
       if (UseCfgChecksum)
         write(0);  // cfg checksum
-      writeGCOVString(SP.getName());
+      writeGCOVString(getFunctionName(SP));
       writeGCOVString(SP.getFilename());
       write(SP.getLineNumber());
     }
@@ -374,7 +382,11 @@ std::string GCOVProfiler::mangleName(DICompileUnit CU, const char *NewStem) {
 
   SmallString<128> Filename = CU.getFilename();
   sys::path::replace_extension(Filename, NewStem);
-  return sys::path::filename(Filename.str());
+  StringRef FName = sys::path::filename(Filename);
+  SmallString<128> CurPath;
+  if (sys::fs::current_path(CurPath)) return FName;
+  sys::path::append(CurPath, FName.str());
+  return CurPath.str();
 }
 
 bool GCOVProfiler::runOnModule(Module &M) {
@@ -544,8 +556,8 @@ bool GCOVProfiler::emitProfileArcs() {
     Function *FlushF = insertFlush(CountersBySP);
 
     // Create a small bit of code that registers the "__llvm_gcov_writeout" to
-    //  be executed at exit and the "__llvm_gcov_flush" function to be executed
-    //  when "__gcov_flush" is called.
+    // be executed at exit and the "__llvm_gcov_flush" function to be executed
+    // when "__gcov_flush" is called.
     FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
     Function *F = Function::Create(FTy, GlobalValue::InternalLinkage,
                                    "__llvm_gcov_init", M);
@@ -558,21 +570,17 @@ bool GCOVProfiler::emitProfileArcs() {
     BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", F);
     IRBuilder<> Builder(BB);
 
-    FTy = FunctionType::get(Builder.getInt32Ty(),
-                            PointerType::get(FTy, 0), false);
-    Constant *AtExitFn = M->getOrInsertFunction("atexit", FTy);
-    Builder.CreateCall(AtExitFn, WriteoutF);
-
-    // Register the local flush function.
     FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
-    FTy = FunctionType::get(Builder.getVoidTy(),
-                            PointerType::get(FTy, 0), false);
-    Constant *RegFlush =
-      M->getOrInsertFunction("llvm_register_flush_function", FTy);
-    Builder.CreateCall(RegFlush, FlushF);
-
-    // Make sure that all the flush function list is deleted.
-    Builder.CreateCall(AtExitFn, getDeleteFlushFunctionListFunc());
+    Type *Params[] = {
+      PointerType::get(FTy, 0),
+      PointerType::get(FTy, 0)
+    };
+    FTy = FunctionType::get(Builder.getVoidTy(), Params, false);
+
+    // Inialize the environment and register the local writeout and flush
+    // functions.
+    Constant *GCOVInit = M->getOrInsertFunction("llvm_gcov_init", FTy);
+    Builder.CreateCall2(GCOVInit, WriteoutF, FlushF);
     Builder.CreateRetVoid();
 
     appendToGlobalCtors(*M, F, 0);
@@ -671,6 +679,11 @@ Constant *GCOVProfiler::getEmitArcsFunc() {
   return M->getOrInsertFunction("llvm_gcda_emit_arcs", FTy);
 }
 
+Constant *GCOVProfiler::getDeleteWriteoutFunctionListFunc() {
+  FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+  return M->getOrInsertFunction("llvm_delete_writeout_function_list", FTy);
+}
+
 Constant *GCOVProfiler::getDeleteFlushFunctionListFunc() {
   FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
   return M->getOrInsertFunction("llvm_delete_flush_function_list", FTy);
@@ -724,12 +737,12 @@ Function *GCOVProfiler::insertCounterWriteout(
                           Builder.CreateGlobalStringPtr(ReversedVersion));
       for (unsigned j = 0, e = CountersBySP.size(); j != e; ++j) {
         DISubprogram SP(CountersBySP[j].second);
-        Builder.CreateCall3(EmitFunction,
-                            Builder.getInt32(j),
-                            Options.FunctionNamesInData ?
-                              Builder.CreateGlobalStringPtr(SP.getName()) :
-                              Constant::getNullValue(Builder.getInt8PtrTy()),
-                            Builder.getInt8(Options.UseCfgChecksum));
+        Builder.CreateCall3(
+            EmitFunction, Builder.getInt32(j),
+            Options.FunctionNamesInData ?
+              Builder.CreateGlobalStringPtr(getFunctionName(SP)) :
+              Constant::getNullValue(Builder.getInt8PtrTy()),
+            Builder.getInt8(Options.UseCfgChecksum));
 
         GlobalVariable *GV = CountersBySP[j].first;
         unsigned Arcs =
diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp
index 8ba1025..9f35396 100644
--- a/lib/Transforms/Instrumentation/Instrumentation.cpp
+++ b/lib/Transforms/Instrumentation/Instrumentation.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/InitializePasses.h"
+#include "llvm/PassRegistry.h"
 #include "llvm-c/Initialization.h"
 
 using namespace llvm;
diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index fce6513..4e75904 100644
--- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -122,6 +122,9 @@ static cl::opt<bool> ClPoisonStackWithCall("msan-poison-stack-with-call",
 static cl::opt<int> ClPoisonStackPattern("msan-poison-stack-pattern",
        cl::desc("poison uninitialized stack variables with the given patter"),
        cl::Hidden, cl::init(0xff));
+static cl::opt<bool> ClPoisonUndef("msan-poison-undef",
+       cl::desc("poison undef temps"),
+       cl::Hidden, cl::init(true));
 
 static cl::opt<bool> ClHandleICmp("msan-handle-icmp",
        cl::desc("propagate shadow through ICmpEQ and ICmpNE"),
@@ -690,7 +693,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   ///
   /// Clean shadow (all zeroes) means all bits of the value are defined
   /// (initialized).
-  Value *getCleanShadow(Value *V) {
+  Constant *getCleanShadow(Value *V) {
     Type *ShadowTy = getShadowTy(V);
     if (!ShadowTy)
       return 0;
@@ -709,6 +712,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     return ConstantStruct::get(ST, Vals);
   }
 
+  /// \brief Create a dirty shadow for a given value.
+  Constant *getPoisonedShadow(Value *V) {
+    Type *ShadowTy = getShadowTy(V);
+    if (!ShadowTy)
+      return 0;
+    return getPoisonedShadow(ShadowTy);
+  }
+
   /// \brief Create a clean (zero) origin.
   Value *getCleanOrigin() {
     return Constant::getNullValue(MS.OriginTy);
@@ -730,7 +741,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       return Shadow;
     }
     if (UndefValue *U = dyn_cast<UndefValue>(V)) {
-      Value *AllOnes = getPoisonedShadow(getShadowTy(V));
+      Value *AllOnes = ClPoisonUndef ? getPoisonedShadow(V) : getCleanShadow(V);
       DEBUG(dbgs() << "Undef: " << *U << " ==> " << *AllOnes << "\n");
       (void)U;
       return AllOnes;
diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index f93c5ab..299060a 100644
--- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -30,6 +30,7 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
@@ -56,6 +57,9 @@ static cl::opt<bool>  ClInstrumentFuncEntryExit(
 static cl::opt<bool>  ClInstrumentAtomics(
     "tsan-instrument-atomics", cl::init(true),
     cl::desc("Instrument atomics"), cl::Hidden);
+static cl::opt<bool>  ClInstrumentMemIntrinsics(
+    "tsan-instrument-memintrinsics", cl::init(true),
+    cl::desc("Instrument memintrinsics (memset/memcpy/memmove)"), cl::Hidden);
 
 STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
 STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
@@ -63,6 +67,7 @@ STATISTIC(NumOmittedReadsBeforeWrite,
           "Number of reads ignored due to following writes");
 STATISTIC(NumAccessesWithBadSize, "Number of accesses with bad size");
 STATISTIC(NumInstrumentedVtableWrites, "Number of vtable ptr writes");
+STATISTIC(NumInstrumentedVtableReads, "Number of vtable ptr reads");
 STATISTIC(NumOmittedReadsFromConstantGlobals,
           "Number of reads from constant globals");
 STATISTIC(NumOmittedReadsFromVtable, "Number of vtable reads");
@@ -85,12 +90,14 @@ struct ThreadSanitizer : public FunctionPass {
   void initializeCallbacks(Module &M);
   bool instrumentLoadOrStore(Instruction *I);
   bool instrumentAtomic(Instruction *I);
+  bool instrumentMemIntrinsic(Instruction *I);
   void chooseInstructionsToInstrument(SmallVectorImpl<Instruction*> &Local,
                                       SmallVectorImpl<Instruction*> &All);
   bool addrPointsToConstantData(Value *Addr);
   int getMemoryAccessFuncIndex(Value *Addr);
 
   DataLayout *TD;
+  Type *IntptrTy;
   SmallString<64> BlacklistFile;
   OwningPtr<BlackList> BL;
   IntegerType *OrdTy;
@@ -108,6 +115,8 @@ struct ThreadSanitizer : public FunctionPass {
   Function *TsanAtomicThreadFence;
   Function *TsanAtomicSignalFence;
   Function *TsanVptrUpdate;
+  Function *TsanVptrLoad;
+  Function *MemmoveFn, *MemcpyFn, *MemsetFn;
 };
 }  // namespace
 
@@ -196,10 +205,22 @@ void ThreadSanitizer::initializeCallbacks(Module &M) {
   TsanVptrUpdate = checkInterfaceFunction(M.getOrInsertFunction(
       "__tsan_vptr_update", IRB.getVoidTy(), IRB.getInt8PtrTy(),
       IRB.getInt8PtrTy(), NULL));
+  TsanVptrLoad = checkInterfaceFunction(M.getOrInsertFunction(
+      "__tsan_vptr_read", IRB.getVoidTy(), IRB.getInt8PtrTy(), NULL));
   TsanAtomicThreadFence = checkInterfaceFunction(M.getOrInsertFunction(
       "__tsan_atomic_thread_fence", IRB.getVoidTy(), OrdTy, NULL));
   TsanAtomicSignalFence = checkInterfaceFunction(M.getOrInsertFunction(
       "__tsan_atomic_signal_fence", IRB.getVoidTy(), OrdTy, NULL));
+
+  MemmoveFn = checkInterfaceFunction(M.getOrInsertFunction(
+    "memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+    IRB.getInt8PtrTy(), IntptrTy, NULL));
+  MemcpyFn = checkInterfaceFunction(M.getOrInsertFunction(
+    "memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+    IntptrTy, NULL));
+  MemsetFn = checkInterfaceFunction(M.getOrInsertFunction(
+    "memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt32Ty(),
+    IntptrTy, NULL));
 }
 
 bool ThreadSanitizer::doInitialization(Module &M) {
@@ -210,6 +231,7 @@ bool ThreadSanitizer::doInitialization(Module &M) {
 
   // Always insert a call to __tsan_init into the module's CTORs.
   IRBuilder<> IRB(M.getContext());
+  IntptrTy = IRB.getIntPtrTy(TD);
   Value *TsanInit = M.getOrInsertFunction("__tsan_init",
                                           IRB.getVoidTy(), NULL);
   appendToGlobalCtors(M, cast<Function>(TsanInit), 0);
@@ -309,6 +331,7 @@ bool ThreadSanitizer::runOnFunction(Function &F) {
   SmallVector<Instruction*, 8> AllLoadsAndStores;
   SmallVector<Instruction*, 8> LocalLoadsAndStores;
   SmallVector<Instruction*, 8> AtomicAccesses;
+  SmallVector<Instruction*, 8> MemIntrinCalls;
   bool Res = false;
   bool HasCalls = false;
 
@@ -325,6 +348,8 @@ bool ThreadSanitizer::runOnFunction(Function &F) {
       else if (isa<ReturnInst>(BI))
         RetVec.push_back(BI);
       else if (isa<CallInst>(BI) || isa<InvokeInst>(BI)) {
+        if (isa<MemIntrinsic>(BI))
+          MemIntrinCalls.push_back(BI);
         HasCalls = true;
         chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores);
       }
@@ -348,6 +373,11 @@ bool ThreadSanitizer::runOnFunction(Function &F) {
       Res |= instrumentAtomic(AtomicAccesses[i]);
     }
 
+  if (ClInstrumentMemIntrinsics)
+    for (size_t i = 0, n = MemIntrinCalls.size(); i < n; ++i) {
+      Res |= instrumentMemIntrinsic(MemIntrinCalls[i]);
+    }
+
   // Instrument function entry/exit points if there were instrumented accesses.
   if ((Res || HasCalls) && ClInstrumentFuncEntryExit) {
     IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
@@ -386,6 +416,12 @@ bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I) {
     NumInstrumentedVtableWrites++;
     return true;
   }
+  if (!IsWrite && isVtableAccess(I)) {
+    IRB.CreateCall(TsanVptrLoad,
+                   IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()));
+    NumInstrumentedVtableReads++;
+    return true;
+  }
   Value *OnAccessFunc = IsWrite ? TsanWrite[Idx] : TsanRead[Idx];
   IRB.CreateCall(OnAccessFunc, IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()));
   if (IsWrite) NumInstrumentedWrites++;
@@ -423,6 +459,32 @@ static ConstantInt *createFailOrdering(IRBuilder<> *IRB, AtomicOrdering ord) {
   return IRB->getInt32(v);
 }
 
+// If a memset intrinsic gets inlined by the code gen, we will miss races on it.
+// So, we either need to ensure the intrinsic is not inlined, or instrument it.
+// We do not instrument memset/memmove/memcpy intrinsics (too complicated),
+// instead we simply replace them with regular function calls, which are then
+// intercepted by the run-time.
+// Since tsan is running after everyone else, the calls should not be
+// replaced back with intrinsics. If that becomes wrong at some point,
+// we will need to call e.g. __tsan_memset to avoid the intrinsics.
+bool ThreadSanitizer::instrumentMemIntrinsic(Instruction *I) {
+  IRBuilder<> IRB(I);
+  if (MemSetInst *M = dyn_cast<MemSetInst>(I)) {
+    IRB.CreateCall3(MemsetFn,
+      IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()),
+      IRB.CreateIntCast(M->getArgOperand(1), IRB.getInt32Ty(), false),
+      IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false));
+    I->eraseFromParent();
+  } else if (MemTransferInst *M = dyn_cast<MemTransferInst>(I)) {
+    IRB.CreateCall3(isa<MemCpyInst>(M) ? MemcpyFn : MemmoveFn,
+      IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()),
+      IRB.CreatePointerCast(M->getArgOperand(1), IRB.getInt8PtrTy()),
+      IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false));
+    I->eraseFromParent();
+  }
+  return false;
+}
+
 // Both llvm and ThreadSanitizer atomic operations are based on C++11/C1x
 // standards.  For background see C++11 standard.  A slightly older, publically
 // available draft of the standard (not entirely up-to-date, but close enough
diff --git a/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
index 5aada9c..8f917ae 100644
--- a/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
+++ b/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
@@ -38,6 +38,7 @@ llvm::objcarc::CanAlterRefCount(const Instruction *Inst, const Value *Ptr,
   switch (Class) {
   case IC_Autorelease:
   case IC_AutoreleaseRV:
+  case IC_IntrinsicUser:
   case IC_User:
     // These operations never directly modify a reference count.
     return false;
diff --git a/lib/Transforms/ObjCARC/ObjCARC.cpp b/lib/Transforms/ObjCARC/ObjCARC.cpp
index 53a31b0..373168e 100644
--- a/lib/Transforms/ObjCARC/ObjCARC.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARC.cpp
@@ -30,6 +30,7 @@ using namespace llvm::objcarc;
 bool llvm::objcarc::EnableARCOpts;
 static cl::opt<bool, true>
 EnableARCOptimizations("enable-objc-arc-opts",
+                       cl::desc("enable/disable all ARC Optimizations"),
                        cl::location(EnableARCOpts),
                        cl::init(true));
 
diff --git a/lib/Transforms/ObjCARC/ObjCARC.h b/lib/Transforms/ObjCARC/ObjCARC.h
index e062b66..39670f3 100644
--- a/lib/Transforms/ObjCARC/ObjCARC.h
+++ b/lib/Transforms/ObjCARC/ObjCARC.h
@@ -64,7 +64,8 @@ static inline bool ModuleHasARC(const Module &M) {
     M.getNamedValue("objc_copyWeak") ||
     M.getNamedValue("objc_retainedObject") ||
     M.getNamedValue("objc_unretainedObject") ||
-    M.getNamedValue("objc_unretainedPointer");
+    M.getNamedValue("objc_unretainedPointer") ||
+    M.getNamedValue("clang.arc.use");
 }
 
 /// \enum InstructionClass
@@ -89,6 +90,7 @@ enum InstructionClass {
   IC_CopyWeak,            ///< objc_copyWeak (derived)
   IC_DestroyWeak,         ///< objc_destroyWeak (derived)
   IC_StoreStrong,         ///< objc_storeStrong (derived)
+  IC_IntrinsicUser,       ///< clang.arc.use
   IC_CallOrUser,          ///< could call objc_release and/or "use" pointers
   IC_Call,                ///< could call objc_release
   IC_User,                ///< could "use" a pointer
@@ -97,6 +99,13 @@ enum InstructionClass {
 
 raw_ostream &operator<<(raw_ostream &OS, const InstructionClass Class);
 
+/// \brief Test if the given class is a kind of user.
+inline static bool IsUser(InstructionClass Class) {
+  return Class == IC_User ||
+         Class == IC_CallOrUser ||
+         Class == IC_IntrinsicUser;
+}
+
 /// \brief Test if the given class is objc_retain or equivalent.
 static inline bool IsRetain(InstructionClass Class) {
   return Class == IC_Retain ||
@@ -112,13 +121,10 @@ static inline bool IsAutorelease(InstructionClass Class) {
 /// \brief Test if the given class represents instructions which return their
 /// argument verbatim.
 static inline bool IsForwarding(InstructionClass Class) {
-  // objc_retainBlock technically doesn't always return its argument
-  // verbatim, but it doesn't matter for our purposes here.
   return Class == IC_Retain ||
          Class == IC_RetainRV ||
          Class == IC_Autorelease ||
          Class == IC_AutoreleaseRV ||
-         Class == IC_RetainBlock ||
          Class == IC_NoopCast;
 }
 
@@ -256,11 +262,11 @@ static inline Value *GetObjCArg(Value *Inst) {
   return StripPointerCastsAndObjCCalls(cast<CallInst>(Inst)->getArgOperand(0));
 }
 
-static inline bool isNullOrUndef(const Value *V) {
+static inline bool IsNullOrUndef(const Value *V) {
   return isa<ConstantPointerNull>(V) || isa<UndefValue>(V);
 }
 
-static inline bool isNoopInstruction(const Instruction *I) {
+static inline bool IsNoopInstruction(const Instruction *I) {
   return isa<BitCastInst>(I) ||
     (isa<GetElementPtrInst>(I) &&
      cast<GetElementPtrInst>(I)->hasAllZeroIndices());
diff --git a/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/lib/Transforms/ObjCARC/ObjCARCContract.cpp
index 1c13d1c..c43f4f4 100644
--- a/lib/Transforms/ObjCARC/ObjCARCContract.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCContract.cpp
@@ -66,6 +66,8 @@ namespace {
     Constant *RetainAutoreleaseCallee;
     /// Declaration for objc_retainAutoreleaseReturnValue().
     Constant *RetainAutoreleaseRVCallee;
+    /// Declaration for objc_retainAutoreleasedReturnValue().
+    Constant *RetainRVCallee;
 
     /// The inline asm string to insert between calls and RetainRV calls to make
     /// the optimization work on targets which need it.
@@ -77,9 +79,12 @@ namespace {
     SmallPtrSet<CallInst *, 8> StoreStrongCalls;
 
     Constant *getStoreStrongCallee(Module *M);
+    Constant *getRetainRVCallee(Module *M);
     Constant *getRetainAutoreleaseCallee(Module *M);
     Constant *getRetainAutoreleaseRVCallee(Module *M);
 
+    bool OptimizeRetainCall(Function &F, Instruction *Retain);
+
     bool ContractAutorelease(Function &F, Instruction *Autorelease,
                              InstructionClass Class,
                              SmallPtrSet<Instruction *, 4>
@@ -172,6 +177,57 @@ Constant *ObjCARCContract::getRetainAutoreleaseRVCallee(Module *M) {
   return RetainAutoreleaseRVCallee;
 }
 
+Constant *ObjCARCContract::getRetainRVCallee(Module *M) {
+  if (!RetainRVCallee) {
+    LLVMContext &C = M->getContext();
+    Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+    Type *Params[] = { I8X };
+    FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
+    AttributeSet Attribute =
+      AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex,
+                                  Attribute::NoUnwind);
+    RetainRVCallee =
+      M->getOrInsertFunction("objc_retainAutoreleasedReturnValue", FTy,
+                             Attribute);
+  }
+  return RetainRVCallee;
+}
+
+/// Turn objc_retain into objc_retainAutoreleasedReturnValue if the operand is a
+/// return value. We do this late so we do not disrupt the dataflow analysis in
+/// ObjCARCOpt.
+bool
+ObjCARCContract::OptimizeRetainCall(Function &F, Instruction *Retain) {
+  ImmutableCallSite CS(GetObjCArg(Retain));
+  const Instruction *Call = CS.getInstruction();
+  if (!Call)
+    return false;
+  if (Call->getParent() != Retain->getParent())
+    return false;
+
+  // Check that the call is next to the retain.
+  BasicBlock::const_iterator I = Call;
+  ++I;
+  while (IsNoopInstruction(I)) ++I;
+  if (&*I != Retain)
+    return false;
+
+  // Turn it to an objc_retainAutoreleasedReturnValue.
+  Changed = true;
+  ++NumPeeps;
+
+  DEBUG(dbgs() << "Transforming objc_retain => "
+                  "objc_retainAutoreleasedReturnValue since the operand is a "
+                  "return value.\nOld: "<< *Retain << "\n");
+
+  // We do not have to worry about tail calls/does not throw since
+  // retain/retainRV have the same properties.
+  cast<CallInst>(Retain)->setCalledFunction(getRetainRVCallee(F.getParent()));
+
+  DEBUG(dbgs() << "New: " << *Retain << "\n");
+  return true;
+}
+
 /// Merge an autorelease with a retain into a fused call.
 bool
 ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease,
@@ -329,6 +385,7 @@ bool ObjCARCContract::doInitialization(Module &M) {
   StoreStrongCallee = 0;
   RetainAutoreleaseCallee = 0;
   RetainAutoreleaseRVCallee = 0;
+  RetainRVCallee = 0;
 
   // Initialize RetainRVMarker.
   RetainRVMarker = 0;
@@ -380,7 +437,6 @@ bool ObjCARCContract::runOnFunction(Function &F) {
     // objc_retainBlock does not necessarily return its argument.
     InstructionClass Class = GetBasicInstructionClass(Inst);
     switch (Class) {
-    case IC_Retain:
     case IC_FusedRetainAutorelease:
     case IC_FusedRetainAutoreleaseRV:
       break;
@@ -389,6 +445,13 @@ bool ObjCARCContract::runOnFunction(Function &F) {
       if (ContractAutorelease(F, Inst, Class, DependingInstructions, Visited))
         continue;
       break;
+    case IC_Retain:
+      // Attempt to convert retains to retainrvs if they are next to function
+      // calls.
+      if (!OptimizeRetainCall(F, Inst))
+        break;
+      // If we succeed in our optimization, fall through.
+      // FALLTHROUGH
     case IC_RetainRV: {
       // If we're compiling for a target which needs a special inline-asm
       // marker to do the retainAutoreleasedReturnValue optimization,
@@ -410,7 +473,7 @@ bool ObjCARCContract::runOnFunction(Function &F) {
           break;
         }
         --BBI;
-      } while (isNoopInstruction(BBI));
+      } while (IsNoopInstruction(BBI));
 
       if (&*BBI == GetObjCArg(Inst)) {
         DEBUG(dbgs() << "ObjCARCContract: Adding inline asm marker for "
@@ -429,7 +492,7 @@ bool ObjCARCContract::runOnFunction(Function &F) {
     case IC_InitWeak: {
       // objc_initWeak(p, null) => *p = null
       CallInst *CI = cast<CallInst>(Inst);
-      if (isNullOrUndef(CI->getArgOperand(1))) {
+      if (IsNullOrUndef(CI->getArgOperand(1))) {
         Value *Null =
           ConstantPointerNull::get(cast<PointerType>(CI->getType()));
         Changed = true;
@@ -453,6 +516,10 @@ bool ObjCARCContract::runOnFunction(Function &F) {
       if (isa<AllocaInst>(Inst))
         TailOkForStoreStrongs = false;
       continue;
+    case IC_IntrinsicUser:
+      // Remove calls to @clang.arc.use(...).
+      Inst->eraseFromParent();
+      continue;
     default:
       continue;
     }
diff --git a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
index 9c14949..e3d51d5 100644
--- a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
@@ -33,6 +33,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/Support/CFG.h"
 #include "llvm/Support/Debug.h"
@@ -190,13 +191,13 @@ static bool DoesRetainableObjPtrEscape(const User *Ptr) {
   do {
     const Value *V = Worklist.pop_back_val();
 
-    DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Visiting: " << *V << "\n");
+    DEBUG(dbgs() << "Visiting: " << *V << "\n");
 
     for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end();
          UI != UE; ++UI) {
       const User *UUser = *UI;
 
-      DEBUG(dbgs() << "DoesRetainableObjPtrEscape: User: " << *UUser << "\n");
+      DEBUG(dbgs() << "User: " << *UUser << "\n");
 
       // Special - Use by a call (callee or argument) is not considered
       // to be an escape.
@@ -206,11 +207,13 @@ static bool DoesRetainableObjPtrEscape(const User *Ptr) {
       case IC_StoreStrong:
       case IC_Autorelease:
       case IC_AutoreleaseRV: {
-        DEBUG(dbgs() << "DoesRetainableObjPtrEscape: User copies pointer "
-              "arguments. Pointer Escapes!\n");
+        DEBUG(dbgs() << "User copies pointer arguments. Pointer Escapes!\n");
         // These special functions make copies of their pointer arguments.
         return true;
       }
+      case IC_IntrinsicUser:
+        // Use by the use intrinsic is not an escape.
+        continue;
       case IC_User:
       case IC_None:
         // Use by an instruction which copies the value is an escape if the
@@ -219,12 +222,11 @@ static bool DoesRetainableObjPtrEscape(const User *Ptr) {
             isa<PHINode>(UUser) || isa<SelectInst>(UUser)) {
 
           if (VisitedSet.insert(UUser)) {
-            DEBUG(dbgs() << "DoesRetainableObjPtrEscape: User copies value. "
-                  "Ptr escapes if result escapes. Adding to list.\n");
+            DEBUG(dbgs() << "User copies value. Ptr escapes if result escapes."
+                  " Adding to list.\n");
             Worklist.push_back(UUser);
           } else {
-            DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Already visited node."
-                  "\n");
+            DEBUG(dbgs() << "Already visited node.\n");
           }
           continue;
         }
@@ -241,13 +243,13 @@ static bool DoesRetainableObjPtrEscape(const User *Ptr) {
         continue;
       }
       // Otherwise, conservatively assume an escape.
-      DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Assuming ptr escapes.\n");
+      DEBUG(dbgs() << "Assuming ptr escapes.\n");
       return true;
     }
   } while (!Worklist.empty());
 
   // No escapes found.
-  DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Ptr does not escape.\n");
+  DEBUG(dbgs() << "Ptr does not escape.\n");
   return false;
 }
 
@@ -301,6 +303,16 @@ STATISTIC(NumRets,        "Number of return value forwarding "
                           "retain+autoreleaes eliminated");
 STATISTIC(NumRRs,         "Number of retain+release paths eliminated");
 STATISTIC(NumPeeps,       "Number of calls peephole-optimized");
+STATISTIC(NumRetainsBeforeOpt,
+          "Number of retains before optimization.");
+STATISTIC(NumReleasesBeforeOpt,
+          "Number of releases before optimization.");
+#ifndef NDEBUG
+STATISTIC(NumRetainsAfterOpt,
+          "Number of retains after optimization.");
+STATISTIC(NumReleasesAfterOpt,
+          "Number of releases after optimization.");
+#endif
 
 namespace {
   /// \enum Sequence
@@ -371,7 +383,7 @@ static Sequence MergeSeqs(Sequence A, Sequence B, bool TopDown) {
 namespace {
   /// \brief Unidirectional information about either a
   /// retain-decrement-use-release sequence or release-use-decrement-retain
-  /// reverese sequence.
+  /// reverse sequence.
   struct RRInfo {
     /// After an objc_retain, the reference count of the referenced
     /// object is known to be positive. Similarly, before an objc_release, the
@@ -387,10 +399,6 @@ namespace {
     /// KnownSafe is true when either of these conditions is satisfied.
     bool KnownSafe;
 
-    /// True if the Calls are objc_retainBlock calls (as opposed to objc_retain
-    /// calls).
-    bool IsRetainBlock;
-
     /// True of the objc_release calls are all marked with the "tail" keyword.
     bool IsTailCallRelease;
 
@@ -407,17 +415,18 @@ namespace {
     SmallPtrSet<Instruction *, 2> ReverseInsertPts;
 
     RRInfo() :
-      KnownSafe(false), IsRetainBlock(false),
-      IsTailCallRelease(false),
-      ReleaseMetadata(0) {}
+      KnownSafe(false), IsTailCallRelease(false), ReleaseMetadata(0) {}
 
     void clear();
+
+    bool IsTrackingImpreciseReleases() {
+      return ReleaseMetadata != 0;
+    }
   };
 }
 
 void RRInfo::clear() {
   KnownSafe = false;
-  IsRetainBlock = false;
   IsTailCallRelease = false;
   ReleaseMetadata = 0;
   Calls.clear();
@@ -431,7 +440,7 @@ namespace {
     /// True if the reference count is known to be incremented.
     bool KnownPositiveRefCount;
 
-    /// True of we've seen an opportunity for partial RR elimination, such as
+    /// True if we've seen an opportunity for partial RR elimination, such as
     /// pushing calls into a CFG triangle or into one side of a CFG diamond.
     bool Partial;
 
@@ -451,15 +460,16 @@ namespace {
       KnownPositiveRefCount = true;
     }
 
-    void ClearRefCount() {
+    void ClearKnownPositiveRefCount() {
       KnownPositiveRefCount = false;
     }
 
-    bool IsKnownIncremented() const {
+    bool HasKnownPositiveRefCount() const {
       return KnownPositiveRefCount;
     }
 
     void SetSeq(Sequence NewSeq) {
+      DEBUG(dbgs() << "Old: " << Seq << "; New: " << NewSeq << "\n");
       Seq = NewSeq;
     }
 
@@ -472,7 +482,8 @@ namespace {
     }
 
     void ResetSequenceProgress(Sequence NewSeq) {
-      Seq = NewSeq;
+      DEBUG(dbgs() << "Resetting sequence progress.\n");
+      SetSeq(NewSeq);
       Partial = false;
       RRI.clear();
     }
@@ -486,10 +497,6 @@ PtrState::Merge(const PtrState &Other, bool TopDown) {
   Seq = MergeSeqs(Seq, Other.Seq, TopDown);
   KnownPositiveRefCount = KnownPositiveRefCount && Other.KnownPositiveRefCount;
 
-  // We can't merge a plain objc_retain with an objc_retainBlock.
-  if (RRI.IsRetainBlock != Other.RRI.IsRetainBlock)
-    Seq = S_None;
-
   // If we're not in a sequence (anymore), drop all associated state.
   if (Seq == S_None) {
     Partial = false;
@@ -698,6 +705,287 @@ void BBState::MergeSucc(const BBState &Other) {
       MI->second.Merge(PtrState(), /*TopDown=*/false);
 }
 
+// Only enable ARC Annotations if we are building a debug version of
+// libObjCARCOpts.
+#ifndef NDEBUG
+#define ARC_ANNOTATIONS
+#endif
+
+// Define some macros along the lines of DEBUG and some helper functions to make
+// it cleaner to create annotations in the source code and to no-op when not
+// building in debug mode.
+#ifdef ARC_ANNOTATIONS
+
+#include "llvm/Support/CommandLine.h"
+
+/// Enable/disable ARC sequence annotations.
+static cl::opt<bool>
+EnableARCAnnotations("enable-objc-arc-annotations", cl::init(false),
+                     cl::desc("Enable emission of arc data flow analysis "
+                              "annotations"));
+static cl::opt<bool>
+DisableCheckForCFGHazards("disable-objc-arc-checkforcfghazards", cl::init(false),
+                          cl::desc("Disable check for cfg hazards when "
+                                   "annotating"));
+static cl::opt<std::string>
+ARCAnnotationTargetIdentifier("objc-arc-annotation-target-identifier",
+                              cl::init(""),
+                              cl::desc("filter out all data flow annotations "
+                                       "but those that apply to the given "
+                                       "target llvm identifier."));
+
+/// This function appends a unique ARCAnnotationProvenanceSourceMDKind id to an
+/// instruction so that we can track backwards when post processing via the llvm
+/// arc annotation processor tool. If the function is an
+static MDString *AppendMDNodeToSourcePtr(unsigned NodeId,
+                                         Value *Ptr) {
+  MDString *Hash = 0;
+
+  // If pointer is a result of an instruction and it does not have a source
+  // MDNode it, attach a new MDNode onto it. If pointer is a result of
+  // an instruction and does have a source MDNode attached to it, return a
+  // reference to said Node. Otherwise just return 0.
+  if (Instruction *Inst = dyn_cast<Instruction>(Ptr)) {
+    MDNode *Node;
+    if (!(Node = Inst->getMetadata(NodeId))) {
+      // We do not have any node. Generate and attatch the hash MDString to the
+      // instruction.
+
+      // We just use an MDString to ensure that this metadata gets written out
+      // of line at the module level and to provide a very simple format
+      // encoding the information herein. Both of these makes it simpler to
+      // parse the annotations by a simple external program.
+      std::string Str;
+      raw_string_ostream os(Str);
+      os << "(" << Inst->getParent()->getParent()->getName() << ",%"
+         << Inst->getName() << ")";
+
+      Hash = MDString::get(Inst->getContext(), os.str());
+      Inst->setMetadata(NodeId, MDNode::get(Inst->getContext(),Hash));
+    } else {
+      // We have a node. Grab its hash and return it.
+      assert(Node->getNumOperands() == 1 &&
+        "An ARCAnnotationProvenanceSourceMDKind can only have 1 operand.");
+      Hash = cast<MDString>(Node->getOperand(0));
+    }
+  } else if (Argument *Arg = dyn_cast<Argument>(Ptr)) {
+    std::string str;
+    raw_string_ostream os(str);
+    os << "(" << Arg->getParent()->getName() << ",%" << Arg->getName()
+       << ")";
+    Hash = MDString::get(Arg->getContext(), os.str());
+  }
+
+  return Hash;
+}
+
+static std::string SequenceToString(Sequence A) {
+  std::string str;
+  raw_string_ostream os(str);
+  os << A;
+  return os.str();
+}
+
+/// Helper function to change a Sequence into a String object using our overload
+/// for raw_ostream so we only have printing code in one location.
+static MDString *SequenceToMDString(LLVMContext &Context,
+                                    Sequence A) {
+  return MDString::get(Context, SequenceToString(A));
+}
+
+/// A simple function to generate a MDNode which describes the change in state
+/// for Value *Ptr caused by Instruction *Inst.
+static void AppendMDNodeToInstForPtr(unsigned NodeId,
+                                     Instruction *Inst,
+                                     Value *Ptr,
+                                     MDString *PtrSourceMDNodeID,
+                                     Sequence OldSeq,
+                                     Sequence NewSeq) {
+  MDNode *Node = 0;
+  Value *tmp[3] = {PtrSourceMDNodeID,
+                   SequenceToMDString(Inst->getContext(),
+                                      OldSeq),
+                   SequenceToMDString(Inst->getContext(),
+                                      NewSeq)};
+  Node = MDNode::get(Inst->getContext(),
+                     ArrayRef<Value*>(tmp, 3));
+
+  Inst->setMetadata(NodeId, Node);
+}
+
+/// Add to the beginning of the basic block llvm.ptr.annotations which show the
+/// state of a pointer at the entrance to a basic block.
+static void GenerateARCBBEntranceAnnotation(const char *Name, BasicBlock *BB,
+                                            Value *Ptr, Sequence Seq) {
+  // If we have a target identifier, make sure that we match it before
+  // continuing.
+  if(!ARCAnnotationTargetIdentifier.empty() &&
+     !Ptr->getName().equals(ARCAnnotationTargetIdentifier))
+    return;
+
+  Module *M = BB->getParent()->getParent();
+  LLVMContext &C = M->getContext();
+  Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+  Type *I8XX = PointerType::getUnqual(I8X);
+  Type *Params[] = {I8XX, I8XX};
+  FunctionType *FTy = FunctionType::get(Type::getVoidTy(C),
+                                        ArrayRef<Type*>(Params, 2),
+                                        /*isVarArg=*/false);
+  Constant *Callee = M->getOrInsertFunction(Name, FTy);
+
+  IRBuilder<> Builder(BB, BB->getFirstInsertionPt());
+
+  Value *PtrName;
+  StringRef Tmp = Ptr->getName();
+  if (0 == (PtrName = M->getGlobalVariable(Tmp, true))) {
+    Value *ActualPtrName = Builder.CreateGlobalStringPtr(Tmp,
+                                                         Tmp + "_STR");
+    PtrName = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage,
+                                 cast<Constant>(ActualPtrName), Tmp);
+  }
+
+  Value *S;
+  std::string SeqStr = SequenceToString(Seq);
+  if (0 == (S = M->getGlobalVariable(SeqStr, true))) {
+    Value *ActualPtrName = Builder.CreateGlobalStringPtr(SeqStr,
+                                                         SeqStr + "_STR");
+    S = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage,
+                           cast<Constant>(ActualPtrName), SeqStr);
+  }
+
+  Builder.CreateCall2(Callee, PtrName, S);
+}
+
+/// Add to the end of the basic block llvm.ptr.annotations which show the state
+/// of the pointer at the bottom of the basic block.
+static void GenerateARCBBTerminatorAnnotation(const char *Name, BasicBlock *BB,
+                                              Value *Ptr, Sequence Seq) {
+  // If we have a target identifier, make sure that we match it before emitting
+  // an annotation.
+  if(!ARCAnnotationTargetIdentifier.empty() &&
+     !Ptr->getName().equals(ARCAnnotationTargetIdentifier))
+    return;
+
+  Module *M = BB->getParent()->getParent();
+  LLVMContext &C = M->getContext();
+  Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+  Type *I8XX = PointerType::getUnqual(I8X);
+  Type *Params[] = {I8XX, I8XX};
+  FunctionType *FTy = FunctionType::get(Type::getVoidTy(C),
+                                        ArrayRef<Type*>(Params, 2),
+                                        /*isVarArg=*/false);
+  Constant *Callee = M->getOrInsertFunction(Name, FTy);
+
+  IRBuilder<> Builder(BB, llvm::prior(BB->end()));
+
+  Value *PtrName;
+  StringRef Tmp = Ptr->getName();
+  if (0 == (PtrName = M->getGlobalVariable(Tmp, true))) {
+    Value *ActualPtrName = Builder.CreateGlobalStringPtr(Tmp,
+                                                         Tmp + "_STR");
+    PtrName = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage,
+                                 cast<Constant>(ActualPtrName), Tmp);
+  }
+
+  Value *S;
+  std::string SeqStr = SequenceToString(Seq);
+  if (0 == (S = M->getGlobalVariable(SeqStr, true))) {
+    Value *ActualPtrName = Builder.CreateGlobalStringPtr(SeqStr,
+                                                         SeqStr + "_STR");
+    S = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage,
+                           cast<Constant>(ActualPtrName), SeqStr);
+  }
+  Builder.CreateCall2(Callee, PtrName, S);
+}
+
+/// Adds a source annotation to pointer and a state change annotation to Inst
+/// referencing the source annotation and the old/new state of pointer.
+static void GenerateARCAnnotation(unsigned InstMDId,
+                                  unsigned PtrMDId,
+                                  Instruction *Inst,
+                                  Value *Ptr,
+                                  Sequence OldSeq,
+                                  Sequence NewSeq) {
+  if (EnableARCAnnotations) {
+    // If we have a target identifier, make sure that we match it before
+    // emitting an annotation.
+    if(!ARCAnnotationTargetIdentifier.empty() &&
+       !Ptr->getName().equals(ARCAnnotationTargetIdentifier))
+      return;
+
+    // First generate the source annotation on our pointer. This will return an
+    // MDString* if Ptr actually comes from an instruction implying we can put
+    // in a source annotation. If AppendMDNodeToSourcePtr returns 0 (i.e. NULL),
+    // then we know that our pointer is from an Argument so we put a reference
+    // to the argument number.
+    //
+    // The point of this is to make it easy for the
+    // llvm-arc-annotation-processor tool to cross reference where the source
+    // pointer is in the LLVM IR since the LLVM IR parser does not submit such
+    // information via debug info for backends to use (since why would anyone
+    // need such a thing from LLVM IR besides in non standard cases
+    // [i.e. this]).
+    MDString *SourcePtrMDNode =
+      AppendMDNodeToSourcePtr(PtrMDId, Ptr);
+    AppendMDNodeToInstForPtr(InstMDId, Inst, Ptr, SourcePtrMDNode, OldSeq,
+                             NewSeq);
+  }
+}
+
+// The actual interface for accessing the above functionality is defined via
+// some simple macros which are defined below. We do this so that the user does
+// not need to pass in what metadata id is needed resulting in cleaner code and
+// additionally since it provides an easy way to conditionally no-op all
+// annotation support in a non-debug build.
+
+/// Use this macro to annotate a sequence state change when processing
+/// instructions bottom up,
+#define ANNOTATE_BOTTOMUP(inst, ptr, old, new)                          \
+  GenerateARCAnnotation(ARCAnnotationBottomUpMDKind,                    \
+                        ARCAnnotationProvenanceSourceMDKind, (inst),    \
+                        const_cast<Value*>(ptr), (old), (new))
+/// Use this macro to annotate a sequence state change when processing
+/// instructions top down.
+#define ANNOTATE_TOPDOWN(inst, ptr, old, new)                           \
+  GenerateARCAnnotation(ARCAnnotationTopDownMDKind,                     \
+                        ARCAnnotationProvenanceSourceMDKind, (inst),    \
+                        const_cast<Value*>(ptr), (old), (new))
+
+#define ANNOTATE_BB(_states, _bb, _name, _type, _direction)                   \
+  do {                                                                        \
+    if (EnableARCAnnotations) {                                               \
+      for(BBState::ptr_const_iterator I = (_states)._direction##_ptr_begin(), \
+          E = (_states)._direction##_ptr_end(); I != E; ++I) {                \
+        Value *Ptr = const_cast<Value*>(I->first);                            \
+        Sequence Seq = I->second.GetSeq();                                    \
+        GenerateARCBB ## _type ## Annotation(_name, (_bb), Ptr, Seq);         \
+      }                                                                       \
+    }                                                                         \
+  } while (0)
+
+#define ANNOTATE_BOTTOMUP_BBSTART(_states, _basicblock)                       \
+    ANNOTATE_BB(_states, _basicblock, "llvm.arc.annotation.bottomup.bbstart", \
+                Entrance, bottom_up)
+#define ANNOTATE_BOTTOMUP_BBEND(_states, _basicblock)                         \
+    ANNOTATE_BB(_states, _basicblock, "llvm.arc.annotation.bottomup.bbend",   \
+                Terminator, bottom_up)
+#define ANNOTATE_TOPDOWN_BBSTART(_states, _basicblock)                        \
+    ANNOTATE_BB(_states, _basicblock, "llvm.arc.annotation.topdown.bbstart",  \
+                Entrance, top_down)
+#define ANNOTATE_TOPDOWN_BBEND(_states, _basicblock)                          \
+    ANNOTATE_BB(_states, _basicblock, "llvm.arc.annotation.topdown.bbend",    \
+                Terminator, top_down)
+
+#else // !ARC_ANNOTATION
+// If annotations are off, noop.
+#define ANNOTATE_BOTTOMUP(inst, ptr, old, new)
+#define ANNOTATE_TOPDOWN(inst, ptr, old, new)
+#define ANNOTATE_BOTTOMUP_BBSTART(states, basicblock)
+#define ANNOTATE_BOTTOMUP_BBEND(states, basicblock)
+#define ANNOTATE_TOPDOWN_BBSTART(states, basicblock)
+#define ANNOTATE_TOPDOWN_BBEND(states, basicblock)
+#endif // !ARC_ANNOTATION
+
 namespace {
   /// \brief The main ARC optimization pass.
   class ObjCARCOpt : public FunctionPass {
@@ -711,9 +999,6 @@ namespace {
     /// them. These are initialized lazily to avoid cluttering up the Module
     /// with unused declarations.
 
-    /// Declaration for ObjC runtime function
-    /// objc_retainAutoreleasedReturnValue.
-    Constant *RetainRVCallee;
     /// Declaration for ObjC runtime function objc_autoreleaseReturnValue.
     Constant *AutoreleaseRVCallee;
     /// Declaration for ObjC runtime function objc_release.
@@ -738,7 +1023,15 @@ namespace {
     /// The Metadata Kind for clang.arc.no_objc_arc_exceptions metadata.
     unsigned NoObjCARCExceptionsMDKind;
 
-    Constant *getRetainRVCallee(Module *M);
+#ifdef ARC_ANNOTATIONS
+    /// The Metadata Kind for llvm.arc.annotation.bottomup metadata.
+    unsigned ARCAnnotationBottomUpMDKind;
+    /// The Metadata Kind for llvm.arc.annotation.topdown metadata.
+    unsigned ARCAnnotationTopDownMDKind;
+    /// The Metadata Kind for llvm.arc.annotation.provenancesource metadata.
+    unsigned ARCAnnotationProvenanceSourceMDKind;
+#endif // ARC_ANNOATIONS
+
     Constant *getAutoreleaseRVCallee(Module *M);
     Constant *getReleaseCallee(Module *M);
     Constant *getRetainCallee(Module *M);
@@ -747,10 +1040,11 @@ namespace {
 
     bool IsRetainBlockOptimizable(const Instruction *Inst);
 
-    void OptimizeRetainCall(Function &F, Instruction *Retain);
     bool OptimizeRetainRVCall(Function &F, Instruction *RetainRV);
     void OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV,
                                    InstructionClass &Class);
+    bool OptimizeRetainBlockCall(Function &F, Instruction *RetainBlock,
+                                 InstructionClass &Class);
     void OptimizeIndividualCalls(Function &F);
 
     void CheckForCFGHazards(const BasicBlock *BB,
@@ -804,6 +1098,10 @@ namespace {
 
     void OptimizeReturns(Function &F);
 
+#ifndef NDEBUG
+    void GatherStatistics(Function &F, bool AfterOptimization = false);
+#endif
+
     virtual void getAnalysisUsage(AnalysisUsage &AU) const;
     virtual bool doInitialization(Module &M);
     virtual bool runOnFunction(Function &F);
@@ -851,22 +1149,6 @@ bool ObjCARCOpt::IsRetainBlockOptimizable(const Instruction *Inst) {
   return true;
 }
 
-Constant *ObjCARCOpt::getRetainRVCallee(Module *M) {
-  if (!RetainRVCallee) {
-    LLVMContext &C = M->getContext();
-    Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
-    Type *Params[] = { I8X };
-    FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
-    AttributeSet Attribute =
-      AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex,
-                                  Attribute::NoUnwind);
-    RetainRVCallee =
-      M->getOrInsertFunction("objc_retainAutoreleasedReturnValue", FTy,
-                             Attribute);
-  }
-  return RetainRVCallee;
-}
-
 Constant *ObjCARCOpt::getAutoreleaseRVCallee(Module *M) {
   if (!AutoreleaseRVCallee) {
     LLVMContext &C = M->getContext();
@@ -946,38 +1228,6 @@ Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) {
   return AutoreleaseCallee;
 }
 
-/// Turn objc_retain into objc_retainAutoreleasedReturnValue if the operand is a
-/// return value.
-void
-ObjCARCOpt::OptimizeRetainCall(Function &F, Instruction *Retain) {
-  ImmutableCallSite CS(GetObjCArg(Retain));
-  const Instruction *Call = CS.getInstruction();
-  if (!Call) return;
-  if (Call->getParent() != Retain->getParent()) return;
-
-  // Check that the call is next to the retain.
-  BasicBlock::const_iterator I = Call;
-  ++I;
-  while (isNoopInstruction(I)) ++I;
-  if (&*I != Retain)
-    return;
-
-  // Turn it to an objc_retainAutoreleasedReturnValue..
-  Changed = true;
-  ++NumPeeps;
-
-  DEBUG(dbgs() << "ObjCARCOpt::OptimizeRetainCall: Transforming "
-                  "objc_retain => objc_retainAutoreleasedReturnValue"
-                  " since the operand is a return value.\n"
-                  "                                Old: "
-               << *Retain << "\n");
-
-  cast<CallInst>(Retain)->setCalledFunction(getRetainRVCallee(F.getParent()));
-
-  DEBUG(dbgs() << "                                New: "
-               << *Retain << "\n");
-}
-
 /// Turn objc_retainAutoreleasedReturnValue into objc_retain if the operand is
 /// not a return value.  Or, if it can be paired with an
 /// objc_autoreleaseReturnValue, delete the pair and return true.
@@ -990,14 +1240,14 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) {
     if (Call->getParent() == RetainRV->getParent()) {
       BasicBlock::const_iterator I = Call;
       ++I;
-      while (isNoopInstruction(I)) ++I;
+      while (IsNoopInstruction(I)) ++I;
       if (&*I == RetainRV)
         return false;
     } else if (const InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
       BasicBlock *RetainRVParent = RetainRV->getParent();
       if (II->getNormalDest() == RetainRVParent) {
         BasicBlock::const_iterator I = RetainRVParent->begin();
-        while (isNoopInstruction(I)) ++I;
+        while (IsNoopInstruction(I)) ++I;
         if (&*I == RetainRV)
           return false;
       }
@@ -1008,15 +1258,14 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) {
   // pointer. In this case, we can delete the pair.
   BasicBlock::iterator I = RetainRV, Begin = RetainRV->getParent()->begin();
   if (I != Begin) {
-    do --I; while (I != Begin && isNoopInstruction(I));
+    do --I; while (I != Begin && IsNoopInstruction(I));
     if (GetBasicInstructionClass(I) == IC_AutoreleaseRV &&
         GetObjCArg(I) == Arg) {
       Changed = true;
       ++NumPeeps;
 
-      DEBUG(dbgs() << "ObjCARCOpt::OptimizeRetainRVCall: Erasing " << *I << "\n"
-                   << "                                  Erasing " << *RetainRV
-                   << "\n");
+      DEBUG(dbgs() << "Erasing autoreleaseRV,retainRV pair: " << *I << "\n"
+                   << "Erasing " << *RetainRV << "\n");
 
       EraseInstruction(I);
       EraseInstruction(RetainRV);
@@ -1028,16 +1277,13 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) {
   Changed = true;
   ++NumPeeps;
 
-  DEBUG(dbgs() << "ObjCARCOpt::OptimizeRetainRVCall: Transforming "
-                  "objc_retainAutoreleasedReturnValue => "
+  DEBUG(dbgs() << "Transforming objc_retainAutoreleasedReturnValue => "
                   "objc_retain since the operand is not a return value.\n"
-                  "                                  Old: "
-               << *RetainRV << "\n");
+                  "Old = " << *RetainRV << "\n");
 
   cast<CallInst>(RetainRV)->setCalledFunction(getRetainCallee(F.getParent()));
 
-  DEBUG(dbgs() << "                                  New: "
-               << *RetainRV << "\n");
+  DEBUG(dbgs() << "New = " << *RetainRV << "\n");
 
   return false;
 }
@@ -1066,12 +1312,10 @@ ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV,
   Changed = true;
   ++NumPeeps;
 
-  DEBUG(dbgs() << "ObjCARCOpt::OptimizeAutoreleaseRVCall: Transforming "
-                  "objc_autoreleaseReturnValue => "
+  DEBUG(dbgs() << "Transforming objc_autoreleaseReturnValue => "
                   "objc_autorelease since its operand is not used as a return "
                   "value.\n"
-                  "                                       Old: "
-               << *AutoreleaseRV << "\n");
+                  "Old = " << *AutoreleaseRV << "\n");
 
   CallInst *AutoreleaseRVCI = cast<CallInst>(AutoreleaseRV);
   AutoreleaseRVCI->
@@ -1079,14 +1323,48 @@ ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV,
   AutoreleaseRVCI->setTailCall(false); // Never tail call objc_autorelease.
   Class = IC_Autorelease;
 
-  DEBUG(dbgs() << "                                       New: "
-               << *AutoreleaseRV << "\n");
+  DEBUG(dbgs() << "New: " << *AutoreleaseRV << "\n");
+
+}
+
+// \brief Attempt to strength reduce objc_retainBlock calls to objc_retain
+// calls.
+//
+// Specifically: If an objc_retainBlock call has the copy_on_escape metadata and
+// does not escape (following the rules of block escaping), strength reduce the
+// objc_retainBlock to an objc_retain.
+//
+// TODO: If an objc_retainBlock call is dominated period by a previous
+// objc_retainBlock call, strength reduce the objc_retainBlock to an
+// objc_retain.
+bool
+ObjCARCOpt::OptimizeRetainBlockCall(Function &F, Instruction *Inst,
+                                    InstructionClass &Class) {
+  assert(GetBasicInstructionClass(Inst) == Class);
+  assert(IC_RetainBlock == Class);
+
+  // If we can not optimize Inst, return false.
+  if (!IsRetainBlockOptimizable(Inst))
+    return false;
 
+  Changed = true;
+  ++NumPeeps;
+
+  DEBUG(dbgs() << "Strength reduced retainBlock => retain.\n");
+  DEBUG(dbgs() << "Old: " << *Inst << "\n");
+  CallInst *RetainBlock = cast<CallInst>(Inst);
+  RetainBlock->setCalledFunction(getRetainCallee(F.getParent()));
+  // Remove copy_on_escape metadata.
+  RetainBlock->setMetadata(CopyOnEscapeMDKind, 0);
+  Class = IC_Retain;
+  DEBUG(dbgs() << "New: " << *Inst << "\n");
+  return true;
 }
 
 /// Visit each call, one at a time, and make simplifications without doing any
 /// additional analysis.
 void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
+  DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeIndividualCalls ==\n");
   // Reset all the flags in preparation for recomputing them.
   UsedInThisFunction = 0;
 
@@ -1096,8 +1374,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
 
     InstructionClass Class = GetBasicInstructionClass(Inst);
 
-    DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Visiting: Class: "
-          << Class << "; " << *Inst << "\n");
+    DEBUG(dbgs() << "Visiting: Class: " << Class << "; " << *Inst << "\n");
 
     switch (Class) {
     default: break;
@@ -1113,8 +1390,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
     case IC_NoopCast:
       Changed = true;
       ++NumNoops;
-      DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Erasing no-op cast:"
-                   " " << *Inst << "\n");
+      DEBUG(dbgs() << "Erasing no-op cast: " << *Inst << "\n");
       EraseInstruction(Inst);
       continue;
 
@@ -1125,18 +1401,15 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
     case IC_InitWeak:
     case IC_DestroyWeak: {
       CallInst *CI = cast<CallInst>(Inst);
-      if (isNullOrUndef(CI->getArgOperand(0))) {
+      if (IsNullOrUndef(CI->getArgOperand(0))) {
         Changed = true;
         Type *Ty = CI->getArgOperand(0)->getType();
         new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()),
                       Constant::getNullValue(Ty),
                       CI);
         llvm::Value *NewValue = UndefValue::get(CI->getType());
-        DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: A null "
-                        "pointer-to-weak-pointer is undefined behavior.\n"
-                        "                                     Old = " << *CI <<
-                        "\n                                     New = " <<
-                        *NewValue << "\n");
+        DEBUG(dbgs() << "A null pointer-to-weak-pointer is undefined behavior."
+                       "\nOld = " << *CI << "\nNew = " << *NewValue << "\n");
         CI->replaceAllUsesWith(NewValue);
         CI->eraseFromParent();
         continue;
@@ -1146,8 +1419,8 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
     case IC_CopyWeak:
     case IC_MoveWeak: {
       CallInst *CI = cast<CallInst>(Inst);
-      if (isNullOrUndef(CI->getArgOperand(0)) ||
-          isNullOrUndef(CI->getArgOperand(1))) {
+      if (IsNullOrUndef(CI->getArgOperand(0)) ||
+          IsNullOrUndef(CI->getArgOperand(1))) {
         Changed = true;
         Type *Ty = CI->getArgOperand(0)->getType();
         new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()),
@@ -1155,11 +1428,8 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
                       CI);
 
         llvm::Value *NewValue = UndefValue::get(CI->getType());
-        DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: A null "
-                        "pointer-to-weak-pointer is undefined behavior.\n"
-                        "                                     Old = " << *CI <<
-                        "\n                                     New = " <<
-                        *NewValue << "\n");
+        DEBUG(dbgs() << "A null pointer-to-weak-pointer is undefined behavior."
+                        "\nOld = " << *CI << "\nNew = " << *NewValue << "\n");
 
         CI->replaceAllUsesWith(NewValue);
         CI->eraseFromParent();
@@ -1167,8 +1437,14 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
       }
       break;
     }
+    case IC_RetainBlock:
+      // If we strength reduce an objc_retainBlock to an objc_retain, continue
+      // onto the objc_retain peephole optimizations. Otherwise break.
+      if (!OptimizeRetainBlockCall(F, Inst, Class))
+        break;
+      // FALLTHROUGH
     case IC_Retain:
-      OptimizeRetainCall(F, Inst);
+      ++NumRetainsBeforeOpt;
       break;
     case IC_RetainRV:
       if (OptimizeRetainRVCall(F, Inst))
@@ -1177,6 +1453,9 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
     case IC_AutoreleaseRV:
       OptimizeAutoreleaseRVCall(F, Inst, Class);
       break;
+    case IC_Release:
+      ++NumReleasesBeforeOpt;
+      break;
     }
 
     // objc_autorelease(x) -> objc_release(x) if x is otherwise unused.
@@ -1196,12 +1475,9 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
         NewCall->setMetadata(ImpreciseReleaseMDKind,
                              MDNode::get(C, ArrayRef<Value *>()));
 
-        DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Replacing "
-                        "objc_autorelease(x) with objc_release(x) since x is "
-                        "otherwise unused.\n"
-                        "                                     Old: " << *Call <<
-                        "\n                                     New: " <<
-                        *NewCall << "\n");
+        DEBUG(dbgs() << "Replacing autorelease{,RV}(x) with objc_release(x) "
+              "since x is otherwise unused.\nOld: " << *Call << "\nNew: "
+              << *NewCall << "\n");
 
         EraseInstruction(Call);
         Inst = NewCall;
@@ -1213,9 +1489,8 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
     // a tail keyword.
     if (IsAlwaysTail(Class)) {
       Changed = true;
-      DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Adding tail keyword"
-            " to function since it can never be passed stack args: " << *Inst <<
-            "\n");
+      DEBUG(dbgs() << "Adding tail keyword to function since it can never be "
+                      "passed stack args: " << *Inst << "\n");
       cast<CallInst>(Inst)->setTailCall();
     }
 
@@ -1223,8 +1498,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
     // semantics of ARC truly do not do so.
     if (IsNeverTail(Class)) {
       Changed = true;
-      DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Removing tail "
-            "keyword from function: " << *Inst <<
+      DEBUG(dbgs() << "Removing tail keyword from function: " << *Inst <<
             "\n");
       cast<CallInst>(Inst)->setTailCall(false);
     }
@@ -1232,8 +1506,8 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
     // Set nounwind as needed.
     if (IsNoThrow(Class)) {
       Changed = true;
-      DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Found no throw"
-            " class. Setting nounwind on: " << *Inst << "\n");
+      DEBUG(dbgs() << "Found no throw class. Setting nounwind on: " << *Inst
+                   << "\n");
       cast<CallInst>(Inst)->setDoesNotThrow();
     }
 
@@ -1245,11 +1519,11 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
     const Value *Arg = GetObjCArg(Inst);
 
     // ARC calls with null are no-ops. Delete them.
-    if (isNullOrUndef(Arg)) {
+    if (IsNullOrUndef(Arg)) {
       Changed = true;
       ++NumNoops;
-      DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: ARC calls with "
-            " null are no-ops. Erasing: " << *Inst << "\n");
+      DEBUG(dbgs() << "ARC calls with  null are no-ops. Erasing: " << *Inst
+            << "\n");
       EraseInstruction(Inst);
       continue;
     }
@@ -1280,7 +1554,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
       for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
         Value *Incoming =
           StripPointerCastsAndObjCCalls(PN->getIncomingValue(i));
-        if (isNullOrUndef(Incoming))
+        if (IsNullOrUndef(Incoming))
           HasNull = true;
         else if (cast<TerminatorInst>(PN->getIncomingBlock(i)->back())
                    .getNumSuccessors() != 1) {
@@ -1334,7 +1608,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
           for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
             Value *Incoming =
               StripPointerCastsAndObjCCalls(PN->getIncomingValue(i));
-            if (!isNullOrUndef(Incoming)) {
+            if (!IsNullOrUndef(Incoming)) {
               CallInst *Clone = cast<CallInst>(CInst->clone());
               Value *Op = PN->getIncomingValue(i);
               Instruction *InsertPos = &PN->getIncomingBlock(i)->back();
@@ -1343,10 +1617,9 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
               Clone->setArgOperand(0, Op);
               Clone->insertBefore(InsertPos);
 
-              DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Cloning "
+              DEBUG(dbgs() << "Cloning "
                            << *CInst << "\n"
-                           "                                     And inserting "
-                           "clone at " << *InsertPos << "\n");
+                           "And inserting clone at " << *InsertPos << "\n");
               Worklist.push_back(std::make_pair(Clone, Incoming));
             }
           }
@@ -1358,7 +1631,65 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
       }
     } while (!Worklist.empty());
   }
-  DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Finished List.\n");
+}
+
+/// If we have a top down pointer in the S_Use state, make sure that there are
+/// no CFG hazards by checking the states of various bottom up pointers.
+static void CheckForUseCFGHazard(const Sequence SuccSSeq,
+                                 const bool SuccSRRIKnownSafe,
+                                 PtrState &S,
+                                 bool &SomeSuccHasSame,
+                                 bool &AllSuccsHaveSame,
+                                 bool &ShouldContinue) {
+  switch (SuccSSeq) {
+  case S_CanRelease: {
+    if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) {
+      S.ClearSequenceProgress();
+      break;
+    }
+    ShouldContinue = true;
+    break;
+  }
+  case S_Use:
+    SomeSuccHasSame = true;
+    break;
+  case S_Stop:
+  case S_Release:
+  case S_MovableRelease:
+    if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe)
+      AllSuccsHaveSame = false;
+    break;
+  case S_Retain:
+    llvm_unreachable("bottom-up pointer in retain state!");
+  case S_None:
+    llvm_unreachable("This should have been handled earlier.");
+  }
+}
+
+/// If we have a Top Down pointer in the S_CanRelease state, make sure that
+/// there are no CFG hazards by checking the states of various bottom up
+/// pointers.
+static void CheckForCanReleaseCFGHazard(const Sequence SuccSSeq,
+                                        const bool SuccSRRIKnownSafe,
+                                        PtrState &S,
+                                        bool &SomeSuccHasSame,
+                                        bool &AllSuccsHaveSame) {
+  switch (SuccSSeq) {
+  case S_CanRelease:
+    SomeSuccHasSame = true;
+    break;
+  case S_Stop:
+  case S_Release:
+  case S_MovableRelease:
+  case S_Use:
+    if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe)
+      AllSuccsHaveSame = false;
+    break;
+  case S_Retain:
+    llvm_unreachable("bottom-up pointer in retain state!");
+  case S_None:
+    llvm_unreachable("This should have been handled earlier.");
+  }
 }
 
 /// Check for critical edges, loop boundaries, irreducible control flow, or
@@ -1371,106 +1702,82 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB,
   // If any top-down local-use or possible-dec has a succ which is earlier in
   // the sequence, forget it.
   for (BBState::ptr_iterator I = MyStates.top_down_ptr_begin(),
-       E = MyStates.top_down_ptr_end(); I != E; ++I)
-    switch (I->second.GetSeq()) {
-    default: break;
-    case S_Use: {
-      const Value *Arg = I->first;
-      const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
-      bool SomeSuccHasSame = false;
-      bool AllSuccsHaveSame = true;
-      PtrState &S = I->second;
-      succ_const_iterator SI(TI), SE(TI, false);
-
-      for (; SI != SE; ++SI) {
-        Sequence SuccSSeq = S_None;
-        bool SuccSRRIKnownSafe = false;
-        // If VisitBottomUp has pointer information for this successor, take
-        // what we know about it.
-        DenseMap<const BasicBlock *, BBState>::iterator BBI =
-          BBStates.find(*SI);
-        assert(BBI != BBStates.end());
-        const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg);
-        SuccSSeq = SuccS.GetSeq();
-        SuccSRRIKnownSafe = SuccS.RRI.KnownSafe;
-        switch (SuccSSeq) {
-        case S_None:
-        case S_CanRelease: {
-          if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) {
-            S.ClearSequenceProgress();
-            break;
-          }
-          continue;
-        }
-        case S_Use:
-          SomeSuccHasSame = true;
-          break;
-        case S_Stop:
-        case S_Release:
-        case S_MovableRelease:
-          if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe)
-            AllSuccsHaveSame = false;
-          break;
-        case S_Retain:
-          llvm_unreachable("bottom-up pointer in retain state!");
-        }
-      }
-      // If the state at the other end of any of the successor edges
-      // matches the current state, require all edges to match. This
-      // guards against loops in the middle of a sequence.
-      if (SomeSuccHasSame && !AllSuccsHaveSame)
+         E = MyStates.top_down_ptr_end(); I != E; ++I) {
+    PtrState &S = I->second;
+    const Sequence Seq = I->second.GetSeq();
+
+    // We only care about S_Retain, S_CanRelease, and S_Use.
+    if (Seq == S_None)
+      continue;
+
+    // Make sure that if extra top down states are added in the future that this
+    // code is updated to handle it.
+    assert((Seq == S_Retain || Seq == S_CanRelease || Seq == S_Use) &&
+           "Unknown top down sequence state.");
+
+    const Value *Arg = I->first;
+    const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
+    bool SomeSuccHasSame = false;
+    bool AllSuccsHaveSame = true;
+
+    succ_const_iterator SI(TI), SE(TI, false);
+
+    for (; SI != SE; ++SI) {
+      // If VisitBottomUp has pointer information for this successor, take
+      // what we know about it.
+      const DenseMap<const BasicBlock *, BBState>::iterator BBI =
+        BBStates.find(*SI);
+      assert(BBI != BBStates.end());
+      const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg);
+      const Sequence SuccSSeq = SuccS.GetSeq();
+
+      // If bottom up, the pointer is in an S_None state, clear the sequence
+      // progress since the sequence in the bottom up state finished
+      // suggesting a mismatch in between retains/releases. This is true for
+      // all three cases that we are handling here: S_Retain, S_Use, and
+      // S_CanRelease.
+      if (SuccSSeq == S_None) {
         S.ClearSequenceProgress();
-      break;
-    }
-    case S_CanRelease: {
-      const Value *Arg = I->first;
-      const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
-      bool SomeSuccHasSame = false;
-      bool AllSuccsHaveSame = true;
-      PtrState &S = I->second;
-      succ_const_iterator SI(TI), SE(TI, false);
-
-      for (; SI != SE; ++SI) {
-        Sequence SuccSSeq = S_None;
-        bool SuccSRRIKnownSafe = false;
-        // If VisitBottomUp has pointer information for this successor, take
-        // what we know about it.
-        DenseMap<const BasicBlock *, BBState>::iterator BBI =
-          BBStates.find(*SI);
-        assert(BBI != BBStates.end());
-        const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg);
-        SuccSSeq = SuccS.GetSeq();
-        SuccSRRIKnownSafe = SuccS.RRI.KnownSafe;
-        switch (SuccSSeq) {
-        case S_None: {
-          if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) {
-            S.ClearSequenceProgress();
-            break;
-          }
+        continue;
+      }
+
+      // If we have S_Use or S_CanRelease, perform our check for cfg hazard
+      // checks.
+      const bool SuccSRRIKnownSafe = SuccS.RRI.KnownSafe;
+
+      // *NOTE* We do not use Seq from above here since we are allowing for
+      // S.GetSeq() to change while we are visiting basic blocks.
+      switch(S.GetSeq()) {
+      case S_Use: {
+        bool ShouldContinue = false;
+        CheckForUseCFGHazard(SuccSSeq, SuccSRRIKnownSafe, S,
+                             SomeSuccHasSame, AllSuccsHaveSame,
+                             ShouldContinue);
+        if (ShouldContinue)
           continue;
-        }
-        case S_CanRelease:
-          SomeSuccHasSame = true;
-          break;
-        case S_Stop:
-        case S_Release:
-        case S_MovableRelease:
-        case S_Use:
-          if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe)
-            AllSuccsHaveSame = false;
-          break;
-        case S_Retain:
-          llvm_unreachable("bottom-up pointer in retain state!");
-        }
+        break;
+      }
+      case S_CanRelease: {
+        CheckForCanReleaseCFGHazard(SuccSSeq, SuccSRRIKnownSafe,
+                                    S, SomeSuccHasSame,
+                                    AllSuccsHaveSame);
+        break;
+      }
+      case S_Retain:
+      case S_None:
+      case S_Stop:
+      case S_Release:
+      case S_MovableRelease:
+        break;
       }
-      // If the state at the other end of any of the successor edges
-      // matches the current state, require all edges to match. This
-      // guards against loops in the middle of a sequence.
-      if (SomeSuccHasSame && !AllSuccsHaveSame)
-        S.ClearSequenceProgress();
-      break;
-    }
     }
+
+    // If the state at the other end of any of the successor edges
+    // matches the current state, require all edges to match. This
+    // guards against loops in the middle of a sequence.
+    if (SomeSuccHasSame && !AllSuccsHaveSame)
+      S.ClearSequenceProgress();
+  }
 }
 
 bool
@@ -1482,6 +1789,8 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
   InstructionClass Class = GetInstructionClass(Inst);
   const Value *Arg = 0;
 
+  DEBUG(dbgs() << "Class: " << Class << "\n");
+
   switch (Class) {
   case IC_Release: {
     Arg = GetObjCArg(Inst);
@@ -1496,27 +1805,26 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
     // pairs by making PtrState hold a stack of states, but this is
     // simple and avoids adding overhead for the non-nested case.
     if (S.GetSeq() == S_Release || S.GetSeq() == S_MovableRelease) {
-      DEBUG(dbgs() << "ObjCARCOpt::VisitInstructionBottomUp: Found nested "
-                      "releases (i.e. a release pair)\n");
+      DEBUG(dbgs() << "Found nested releases (i.e. a release pair)\n");
       NestingDetected = true;
     }
 
     MDNode *ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind);
-    S.ResetSequenceProgress(ReleaseMetadata ? S_MovableRelease : S_Release);
+    Sequence NewSeq = ReleaseMetadata ? S_MovableRelease : S_Release;
+    ANNOTATE_BOTTOMUP(Inst, Arg, S.GetSeq(), NewSeq);
+    S.ResetSequenceProgress(NewSeq);
     S.RRI.ReleaseMetadata = ReleaseMetadata;
-    S.RRI.KnownSafe = S.IsKnownIncremented();
+    S.RRI.KnownSafe = S.HasKnownPositiveRefCount();
     S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall();
     S.RRI.Calls.insert(Inst);
-
     S.SetKnownPositiveRefCount();
     break;
   }
   case IC_RetainBlock:
-    // An objc_retainBlock call with just a use may need to be kept,
-    // because it may be copying a block from the stack to the heap.
-    if (!IsRetainBlockOptimizable(Inst))
-      break;
-    // FALLTHROUGH
+    // In OptimizeIndividualCalls, we have strength reduced all optimizable
+    // objc_retainBlocks to objc_retains. Thus at this point any
+    // objc_retainBlocks that we see are not optimizable.
+    break;
   case IC_Retain:
   case IC_RetainRV: {
     Arg = GetObjCArg(Inst);
@@ -1524,20 +1832,22 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
     PtrState &S = MyStates.getPtrBottomUpState(Arg);
     S.SetKnownPositiveRefCount();
 
-    switch (S.GetSeq()) {
+    Sequence OldSeq = S.GetSeq();
+    switch (OldSeq) {
     case S_Stop:
     case S_Release:
     case S_MovableRelease:
     case S_Use:
-      S.RRI.ReverseInsertPts.clear();
+      // If OldSeq is not S_Use or OldSeq is S_Use and we are tracking an
+      // imprecise release, clear our reverse insertion points.
+      if (OldSeq != S_Use || S.RRI.IsTrackingImpreciseReleases())
+        S.RRI.ReverseInsertPts.clear();
       // FALL THROUGH
     case S_CanRelease:
       // Don't do retain+release tracking for IC_RetainRV, because it's
       // better to let it remain as the first instruction after a call.
-      if (Class != IC_RetainRV) {
-        S.RRI.IsRetainBlock = Class == IC_RetainBlock;
+      if (Class != IC_RetainRV)
         Retains[Inst] = S.RRI;
-      }
       S.ClearSequenceProgress();
       break;
     case S_None:
@@ -1545,7 +1855,9 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
     case S_Retain:
       llvm_unreachable("bottom-up pointer in retain state!");
     }
-    return NestingDetected;
+    ANNOTATE_BOTTOMUP(Inst, Arg, OldSeq, S.GetSeq());
+    // A retain moving bottom up can be a use.
+    break;
   }
   case IC_AutoreleasepoolPop:
     // Conservatively, clear MyStates for all known pointers.
@@ -1571,10 +1883,13 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
 
     // Check for possible releases.
     if (CanAlterRefCount(Inst, Ptr, PA, Class)) {
-      S.ClearRefCount();
+      DEBUG(dbgs() << "CanAlterRefCount: Seq: " << Seq << "; " << *Ptr
+            << "\n");
+      S.ClearKnownPositiveRefCount();
       switch (Seq) {
       case S_Use:
         S.SetSeq(S_CanRelease);
+        ANNOTATE_BOTTOMUP(Inst, Ptr, Seq, S.GetSeq());
         continue;
       case S_CanRelease:
       case S_Release:
@@ -1592,6 +1907,8 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
     case S_Release:
     case S_MovableRelease:
       if (CanUse(Inst, Ptr, PA, Class)) {
+        DEBUG(dbgs() << "CanUse: Seq: " << Seq << "; " << *Ptr
+              << "\n");
         assert(S.RRI.ReverseInsertPts.empty());
         // If this is an invoke instruction, we're scanning it as part of
         // one of its successor blocks, since we can't insert code after it
@@ -1601,10 +1918,13 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
         else
           S.RRI.ReverseInsertPts.insert(llvm::next(BasicBlock::iterator(Inst)));
         S.SetSeq(S_Use);
-      } else if (Seq == S_Release &&
-                 (Class == IC_User || Class == IC_CallOrUser)) {
+        ANNOTATE_BOTTOMUP(Inst, Ptr, Seq, S_Use);
+      } else if (Seq == S_Release && IsUser(Class)) {
+        DEBUG(dbgs() << "PreciseReleaseUse: Seq: " << Seq << "; " << *Ptr
+              << "\n");
         // Non-movable releases depend on any possible objc pointer use.
         S.SetSeq(S_Stop);
+        ANNOTATE_BOTTOMUP(Inst, Ptr, S_Release, S_Stop);
         assert(S.RRI.ReverseInsertPts.empty());
         // As above; handle invoke specially.
         if (isa<InvokeInst>(Inst))
@@ -1614,8 +1934,12 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
       }
       break;
     case S_Stop:
-      if (CanUse(Inst, Ptr, PA, Class))
+      if (CanUse(Inst, Ptr, PA, Class)) {
+        DEBUG(dbgs() << "PreciseStopUse: Seq: " << Seq << "; " << *Ptr
+              << "\n");
         S.SetSeq(S_Use);
+        ANNOTATE_BOTTOMUP(Inst, Ptr, Seq, S_Use);
+      }
       break;
     case S_CanRelease:
     case S_Use:
@@ -1633,6 +1957,9 @@ bool
 ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
                           DenseMap<const BasicBlock *, BBState> &BBStates,
                           MapVector<Value *, RRInfo> &Retains) {
+
+  DEBUG(dbgs() << "\n== ObjCARCOpt::VisitBottomUp ==\n");
+
   bool NestingDetected = false;
   BBState &MyStates = BBStates[BB];
 
@@ -1654,6 +1981,10 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
     }
   }
 
+  // If ARC Annotations are enabled, output the current state of pointers at the
+  // bottom of the basic block.
+  ANNOTATE_BOTTOMUP_BBEND(MyStates, BB);
+
   // Visit all the instructions, bottom-up.
   for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; --I) {
     Instruction *Inst = llvm::prior(I);
@@ -1662,7 +1993,7 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
     if (isa<InvokeInst>(Inst))
       continue;
 
-    DEBUG(dbgs() << "ObjCARCOpt::VisitButtonUp: Visiting " << *Inst << "\n");
+    DEBUG(dbgs() << "Visiting " << *Inst << "\n");
 
     NestingDetected |= VisitInstructionBottomUp(Inst, BB, Retains, MyStates);
   }
@@ -1677,6 +2008,10 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
       NestingDetected |= VisitInstructionBottomUp(II, BB, Retains, MyStates);
   }
 
+  // If ARC Annotations are enabled, output the current state of pointers at the
+  // top of the basic block.
+  ANNOTATE_BOTTOMUP_BBSTART(MyStates, BB);
+
   return NestingDetected;
 }
 
@@ -1690,11 +2025,10 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
 
   switch (Class) {
   case IC_RetainBlock:
-    // An objc_retainBlock call with just a use may need to be kept,
-    // because it may be copying a block from the stack to the heap.
-    if (!IsRetainBlockOptimizable(Inst))
-      break;
-    // FALLTHROUGH
+    // In OptimizeIndividualCalls, we have strength reduced all optimizable
+    // objc_retainBlocks to objc_retains. Thus at this point any
+    // objc_retainBlocks that we see are not optimizable.
+    break;
   case IC_Retain:
   case IC_RetainRV: {
     Arg = GetObjCArg(Inst);
@@ -1714,9 +2048,9 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
       if (S.GetSeq() == S_Retain)
         NestingDetected = true;
 
+      ANNOTATE_TOPDOWN(Inst, Arg, S.GetSeq(), S_Retain);
       S.ResetSequenceProgress(S_Retain);
-      S.RRI.IsRetainBlock = Class == IC_RetainBlock;
-      S.RRI.KnownSafe = S.IsKnownIncremented();
+      S.RRI.KnownSafe = S.HasKnownPositiveRefCount();
       S.RRI.Calls.insert(Inst);
     }
 
@@ -1730,17 +2064,23 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
     Arg = GetObjCArg(Inst);
 
     PtrState &S = MyStates.getPtrTopDownState(Arg);
-    S.ClearRefCount();
+    S.ClearKnownPositiveRefCount();
+
+    Sequence OldSeq = S.GetSeq();
 
-    switch (S.GetSeq()) {
+    MDNode *ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind);
+
+    switch (OldSeq) {
     case S_Retain:
     case S_CanRelease:
-      S.RRI.ReverseInsertPts.clear();
+      if (OldSeq == S_Retain || ReleaseMetadata != 0)
+        S.RRI.ReverseInsertPts.clear();
       // FALL THROUGH
     case S_Use:
-      S.RRI.ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind);
+      S.RRI.ReleaseMetadata = ReleaseMetadata;
       S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall();
       Releases[Inst] = S.RRI;
+      ANNOTATE_TOPDOWN(Inst, Arg, S.GetSeq(), S_None);
       S.ClearSequenceProgress();
       break;
     case S_None:
@@ -1776,10 +2116,13 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
 
     // Check for possible releases.
     if (CanAlterRefCount(Inst, Ptr, PA, Class)) {
-      S.ClearRefCount();
+      DEBUG(dbgs() << "CanAlterRefCount: Seq: " << Seq << "; " << *Ptr
+            << "\n");
+      S.ClearKnownPositiveRefCount();
       switch (Seq) {
       case S_Retain:
         S.SetSeq(S_CanRelease);
+        ANNOTATE_TOPDOWN(Inst, Ptr, Seq, S_CanRelease);
         assert(S.RRI.ReverseInsertPts.empty());
         S.RRI.ReverseInsertPts.insert(Inst);
 
@@ -1801,8 +2144,12 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
     // Check for possible direct uses.
     switch (Seq) {
     case S_CanRelease:
-      if (CanUse(Inst, Ptr, PA, Class))
+      if (CanUse(Inst, Ptr, PA, Class)) {
+        DEBUG(dbgs() << "CanUse: Seq: " << Seq << "; " << *Ptr
+              << "\n");
         S.SetSeq(S_Use);
+        ANNOTATE_TOPDOWN(Inst, Ptr, Seq, S_Use);
+      }
       break;
     case S_Retain:
     case S_Use:
@@ -1822,6 +2169,7 @@ bool
 ObjCARCOpt::VisitTopDown(BasicBlock *BB,
                          DenseMap<const BasicBlock *, BBState> &BBStates,
                          DenseMap<Value *, RRInfo> &Releases) {
+  DEBUG(dbgs() << "\n== ObjCARCOpt::VisitTopDown ==\n");
   bool NestingDetected = false;
   BBState &MyStates = BBStates[BB];
 
@@ -1843,15 +2191,26 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB,
     }
   }
 
+  // If ARC Annotations are enabled, output the current state of pointers at the
+  // top of the basic block.
+  ANNOTATE_TOPDOWN_BBSTART(MyStates, BB);
+
   // Visit all the instructions, top-down.
   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
     Instruction *Inst = I;
 
-    DEBUG(dbgs() << "ObjCARCOpt::VisitTopDown: Visiting " << *Inst << "\n");
+    DEBUG(dbgs() << "Visiting " << *Inst << "\n");
 
     NestingDetected |= VisitInstructionTopDown(Inst, Releases, MyStates);
   }
 
+  // If ARC Annotations are enabled, output the current state of pointers at the
+  // bottom of the basic block.
+  ANNOTATE_TOPDOWN_BBEND(MyStates, BB);
+
+#ifdef ARC_ANNOTATIONS
+  if (!(EnableARCAnnotations && DisableCheckForCFGHazards))
+#endif
   CheckForCFGHazards(BB, BBStates, MyStates);
   return NestingDetected;
 }
@@ -1983,6 +2342,8 @@ void ObjCARCOpt::MoveCalls(Value *Arg,
   Type *ArgTy = Arg->getType();
   Type *ParamTy = PointerType::getUnqual(Type::getInt8Ty(ArgTy->getContext()));
 
+  DEBUG(dbgs() << "== ObjCARCOpt::MoveCalls ==\n");
+
   // Insert the new retain and release calls.
   for (SmallPtrSet<Instruction *, 2>::const_iterator
        PI = ReleasesToMove.ReverseInsertPts.begin(),
@@ -1991,20 +2352,12 @@ void ObjCARCOpt::MoveCalls(Value *Arg,
     Value *MyArg = ArgTy == ParamTy ? Arg :
                    new BitCastInst(Arg, ParamTy, "", InsertPt);
     CallInst *Call =
-      CallInst::Create(RetainsToMove.IsRetainBlock ?
-                         getRetainBlockCallee(M) : getRetainCallee(M),
-                       MyArg, "", InsertPt);
+      CallInst::Create(getRetainCallee(M), MyArg, "", InsertPt);
     Call->setDoesNotThrow();
-    if (RetainsToMove.IsRetainBlock)
-      Call->setMetadata(CopyOnEscapeMDKind,
-                        MDNode::get(M->getContext(), ArrayRef<Value *>()));
-    else
-      Call->setTailCall();
+    Call->setTailCall();
 
-    DEBUG(dbgs() << "ObjCARCOpt::MoveCalls: Inserting new Release: " << *Call
-                 << "\n"
-                    "                       At insertion point: " << *InsertPt
-                 << "\n");
+    DEBUG(dbgs() << "Inserting new Retain: " << *Call << "\n"
+                    "At insertion point: " << *InsertPt << "\n");
   }
   for (SmallPtrSet<Instruction *, 2>::const_iterator
        PI = RetainsToMove.ReverseInsertPts.begin(),
@@ -2021,10 +2374,8 @@ void ObjCARCOpt::MoveCalls(Value *Arg,
     if (ReleasesToMove.IsTailCallRelease)
       Call->setTailCall();
 
-    DEBUG(dbgs() << "ObjCARCOpt::MoveCalls: Inserting new Retain: " << *Call
-                 << "\n"
-                    "                       At insertion point: " << *InsertPt
-                 << "\n");
+    DEBUG(dbgs() << "Inserting new Release: " << *Call << "\n"
+                    "At insertion point: " << *InsertPt << "\n");
   }
 
   // Delete the original retain and release calls.
@@ -2034,8 +2385,7 @@ void ObjCARCOpt::MoveCalls(Value *Arg,
     Instruction *OrigRetain = *AI;
     Retains.blot(OrigRetain);
     DeadInsts.push_back(OrigRetain);
-    DEBUG(dbgs() << "ObjCARCOpt::MoveCalls: Deleting retain: " << *OrigRetain <<
-                    "\n");
+    DEBUG(dbgs() << "Deleting retain: " << *OrigRetain << "\n");
   }
   for (SmallPtrSet<Instruction *, 2>::const_iterator
        AI = ReleasesToMove.Calls.begin(),
@@ -2043,9 +2393,9 @@ void ObjCARCOpt::MoveCalls(Value *Arg,
     Instruction *OrigRelease = *AI;
     Releases.erase(OrigRelease);
     DeadInsts.push_back(OrigRelease);
-    DEBUG(dbgs() << "ObjCARCOpt::MoveCalls: Deleting release: " << *OrigRelease
-                 << "\n");
+    DEBUG(dbgs() << "Deleting release: " << *OrigRelease << "\n");
   }
+
 }
 
 bool
@@ -2075,7 +2425,6 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState>
   unsigned OldCount = 0;
   unsigned NewCount = 0;
   bool FirstRelease = true;
-  bool FirstRetain = true;
   for (;;) {
     for (SmallVectorImpl<Instruction *>::const_iterator
            NI = NewRetains.begin(), NE = NewRetains.end(); NI != NE; ++NI) {
@@ -2156,16 +2505,6 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState>
           OldDelta += PathCount;
           OldCount += PathCount;
 
-          // Merge the IsRetainBlock values.
-          if (FirstRetain) {
-            RetainsToMove.IsRetainBlock = NewReleaseRetainRRI.IsRetainBlock;
-            FirstRetain = false;
-          } else if (ReleasesToMove.IsRetainBlock !=
-                     NewReleaseRetainRRI.IsRetainBlock)
-            // It's not possible to merge the sequences if one uses
-            // objc_retain and the other uses objc_retainBlock.
-            return false;
-
           // Collect the optimal insertion points.
           if (!KnownSafe)
             for (SmallPtrSet<Instruction *, 2>::const_iterator
@@ -2210,6 +2549,12 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState>
   if (OldDelta != 0)
     return false;
 
+#ifdef ARC_ANNOTATIONS
+  // Do not move calls if ARC annotations are requested.
+  if (EnableARCAnnotations)
+    return false;
+#endif // ARC_ANNOTATIONS
+
   Changed = true;
   assert(OldCount != 0 && "Unreachable code?");
   NumRRs += OldCount - NewCount;
@@ -2228,6 +2573,8 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState>
                                  MapVector<Value *, RRInfo> &Retains,
                                  DenseMap<Value *, RRInfo> &Releases,
                                  Module *M) {
+  DEBUG(dbgs() << "\n== ObjCARCOpt::PerformCodePlacement ==\n");
+
   bool AnyPairsCompletelyEliminated = false;
   RRInfo RetainsToMove;
   RRInfo ReleasesToMove;
@@ -2243,8 +2590,7 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState>
 
     Instruction *Retain = cast<Instruction>(V);
 
-    DEBUG(dbgs() << "ObjCARCOpt::PerformCodePlacement: Visiting: " << *Retain
-          << "\n");
+    DEBUG(dbgs() << "Visiting: " << *Retain << "\n");
 
     Value *Arg = GetObjCArg(Retain);
 
@@ -2295,14 +2641,15 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState>
 
 /// Weak pointer optimizations.
 void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
+  DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeWeakCalls ==\n");
+
   // First, do memdep-style RLE and S2L optimizations. We can't use memdep
   // itself because it uses AliasAnalysis and we need to do provenance
   // queries instead.
   for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
     Instruction *Inst = &*I++;
 
-    DEBUG(dbgs() << "ObjCARCOpt::OptimizeWeakCalls: Visiting: " << *Inst <<
-          "\n");
+    DEBUG(dbgs() << "Visiting: " << *Inst << "\n");
 
     InstructionClass Class = GetBasicInstructionClass(Inst);
     if (Class != IC_LoadWeak && Class != IC_LoadWeakRetained)
@@ -2392,6 +2739,7 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
         goto clobbered;
       case IC_AutoreleasepoolPush:
       case IC_None:
+      case IC_IntrinsicUser:
       case IC_User:
         // Weak pointers are only modified through the weak entry points
         // (and arbitrary calls, which could call the weak entry points).
@@ -2449,9 +2797,6 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
     done:;
     }
   }
-
-  DEBUG(dbgs() << "ObjCARCOpt::OptimizeWeakCalls: Finished List.\n\n");
-
 }
 
 /// Identify program paths which execute sequences of retains and releases which
@@ -2476,6 +2821,88 @@ bool ObjCARCOpt::OptimizeSequences(Function &F) {
          NestingDetected;
 }
 
+/// Check if there is a dependent call earlier that does not have anything in
+/// between the Retain and the call that can affect the reference count of their
+/// shared pointer argument. Note that Retain need not be in BB.
+static bool
+HasSafePathToPredecessorCall(const Value *Arg, Instruction *Retain,
+                             SmallPtrSet<Instruction *, 4> &DepInsts,
+                             SmallPtrSet<const BasicBlock *, 4> &Visited,
+                             ProvenanceAnalysis &PA) {
+  FindDependencies(CanChangeRetainCount, Arg, Retain->getParent(), Retain,
+                   DepInsts, Visited, PA);
+  if (DepInsts.size() != 1)
+    return false;
+
+  CallInst *Call =
+    dyn_cast_or_null<CallInst>(*DepInsts.begin());
+
+  // Check that the pointer is the return value of the call.
+  if (!Call || Arg != Call)
+    return false;
+
+  // Check that the call is a regular call.
+  InstructionClass Class = GetBasicInstructionClass(Call);
+  if (Class != IC_CallOrUser && Class != IC_Call)
+    return false;
+
+  return true;
+}
+
+/// Find a dependent retain that precedes the given autorelease for which there
+/// is nothing in between the two instructions that can affect the ref count of
+/// Arg.
+static CallInst *
+FindPredecessorRetainWithSafePath(const Value *Arg, BasicBlock *BB,
+                                  Instruction *Autorelease,
+                                  SmallPtrSet<Instruction *, 4> &DepInsts,
+                                  SmallPtrSet<const BasicBlock *, 4> &Visited,
+                                  ProvenanceAnalysis &PA) {
+  FindDependencies(CanChangeRetainCount, Arg,
+                   BB, Autorelease, DepInsts, Visited, PA);
+  if (DepInsts.size() != 1)
+    return 0;
+
+  CallInst *Retain =
+    dyn_cast_or_null<CallInst>(*DepInsts.begin());
+
+  // Check that we found a retain with the same argument.
+  if (!Retain ||
+      !IsRetain(GetBasicInstructionClass(Retain)) ||
+      GetObjCArg(Retain) != Arg) {
+    return 0;
+  }
+
+  return Retain;
+}
+
+/// Look for an ``autorelease'' instruction dependent on Arg such that there are
+/// no instructions dependent on Arg that need a positive ref count in between
+/// the autorelease and the ret.
+static CallInst *
+FindPredecessorAutoreleaseWithSafePath(const Value *Arg, BasicBlock *BB,
+                                       ReturnInst *Ret,
+                                       SmallPtrSet<Instruction *, 4> &DepInsts,
+                                       SmallPtrSet<const BasicBlock *, 4> &V,
+                                       ProvenanceAnalysis &PA) {
+  FindDependencies(NeedsPositiveRetainCount, Arg,
+                   BB, Ret, DepInsts, V, PA);
+  if (DepInsts.size() != 1)
+    return 0;
+
+  CallInst *Autorelease =
+    dyn_cast_or_null<CallInst>(*DepInsts.begin());
+  if (!Autorelease)
+    return 0;
+  InstructionClass AutoreleaseClass = GetBasicInstructionClass(Autorelease);
+  if (!IsAutorelease(AutoreleaseClass))
+    return 0;
+  if (GetObjCArg(Autorelease) != Arg)
+    return 0;
+
+  return Autorelease;
+}
+
 /// Look for this pattern:
 /// \code
 ///    %call = call i8* @something(...)
@@ -2484,122 +2911,91 @@ bool ObjCARCOpt::OptimizeSequences(Function &F) {
 ///    ret i8* %3
 /// \endcode
 /// And delete the retain and autorelease.
-///
-/// Otherwise if it's just this:
-/// \code
-///    %3 = call i8* @objc_autorelease(i8* %2)
-///    ret i8* %3
-/// \endcode
-/// convert the autorelease to autoreleaseRV.
 void ObjCARCOpt::OptimizeReturns(Function &F) {
   if (!F.getReturnType()->isPointerTy())
     return;
 
+  DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeReturns ==\n");
+
   SmallPtrSet<Instruction *, 4> DependingInstructions;
   SmallPtrSet<const BasicBlock *, 4> Visited;
   for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) {
     BasicBlock *BB = FI;
     ReturnInst *Ret = dyn_cast<ReturnInst>(&BB->back());
 
-    DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Visiting: " << *Ret << "\n");
+    DEBUG(dbgs() << "Visiting: " << *Ret << "\n");
 
-    if (!Ret) continue;
+    if (!Ret)
+      continue;
 
     const Value *Arg = StripPointerCastsAndObjCCalls(Ret->getOperand(0));
-    FindDependencies(NeedsPositiveRetainCount, Arg,
-                     BB, Ret, DependingInstructions, Visited, PA);
-    if (DependingInstructions.size() != 1)
-      goto next_block;
-
-    {
-      CallInst *Autorelease =
-        dyn_cast_or_null<CallInst>(*DependingInstructions.begin());
-      if (!Autorelease)
-        goto next_block;
-      InstructionClass AutoreleaseClass = GetBasicInstructionClass(Autorelease);
-      if (!IsAutorelease(AutoreleaseClass))
-        goto next_block;
-      if (GetObjCArg(Autorelease) != Arg)
-        goto next_block;
-
-      DependingInstructions.clear();
-      Visited.clear();
-
-      // Check that there is nothing that can affect the reference
-      // count between the autorelease and the retain.
-      FindDependencies(CanChangeRetainCount, Arg,
-                       BB, Autorelease, DependingInstructions, Visited, PA);
-      if (DependingInstructions.size() != 1)
-        goto next_block;
-
-      {
-        CallInst *Retain =
-          dyn_cast_or_null<CallInst>(*DependingInstructions.begin());
-
-        // Check that we found a retain with the same argument.
-        if (!Retain ||
-            !IsRetain(GetBasicInstructionClass(Retain)) ||
-            GetObjCArg(Retain) != Arg)
-          goto next_block;
-
-        DependingInstructions.clear();
-        Visited.clear();
-
-        // Convert the autorelease to an autoreleaseRV, since it's
-        // returning the value.
-        if (AutoreleaseClass == IC_Autorelease) {
-          DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Converting autorelease "
-                          "=> autoreleaseRV since it's returning a value.\n"
-                          "                             In: " << *Autorelease
-                       << "\n");
-          Autorelease->setCalledFunction(getAutoreleaseRVCallee(F.getParent()));
-          DEBUG(dbgs() << "                             Out: " << *Autorelease
-                       << "\n");
-          Autorelease->setTailCall(); // Always tail call autoreleaseRV.
-          AutoreleaseClass = IC_AutoreleaseRV;
-        }
-
-        // Check that there is nothing that can affect the reference
-        // count between the retain and the call.
-        // Note that Retain need not be in BB.
-        FindDependencies(CanChangeRetainCount, Arg, Retain->getParent(), Retain,
-                         DependingInstructions, Visited, PA);
-        if (DependingInstructions.size() != 1)
-          goto next_block;
 
-        {
-          CallInst *Call =
-            dyn_cast_or_null<CallInst>(*DependingInstructions.begin());
+    // Look for an ``autorelease'' instruction that is a predecessor of Ret and
+    // dependent on Arg such that there are no instructions dependent on Arg
+    // that need a positive ref count in between the autorelease and Ret.
+    CallInst *Autorelease =
+      FindPredecessorAutoreleaseWithSafePath(Arg, BB, Ret,
+                                             DependingInstructions, Visited,
+                                             PA);
+    DependingInstructions.clear();
+    Visited.clear();
 
-          // Check that the pointer is the return value of the call.
-          if (!Call || Arg != Call)
-            goto next_block;
+    if (!Autorelease)
+      continue;
 
-          // Check that the call is a regular call.
-          InstructionClass Class = GetBasicInstructionClass(Call);
-          if (Class != IC_CallOrUser && Class != IC_Call)
-            goto next_block;
+    CallInst *Retain =
+      FindPredecessorRetainWithSafePath(Arg, BB, Autorelease,
+                                        DependingInstructions, Visited, PA);
+    DependingInstructions.clear();
+    Visited.clear();
 
-          // If so, we can zap the retain and autorelease.
-          Changed = true;
-          ++NumRets;
-          DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Erasing: " << *Retain
-                       << "\n                             Erasing: "
-                       << *Autorelease << "\n");
-          EraseInstruction(Retain);
-          EraseInstruction(Autorelease);
-        }
-      }
-    }
+    if (!Retain)
+      continue;
 
-  next_block:
+    // Check that there is nothing that can affect the reference count
+    // between the retain and the call.  Note that Retain need not be in BB.
+    bool HasSafePathToCall = HasSafePathToPredecessorCall(Arg, Retain,
+                                                          DependingInstructions,
+                                                          Visited, PA);
     DependingInstructions.clear();
     Visited.clear();
+
+    if (!HasSafePathToCall)
+      continue;
+
+    // If so, we can zap the retain and autorelease.
+    Changed = true;
+    ++NumRets;
+    DEBUG(dbgs() << "Erasing: " << *Retain << "\nErasing: "
+          << *Autorelease << "\n");
+    EraseInstruction(Retain);
+    EraseInstruction(Autorelease);
   }
+}
 
-  DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Finished List.\n\n");
+#ifndef NDEBUG
+void
+ObjCARCOpt::GatherStatistics(Function &F, bool AfterOptimization) {
+  llvm::Statistic &NumRetains =
+    AfterOptimization? NumRetainsAfterOpt : NumRetainsBeforeOpt;
+  llvm::Statistic &NumReleases =
+    AfterOptimization? NumReleasesAfterOpt : NumReleasesBeforeOpt;
 
+  for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
+    Instruction *Inst = &*I++;
+    switch (GetBasicInstructionClass(Inst)) {
+    default:
+      break;
+    case IC_Retain:
+      ++NumRetains;
+      break;
+    case IC_Release:
+      ++NumReleases;
+      break;
+    }
+  }
 }
+#endif
 
 bool ObjCARCOpt::doInitialization(Module &M) {
   if (!EnableARCOpts)
@@ -2617,13 +3013,20 @@ bool ObjCARCOpt::doInitialization(Module &M) {
     M.getContext().getMDKindID("clang.arc.copy_on_escape");
   NoObjCARCExceptionsMDKind =
     M.getContext().getMDKindID("clang.arc.no_objc_arc_exceptions");
+#ifdef ARC_ANNOTATIONS
+  ARCAnnotationBottomUpMDKind =
+    M.getContext().getMDKindID("llvm.arc.annotation.bottomup");
+  ARCAnnotationTopDownMDKind =
+    M.getContext().getMDKindID("llvm.arc.annotation.topdown");
+  ARCAnnotationProvenanceSourceMDKind =
+    M.getContext().getMDKindID("llvm.arc.annotation.provenancesource");
+#endif // ARC_ANNOTATIONS
 
   // Intuitively, objc_retain and others are nocapture, however in practice
   // they are not, because they return their argument value. And objc_release
   // calls finalizers which can have arbitrary side effects.
 
   // These are initialized lazily.
-  RetainRVCallee = 0;
   AutoreleaseRVCallee = 0;
   ReleaseCallee = 0;
   RetainCallee = 0;
@@ -2643,7 +3046,8 @@ bool ObjCARCOpt::runOnFunction(Function &F) {
 
   Changed = false;
 
-  DEBUG(dbgs() << "ObjCARCOpt: Visiting Function: " << F.getName() << "\n");
+  DEBUG(dbgs() << "<<< ObjCARCOpt: Visiting Function: " << F.getName() << " >>>"
+        "\n");
 
   PA.setAA(&getAnalysis<AliasAnalysis>());
 
@@ -2651,7 +3055,7 @@ bool ObjCARCOpt::runOnFunction(Function &F) {
   // when compiling code that isn't ObjC, skip these if the relevant ObjC
   // library functions aren't declared.
 
-  // Preliminary optimizations. This also computs UsedInThisFunction.
+  // Preliminary optimizations. This also computes UsedInThisFunction.
   OptimizeIndividualCalls(F);
 
   // Optimizations for weak pointers.
@@ -2678,6 +3082,13 @@ bool ObjCARCOpt::runOnFunction(Function &F) {
                             (1 << IC_AutoreleaseRV)))
     OptimizeReturns(F);
 
+  // Gather statistics after optimization.
+#ifndef NDEBUG
+  if (AreStatisticsEnabled()) {
+    GatherStatistics(F, true);
+  }
+#endif
+
   DEBUG(dbgs() << "\n");
 
   return Changed;
diff --git a/lib/Transforms/ObjCARC/ObjCARCUtil.cpp b/lib/Transforms/ObjCARC/ObjCARCUtil.cpp
index a841c64..03e12d4 100644
--- a/lib/Transforms/ObjCARC/ObjCARCUtil.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCUtil.cpp
@@ -72,6 +72,8 @@ raw_ostream &llvm::objcarc::operator<<(raw_ostream &OS,
     return OS << "IC_Call";
   case IC_User:
     return OS << "IC_User";
+  case IC_IntrinsicUser:
+    return OS << "IC_IntrinsicUser";
   case IC_None:
     return OS << "IC_None";
   }
@@ -81,10 +83,11 @@ raw_ostream &llvm::objcarc::operator<<(raw_ostream &OS,
 InstructionClass llvm::objcarc::GetFunctionClass(const Function *F) {
   Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
 
-  // No arguments.
+  // No (mandatory) arguments.
   if (AI == AE)
     return StringSwitch<InstructionClass>(F->getName())
       .Case("objc_autoreleasePoolPush",  IC_AutoreleasepoolPush)
+      .Case("clang.arc.use", IC_IntrinsicUser)
       .Default(IC_CallOrUser);
 
   // One argument.
@@ -142,6 +145,14 @@ InstructionClass llvm::objcarc::GetFunctionClass(const Function *F) {
                 return StringSwitch<InstructionClass>(F->getName())
                   .Case("objc_moveWeak",              IC_MoveWeak)
                   .Case("objc_copyWeak",              IC_CopyWeak)
+                  // Ignore annotation calls. This is important to stop the
+                  // optimizer from treating annotations as uses which would
+                  // make the state of the pointers they are attempting to
+                  // elucidate to be incorrect.
+                  .Case("llvm.arc.annotation.topdown.bbstart", IC_None)
+                  .Case("llvm.arc.annotation.topdown.bbend", IC_None)
+                  .Case("llvm.arc.annotation.bottomup.bbstart", IC_None)
+                  .Case("llvm.arc.annotation.bottomup.bbend", IC_None)
                   .Default(IC_CallOrUser);
           }
 
diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp
index 015fd2e..615c517 100644
--- a/lib/Transforms/Scalar/CodeGenPrepare.cpp
+++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp
@@ -1761,7 +1761,7 @@ bool CodeGenPrepare::OptimizeExtUses(Instruction *I) {
   if (!DefIsLiveOut)
     return false;
 
-  // Make sure non of the uses are PHI nodes.
+  // Make sure none of the uses are PHI nodes.
   for (Value::use_iterator UI = Src->use_begin(), E = Src->use_end();
        UI != E; ++UI) {
     Instruction *User = cast<Instruction>(*UI);
diff --git a/lib/Transforms/Scalar/GlobalMerge.cpp b/lib/Transforms/Scalar/GlobalMerge.cpp
index 14e463a..4796eb2 100644
--- a/lib/Transforms/Scalar/GlobalMerge.cpp
+++ b/lib/Transforms/Scalar/GlobalMerge.cpp
@@ -61,10 +61,8 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
@@ -101,7 +99,7 @@ namespace {
     /// Collect every variables marked as "used"
     void collectUsedGlobalVariables(Module &M);
 
-    /// Keep track of the GlobalVariable that are marked as "used"
+    /// Keep track of the GlobalVariable that must not be merged away
     SmallPtrSet<const GlobalVariable *, 16> MustKeepGlobalVariables;
 
   public:
@@ -113,6 +111,7 @@ namespace {
 
     virtual bool doInitialization(Module &M);
     virtual bool runOnFunction(Function &F);
+    virtual bool doFinalization(Module &M);
 
     const char *getPassName() const {
       return "Merge internal globals";
@@ -201,9 +200,8 @@ void GlobalMerge::collectUsedGlobalVariables(Module &M) {
   if (!GV || !GV->hasInitializer()) return;
 
   // Should be an array of 'i8*'.
-  const ConstantArray *InitList = dyn_cast<ConstantArray>(GV->getInitializer());
-  if (InitList == 0) return;
- 
+  const ConstantArray *InitList = cast<ConstantArray>(GV->getInitializer());
+
   for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i)
     if (const GlobalVariable *G =
         dyn_cast<GlobalVariable>(InitList->getOperand(i)->stripPointerCasts()))
@@ -211,9 +209,6 @@ void GlobalMerge::collectUsedGlobalVariables(Module &M) {
 }
 
 void GlobalMerge::setMustKeepGlobalVariables(Module &M) {
-  // If we already processed a Module, UsedGlobalVariables may have been
-  // populated. Reset the information for this module.
-  MustKeepGlobalVariables.clear();
   collectUsedGlobalVariables(M);
 
   for (Module::iterator IFn = M.begin(), IEndFn = M.end(); IFn != IEndFn;
@@ -268,8 +263,6 @@ bool GlobalMerge::doInitialization(Module &M) {
       continue;
 
     // Ignore all "required" globals:
-    // - the ones used for EH
-    // - the ones marked with "used" attribute
     if (isMustKeepGlobalVariable(I))
       continue;
 
@@ -307,6 +300,11 @@ bool GlobalMerge::runOnFunction(Function &F) {
   return false;
 }
 
+bool GlobalMerge::doFinalization(Module &M) {
+  MustKeepGlobalVariables.clear();
+  return false;
+}
+
 Pass *llvm::createGlobalMergePass(const TargetLowering *tli) {
   return new GlobalMerge(tli);
 }
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index 97fff7e..8e76c78 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -535,6 +535,45 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
         if (!SE->isLoopInvariant(ExitValue, L))
           continue;
 
+        // Computing the value outside of the loop brings no benefit if :
+        //  - it is definitely used inside the loop in a way which can not be
+        //    optimized away.
+        //  - no use outside of the loop can take advantage of hoisting the
+        //    computation out of the loop
+        if (ExitValue->getSCEVType()>=scMulExpr) {
+          unsigned NumHardInternalUses = 0;
+          unsigned NumSoftExternalUses = 0;
+          unsigned NumUses = 0;
+          for (Value::use_iterator IB=Inst->use_begin(), IE=Inst->use_end();
+               IB!=IE && NumUses<=6 ; ++IB) {
+            Instruction *UseInstr = cast<Instruction>(*IB);
+            unsigned Opc = UseInstr->getOpcode();
+            NumUses++;
+            if (L->contains(UseInstr)) {
+              if (Opc == Instruction::Call || Opc == Instruction::Ret)
+                NumHardInternalUses++;
+            } else {
+              if (Opc == Instruction::PHI) {
+                // Do not count the Phi as a use. LCSSA may have inserted
+                // plenty of trivial ones.
+                NumUses--;
+                for (Value::use_iterator PB=UseInstr->use_begin(),
+                                         PE=UseInstr->use_end();
+                     PB!=PE && NumUses<=6 ; ++PB, ++NumUses) {
+                  unsigned PhiOpc = cast<Instruction>(*PB)->getOpcode();
+                  if (PhiOpc != Instruction::Call && PhiOpc != Instruction::Ret)
+                    NumSoftExternalUses++;
+                }
+                continue;
+              }
+              if (Opc != Instruction::Call && Opc != Instruction::Ret)
+                NumSoftExternalUses++;
+            }
+          }
+          if (NumUses <= 6 && NumHardInternalUses && !NumSoftExternalUses)
+            continue;
+        }
+
         Value *ExitVal = Rewriter.expandCodeFor(ExitValue, PN->getType(), Inst);
 
         DEBUG(dbgs() << "INDVARS: RLEV: AfterLoopVal = " << *ExitVal << '\n'
diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp
index 9c67e32..0b62050 100644
--- a/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -34,13 +34,9 @@ namespace {
     }
 
     // Possibly eliminate loop L if it is dead.
-    bool runOnLoop(Loop* L, LPPassManager& LPM);
+    bool runOnLoop(Loop *L, LPPassManager &LPM);
 
-    bool IsLoopDead(Loop* L, SmallVector<BasicBlock*, 4>& exitingBlocks,
-                    SmallVector<BasicBlock*, 4>& exitBlocks,
-                    bool &Changed, BasicBlock *Preheader);
-
-    virtual void getAnalysisUsage(AnalysisUsage& AU) const {
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
       AU.addRequired<DominatorTree>();
       AU.addRequired<LoopInfo>();
       AU.addRequired<ScalarEvolution>();
@@ -53,6 +49,12 @@ namespace {
       AU.addPreservedID(LoopSimplifyID);
       AU.addPreservedID(LCSSAID);
     }
+
+  private:
+    bool isLoopDead(Loop *L, SmallVector<BasicBlock*, 4> &exitingBlocks,
+                    SmallVector<BasicBlock*, 4> &exitBlocks,
+                    bool &Changed, BasicBlock *Preheader);
+
   };
 }
 
@@ -67,18 +69,18 @@ INITIALIZE_PASS_DEPENDENCY(LCSSA)
 INITIALIZE_PASS_END(LoopDeletion, "loop-deletion",
                 "Delete dead loops", false, false)
 
-Pass* llvm::createLoopDeletionPass() {
+Pass *llvm::createLoopDeletionPass() {
   return new LoopDeletion();
 }
 
-/// IsLoopDead - Determined if a loop is dead.  This assumes that we've already
+/// isLoopDead - Determined if a loop is dead.  This assumes that we've already
 /// checked for unique exit and exiting blocks, and that the code is in LCSSA
 /// form.
-bool LoopDeletion::IsLoopDead(Loop* L,
-                              SmallVector<BasicBlock*, 4>& exitingBlocks,
-                              SmallVector<BasicBlock*, 4>& exitBlocks,
+bool LoopDeletion::isLoopDead(Loop *L,
+                              SmallVector<BasicBlock*, 4> &exitingBlocks,
+                              SmallVector<BasicBlock*, 4> &exitBlocks,
                               bool &Changed, BasicBlock *Preheader) {
-  BasicBlock* exitBlock = exitBlocks[0];
+  BasicBlock *exitBlock = exitBlocks[0];
 
   // Make sure that all PHI entries coming from the loop are loop invariant.
   // Because the code is in LCSSA form, any values used outside of the loop
@@ -86,19 +88,19 @@ bool LoopDeletion::IsLoopDead(Loop* L,
   // sufficient to guarantee that no loop-variant values are used outside
   // of the loop.
   BasicBlock::iterator BI = exitBlock->begin();
-  while (PHINode* P = dyn_cast<PHINode>(BI)) {
-    Value* incoming = P->getIncomingValueForBlock(exitingBlocks[0]);
+  while (PHINode *P = dyn_cast<PHINode>(BI)) {
+    Value *incoming = P->getIncomingValueForBlock(exitingBlocks[0]);
 
     // Make sure all exiting blocks produce the same incoming value for the exit
     // block.  If there are different incoming values for different exiting
     // blocks, then it is impossible to statically determine which value should
     // be used.
-    for (unsigned i = 1; i < exitingBlocks.size(); ++i) {
+    for (unsigned i = 1, e = exitingBlocks.size(); i < e; ++i) {
       if (incoming != P->getIncomingValueForBlock(exitingBlocks[i]))
         return false;
     }
 
-    if (Instruction* I = dyn_cast<Instruction>(incoming))
+    if (Instruction *I = dyn_cast<Instruction>(incoming))
       if (!L->makeLoopInvariant(I, Changed, Preheader->getTerminator()))
         return false;
 
@@ -127,10 +129,10 @@ bool LoopDeletion::IsLoopDead(Loop* L,
 /// so could change the halting/non-halting nature of a program.
 /// NOTE: This entire process relies pretty heavily on LoopSimplify and LCSSA
 /// in order to make various safety checks work.
-bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) {
+bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) {
   // We can only remove the loop if there is a preheader that we can
   // branch from after removing it.
-  BasicBlock* preheader = L->getLoopPreheader();
+  BasicBlock *preheader = L->getLoopPreheader();
   if (!preheader)
     return false;
 
@@ -158,19 +160,19 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) {
 
   // Finally, we have to check that the loop really is dead.
   bool Changed = false;
-  if (!IsLoopDead(L, exitingBlocks, exitBlocks, Changed, preheader))
+  if (!isLoopDead(L, exitingBlocks, exitBlocks, Changed, preheader))
     return Changed;
 
   // Don't remove loops for which we can't solve the trip count.
   // They could be infinite, in which case we'd be changing program behavior.
-  ScalarEvolution& SE = getAnalysis<ScalarEvolution>();
+  ScalarEvolution &SE = getAnalysis<ScalarEvolution>();
   const SCEV *S = SE.getMaxBackedgeTakenCount(L);
   if (isa<SCEVCouldNotCompute>(S))
     return Changed;
 
   // Now that we know the removal is safe, remove the loop by changing the
   // branch from the preheader to go to the single exit block.
-  BasicBlock* exitBlock = exitBlocks[0];
+  BasicBlock *exitBlock = exitBlocks[0];
 
   // Because we're deleting a large chunk of code at once, the sequence in which
   // we remove things is very important to avoid invalidation issues.  Don't
@@ -182,14 +184,14 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) {
   SE.forgetLoop(L);
 
   // Connect the preheader directly to the exit block.
-  TerminatorInst* TI = preheader->getTerminator();
+  TerminatorInst *TI = preheader->getTerminator();
   TI->replaceUsesOfWith(L->getHeader(), exitBlock);
 
   // Rewrite phis in the exit block to get their inputs from
   // the preheader instead of the exiting block.
-  BasicBlock* exitingBlock = exitingBlocks[0];
+  BasicBlock *exitingBlock = exitingBlocks[0];
   BasicBlock::iterator BI = exitBlock->begin();
-  while (PHINode* P = dyn_cast<PHINode>(BI)) {
+  while (PHINode *P = dyn_cast<PHINode>(BI)) {
     int j = P->getBasicBlockIndex(exitingBlock);
     assert(j >= 0 && "Can't find exiting block in exit block's phi node!");
     P->setIncomingBlock(j, preheader);
@@ -200,7 +202,7 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) {
 
   // Update the dominator tree and remove the instructions and blocks that will
   // be deleted from the reference counting scheme.
-  DominatorTree& DT = getAnalysis<DominatorTree>();
+  DominatorTree &DT = getAnalysis<DominatorTree>();
   SmallVector<DomTreeNode*, 8> ChildNodes;
   for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
        LI != LE; ++LI) {
@@ -230,7 +232,7 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) {
 
   // Finally, the blocks from loopinfo.  This has to happen late because
   // otherwise our loop iterators won't work.
-  LoopInfo& loopInfo = getAnalysis<LoopInfo>();
+  LoopInfo &loopInfo = getAnalysis<LoopInfo>();
   SmallPtrSet<BasicBlock*, 8> blocks;
   blocks.insert(L->block_begin(), L->block_end());
   for (SmallPtrSet<BasicBlock*,8>::iterator I = blocks.begin(),
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 4e4cb86..73e44d7 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -895,7 +895,7 @@ void Cost::RatePrimaryRegister(const SCEV *Reg,
   }
   if (Regs.insert(Reg)) {
     RateRegister(Reg, Regs, L, SE, DT);
-    if (isLoser())
+    if (LoserRegs && isLoser())
       LoserRegs->insert(Reg);
   }
 }
@@ -1895,15 +1895,13 @@ ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
   if (ICmpInst::isTrueWhenEqual(Pred)) {
     // Look for n+1, and grab n.
     if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
-      if (isa<ConstantInt>(BO->getOperand(1)) &&
-          cast<ConstantInt>(BO->getOperand(1))->isOne() &&
-          SE.getSCEV(BO->getOperand(0)) == MaxRHS)
-        NewRHS = BO->getOperand(0);
+      if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
+         if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
+           NewRHS = BO->getOperand(0);
     if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
-      if (isa<ConstantInt>(BO->getOperand(1)) &&
-          cast<ConstantInt>(BO->getOperand(1))->isOne() &&
-          SE.getSCEV(BO->getOperand(0)) == MaxRHS)
-        NewRHS = BO->getOperand(0);
+      if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
+        if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
+          NewRHS = BO->getOperand(0);
     if (!NewRHS)
       return Cond;
   } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
@@ -2716,6 +2714,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
   // by LSR.
   const IVInc &Head = Chain.Incs[0];
   User::op_iterator IVOpEnd = Head.UserInst->op_end();
+  // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
   User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
                                              IVOpEnd, L, SE);
   Value *IVSrc = 0;
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index 0da3746..a3c241d 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -110,6 +110,51 @@ namespace {
       }
     };
   };
+  
+  /// Utility class representing a non-constant Xor-operand. We classify
+  /// non-constant Xor-Operands into two categories:
+  ///  C1) The operand is in the form "X & C", where C is a constant and C != ~0
+  ///  C2)
+  ///    C2.1) The operand is in the form of "X | C", where C is a non-zero
+  ///          constant.
+  ///    C2.2) Any operand E which doesn't fall into C1 and C2.1, we view this
+  ///          operand as "E | 0"
+  class XorOpnd {
+  public:
+    XorOpnd(Value *V);
+    const XorOpnd &operator=(const XorOpnd &That);
+
+    bool isInvalid() const { return SymbolicPart == 0; }
+    bool isOrExpr() const { return isOr; }
+    Value *getValue() const { return OrigVal; }
+    Value *getSymbolicPart() const { return SymbolicPart; }
+    unsigned getSymbolicRank() const { return SymbolicRank; }
+    const APInt &getConstPart() const { return ConstPart; }
+
+    void Invalidate() { SymbolicPart = OrigVal = 0; }
+    void setSymbolicRank(unsigned R) { SymbolicRank = R; }
+
+    // Sort the XorOpnd-Pointer in ascending order of symbolic-value-rank.
+    // The purpose is twofold:
+    // 1) Cluster together the operands sharing the same symbolic-value.
+    // 2) Operand having smaller symbolic-value-rank is permuted earlier, which 
+    //   could potentially shorten crital path, and expose more loop-invariants.
+    //   Note that values' rank are basically defined in RPO order (FIXME). 
+    //   So, if Rank(X) < Rank(Y) < Rank(Z), it means X is defined earlier 
+    //   than Y which is defined earlier than Z. Permute "x | 1", "Y & 2",
+    //   "z" in the order of X-Y-Z is better than any other orders.
+    struct PtrSortFunctor {
+      bool operator()(XorOpnd * const &LHS, XorOpnd * const &RHS) {
+        return LHS->getSymbolicRank() < RHS->getSymbolicRank();
+      }
+    };
+  private:
+    Value *OrigVal;
+    Value *SymbolicPart;
+    APInt ConstPart;
+    unsigned SymbolicRank;
+    bool isOr;
+  };
 }
 
 namespace {
@@ -137,6 +182,11 @@ namespace {
     Value *OptimizeExpression(BinaryOperator *I,
                               SmallVectorImpl<ValueEntry> &Ops);
     Value *OptimizeAdd(Instruction *I, SmallVectorImpl<ValueEntry> &Ops);
+    Value *OptimizeXor(Instruction *I, SmallVectorImpl<ValueEntry> &Ops);
+    bool CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, APInt &ConstOpnd,
+                        Value *&Res);
+    bool CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, XorOpnd *Opnd2,
+                        APInt &ConstOpnd, Value *&Res);
     bool collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
                                 SmallVectorImpl<Factor> &Factors);
     Value *buildMinimalMultiplyDAG(IRBuilder<> &Builder,
@@ -148,6 +198,42 @@ namespace {
   };
 }
 
+XorOpnd::XorOpnd(Value *V) {
+  assert(!isa<ConstantInt>(V) && "No ConstantInt");
+  OrigVal = V;
+  Instruction *I = dyn_cast<Instruction>(V);
+  SymbolicRank = 0;
+
+  if (I && (I->getOpcode() == Instruction::Or ||
+            I->getOpcode() == Instruction::And)) {
+    Value *V0 = I->getOperand(0);
+    Value *V1 = I->getOperand(1);
+    if (isa<ConstantInt>(V0))
+      std::swap(V0, V1);
+
+    if (ConstantInt *C = dyn_cast<ConstantInt>(V1)) {
+      ConstPart = C->getValue();
+      SymbolicPart = V0;
+      isOr = (I->getOpcode() == Instruction::Or);
+      return;
+    }
+  }
+
+  // view the operand as "V | 0"
+  SymbolicPart = V;
+  ConstPart = APInt::getNullValue(V->getType()->getIntegerBitWidth());
+  isOr = true;
+}
+
+const XorOpnd &XorOpnd::operator=(const XorOpnd &That) {
+  OrigVal = That.OrigVal;
+  SymbolicPart = That.SymbolicPart;
+  ConstPart = That.ConstPart;
+  SymbolicRank = That.SymbolicRank;
+  isOr = That.isOr;
+  return *this;
+}
+
 char Reassociate::ID = 0;
 INITIALIZE_PASS(Reassociate, "reassociate",
                 "Reassociate expressions", false, false)
@@ -1040,6 +1126,250 @@ static Value *OptimizeAndOrXor(unsigned Opcode,
   return 0;
 }
 
+/// Helper funciton of CombineXorOpnd(). It creates a bitwise-and
+/// instruction with the given two operands, and return the resulting
+/// instruction. There are two special cases: 1) if the constant operand is 0,
+/// it will return NULL. 2) if the constant is ~0, the symbolic operand will
+/// be returned.
+static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd, 
+                             const APInt &ConstOpnd) {
+  if (ConstOpnd != 0) {
+    if (!ConstOpnd.isAllOnesValue()) {
+      LLVMContext &Ctx = Opnd->getType()->getContext();
+      Instruction *I;
+      I = BinaryOperator::CreateAnd(Opnd, ConstantInt::get(Ctx, ConstOpnd),
+                                    "and.ra", InsertBefore);
+      I->setDebugLoc(InsertBefore->getDebugLoc());
+      return I;
+    }
+    return Opnd;
+  }
+  return 0;
+}
+
+// Helper function of OptimizeXor(). It tries to simplify "Opnd1 ^ ConstOpnd"
+// into "R ^ C", where C would be 0, and R is a symbolic value.
+//
+// If it was successful, true is returned, and the "R" and "C" is returned
+// via "Res" and "ConstOpnd", respectively; otherwise, false is returned,
+// and both "Res" and "ConstOpnd" remain unchanged.
+//  
+bool Reassociate::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
+                                 APInt &ConstOpnd, Value *&Res) {
+  // Xor-Rule 1: (x | c1) ^ c2 = (x | c1) ^ (c1 ^ c1) ^ c2 
+  //                       = ((x | c1) ^ c1) ^ (c1 ^ c2)
+  //                       = (x & ~c1) ^ (c1 ^ c2)
+  // It is useful only when c1 == c2.
+  if (Opnd1->isOrExpr() && Opnd1->getConstPart() != 0) {
+    if (!Opnd1->getValue()->hasOneUse())
+      return false;
+
+    const APInt &C1 = Opnd1->getConstPart();
+    if (C1 != ConstOpnd)
+      return false;
+
+    Value *X = Opnd1->getSymbolicPart();
+    Res = createAndInstr(I, X, ~C1);
+    // ConstOpnd was C2, now C1 ^ C2.
+    ConstOpnd ^= C1;
+
+    if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue()))
+      RedoInsts.insert(T);
+    return true;
+  }
+  return false;
+}
+
+                           
+// Helper function of OptimizeXor(). It tries to simplify
+// "Opnd1 ^ Opnd2 ^ ConstOpnd" into "R ^ C", where C would be 0, and R is a
+// symbolic value. 
+// 
+// If it was successful, true is returned, and the "R" and "C" is returned 
+// via "Res" and "ConstOpnd", respectively (If the entire expression is
+// evaluated to a constant, the Res is set to NULL); otherwise, false is
+// returned, and both "Res" and "ConstOpnd" remain unchanged.
+bool Reassociate::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, XorOpnd *Opnd2,
+                                 APInt &ConstOpnd, Value *&Res) {
+  Value *X = Opnd1->getSymbolicPart();
+  if (X != Opnd2->getSymbolicPart())
+    return false;
+
+  // This many instruction become dead.(At least "Opnd1 ^ Opnd2" will die.)
+  int DeadInstNum = 1;
+  if (Opnd1->getValue()->hasOneUse())
+    DeadInstNum++;
+  if (Opnd2->getValue()->hasOneUse())
+    DeadInstNum++;
+
+  // Xor-Rule 2:
+  //  (x | c1) ^ (x & c2)
+  //   = (x|c1) ^ (x&c2) ^ (c1 ^ c1) = ((x|c1) ^ c1) ^ (x & c2) ^ c1
+  //   = (x & ~c1) ^ (x & c2) ^ c1               // Xor-Rule 1
+  //   = (x & c3) ^ c1, where c3 = ~c1 ^ c2      // Xor-rule 3
+  //
+  if (Opnd1->isOrExpr() != Opnd2->isOrExpr()) {
+    if (Opnd2->isOrExpr())
+      std::swap(Opnd1, Opnd2);
+
+    const APInt &C1 = Opnd1->getConstPart();
+    const APInt &C2 = Opnd2->getConstPart();
+    APInt C3((~C1) ^ C2);
+
+    // Do not increase code size!
+    if (C3 != 0 && !C3.isAllOnesValue()) {
+      int NewInstNum = ConstOpnd != 0 ? 1 : 2;
+      if (NewInstNum > DeadInstNum)
+        return false;
+    }
+
+    Res = createAndInstr(I, X, C3);
+    ConstOpnd ^= C1;
+
+  } else if (Opnd1->isOrExpr()) {
+    // Xor-Rule 3: (x | c1) ^ (x | c2) = (x & c3) ^ c3 where c3 = c1 ^ c2
+    //
+    const APInt &C1 = Opnd1->getConstPart();
+    const APInt &C2 = Opnd2->getConstPart();
+    APInt C3 = C1 ^ C2;
+    
+    // Do not increase code size
+    if (C3 != 0 && !C3.isAllOnesValue()) {
+      int NewInstNum = ConstOpnd != 0 ? 1 : 2;
+      if (NewInstNum > DeadInstNum)
+        return false;
+    }
+
+    Res = createAndInstr(I, X, C3);
+    ConstOpnd ^= C3;
+  } else {
+    // Xor-Rule 4: (x & c1) ^ (x & c2) = (x & (c1^c2))
+    //
+    const APInt &C1 = Opnd1->getConstPart();
+    const APInt &C2 = Opnd2->getConstPart();
+    APInt C3 = C1 ^ C2;
+    Res = createAndInstr(I, X, C3);
+  }
+
+  // Put the original operands in the Redo list; hope they will be deleted
+  // as dead code.
+  if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue()))
+    RedoInsts.insert(T);
+  if (Instruction *T = dyn_cast<Instruction>(Opnd2->getValue()))
+    RedoInsts.insert(T);
+
+  return true;
+}
+
+/// Optimize a series of operands to an 'xor' instruction. If it can be reduced
+/// to a single Value, it is returned, otherwise the Ops list is mutated as
+/// necessary.
+Value *Reassociate::OptimizeXor(Instruction *I,
+                                SmallVectorImpl<ValueEntry> &Ops) {
+  if (Value *V = OptimizeAndOrXor(Instruction::Xor, Ops))
+    return V;
+      
+  if (Ops.size() == 1)
+    return 0;
+
+  SmallVector<XorOpnd, 8> Opnds;
+  SmallVector<XorOpnd*, 8> OpndPtrs;
+  Type *Ty = Ops[0].Op->getType();
+  APInt ConstOpnd(Ty->getIntegerBitWidth(), 0);
+
+  // Step 1: Convert ValueEntry to XorOpnd
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+    Value *V = Ops[i].Op;
+    if (!isa<ConstantInt>(V)) {
+      XorOpnd O(V);
+      O.setSymbolicRank(getRank(O.getSymbolicPart()));
+      Opnds.push_back(O);
+    } else
+      ConstOpnd ^= cast<ConstantInt>(V)->getValue();
+  }
+
+  // NOTE: From this point on, do *NOT* add/delete element to/from "Opnds".
+  //  It would otherwise invalidate the "Opnds"'s iterator, and hence invalidate
+  //  the "OpndPtrs" as well. For the similar reason, do not fuse this loop
+  //  with the previous loop --- the iterator of the "Opnds" may be invalidated
+  //  when new elements are added to the vector.
+  for (unsigned i = 0, e = Opnds.size(); i != e; ++i)
+    OpndPtrs.push_back(&Opnds[i]);
+
+  // Step 2: Sort the Xor-Operands in a way such that the operands containing
+  //  the same symbolic value cluster together. For instance, the input operand
+  //  sequence ("x | 123", "y & 456", "x & 789") will be sorted into:
+  //  ("x | 123", "x & 789", "y & 456").
+  std::sort(OpndPtrs.begin(), OpndPtrs.end(), XorOpnd::PtrSortFunctor());
+
+  // Step 3: Combine adjacent operands
+  XorOpnd *PrevOpnd = 0;
+  bool Changed = false;
+  for (unsigned i = 0, e = Opnds.size(); i < e; i++) {
+    XorOpnd *CurrOpnd = OpndPtrs[i];
+    // The combined value
+    Value *CV;
+
+    // Step 3.1: Try simplifying "CurrOpnd ^ ConstOpnd"
+    if (ConstOpnd != 0 && CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) {
+      Changed = true;
+      if (CV)
+        *CurrOpnd = XorOpnd(CV);
+      else {
+        CurrOpnd->Invalidate();
+        continue;
+      }
+    }
+
+    if (!PrevOpnd || CurrOpnd->getSymbolicPart() != PrevOpnd->getSymbolicPart()) {
+      PrevOpnd = CurrOpnd;
+      continue;
+    }
+
+    // step 3.2: When previous and current operands share the same symbolic
+    //  value, try to simplify "PrevOpnd ^ CurrOpnd ^ ConstOpnd" 
+    //    
+    if (CombineXorOpnd(I, CurrOpnd, PrevOpnd, ConstOpnd, CV)) {
+      // Remove previous operand
+      PrevOpnd->Invalidate();
+      if (CV) {
+        *CurrOpnd = XorOpnd(CV);
+        PrevOpnd = CurrOpnd;
+      } else {
+        CurrOpnd->Invalidate();
+        PrevOpnd = 0;
+      }
+      Changed = true;
+    }
+  }
+
+  // Step 4: Reassemble the Ops
+  if (Changed) {
+    Ops.clear();
+    for (unsigned int i = 0, e = Opnds.size(); i < e; i++) {
+      XorOpnd &O = Opnds[i];
+      if (O.isInvalid())
+        continue;
+      ValueEntry VE(getRank(O.getValue()), O.getValue());
+      Ops.push_back(VE);
+    }
+    if (ConstOpnd != 0) {
+      Value *C = ConstantInt::get(Ty->getContext(), ConstOpnd);
+      ValueEntry VE(getRank(C), C);
+      Ops.push_back(VE);
+    }
+    int Sz = Ops.size();
+    if (Sz == 1)
+      return Ops.back().Op;
+    else if (Sz == 0) {
+      assert(ConstOpnd == 0);
+      return ConstantInt::get(Ty->getContext(), ConstOpnd);
+    }
+  }
+
+  return 0;
+}
+
 /// OptimizeAdd - Optimize a series of operands to an 'add' instruction.  This
 /// optimizes based on identities.  If it can be reduced to a single Value, it
 /// is returned, otherwise the Ops list is mutated as necessary.
@@ -1431,11 +1761,15 @@ Value *Reassociate::OptimizeExpression(BinaryOperator *I,
   default: break;
   case Instruction::And:
   case Instruction::Or:
-  case Instruction::Xor:
     if (Value *Result = OptimizeAndOrXor(Opcode, Ops))
       return Result;
     break;
 
+  case Instruction::Xor:
+    if (Value *Result = OptimizeXor(I, Ops))
+      return Result;
+    break;
+
   case Instruction::Add:
     if (Value *Result = OptimizeAdd(I, Ops))
       return Result;
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index 0e57e5c..d073e78 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -57,11 +57,15 @@
 using namespace llvm;
 
 STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
-STATISTIC(NumNewAllocas,      "Number of new, smaller allocas introduced");
-STATISTIC(NumPromoted,        "Number of allocas promoted to SSA values");
+STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
+STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions");
+STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses found");
+STATISTIC(MaxPartitionUsesPerAlloca, "Maximum number of partition uses");
+STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
+STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
 STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
-STATISTIC(NumDeleted,         "Number of instructions deleted");
-STATISTIC(NumVectorized,      "Number of vectorized aggregates");
+STATISTIC(NumDeleted, "Number of instructions deleted");
+STATISTIC(NumVectorized, "Number of vectorized aggregates");
 
 /// Hidden option to force the pass to not use DomTree and mem2reg, instead
 /// forming SSA values through the SSAUpdater infrastructure.
@@ -69,6 +73,43 @@ static cl::opt<bool>
 ForceSSAUpdater("force-ssa-updater", cl::init(false), cl::Hidden);
 
 namespace {
+/// \brief A custom IRBuilder inserter which prefixes all names if they are
+/// preserved.
+template <bool preserveNames = true>
+class IRBuilderPrefixedInserter :
+    public IRBuilderDefaultInserter<preserveNames> {
+  std::string Prefix;
+
+public:
+  void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
+
+protected:
+  void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB,
+                    BasicBlock::iterator InsertPt) const {
+    IRBuilderDefaultInserter<preserveNames>::InsertHelper(
+        I, Name.isTriviallyEmpty() ? Name : Prefix + Name, BB, InsertPt);
+  }
+};
+
+// Specialization for not preserving the name is trivial.
+template <>
+class IRBuilderPrefixedInserter<false> :
+    public IRBuilderDefaultInserter<false> {
+public:
+  void SetNamePrefix(const Twine &P) {}
+};
+
+/// \brief Provide a typedef for IRBuilder that drops names in release builds.
+#ifndef NDEBUG
+typedef llvm::IRBuilder<true, ConstantFolder,
+                        IRBuilderPrefixedInserter<true> > IRBuilderTy;
+#else
+typedef llvm::IRBuilder<false, ConstantFolder,
+                        IRBuilderPrefixedInserter<false> > IRBuilderTy;
+#endif
+}
+
+namespace {
 /// \brief A common base class for representing a half-open byte range.
 struct ByteRange {
   /// \brief The beginning offset of the range.
@@ -888,7 +929,7 @@ private:
     uint64_t Size = Length ? Length->getLimitedValue()
                            : AllocSize - Offset.getLimitedValue();
 
-    MemTransferOffsets &Offsets = P.MemTransferInstData[&II];
+    const MemTransferOffsets &Offsets = P.MemTransferInstData[&II];
     if (!II.isVolatile() && Offsets.DestEnd && Offsets.SourceEnd &&
         Offsets.DestBegin == Offsets.SourceBegin)
       return markAsDead(II); // Skip identity transfers without side-effects.
@@ -1097,6 +1138,10 @@ AllocaPartitioning::AllocaPartitioning(const DataLayout &TD, AllocaInst &AI)
     splitAndMergePartitions();
   }
 
+  // Record how many partitions we end up with.
+  NumAllocaPartitions += Partitions.size();
+  MaxPartitionsPerAlloca = std::max<unsigned>(Partitions.size(), MaxPartitionsPerAlloca);
+
   // Now build up the user lists for each of these disjoint partitions by
   // re-walking the recursive users of the alloca.
   Uses.resize(Partitions.size());
@@ -1104,6 +1149,14 @@ AllocaPartitioning::AllocaPartitioning(const DataLayout &TD, AllocaInst &AI)
   PtrI = UB.visitPtr(AI);
   assert(!PtrI.isEscaped() && "Previously analyzed pointer now escapes!");
   assert(!PtrI.isAborted() && "Early aborted the visit of the pointer.");
+
+  unsigned NumUses = 0;
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_STATS)
+  for (unsigned Idx = 0, Size = Uses.size(); Idx != Size; ++Idx)
+    NumUses += Uses[Idx].size();
+#endif
+  NumAllocaPartitionUses += NumUses;
+  MaxPartitionUsesPerAlloca = std::max<unsigned>(NumUses, MaxPartitionUsesPerAlloca);
 }
 
 Type *AllocaPartitioning::getCommonType(iterator I) const {
@@ -1265,12 +1318,12 @@ public:
         // may be zapped by an optimization pass in future.
         if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0)))
           Arg = dyn_cast<Argument>(ZExt->getOperand(0));
-        if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))
+        else if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))
           Arg = dyn_cast<Argument>(SExt->getOperand(0));
         if (!Arg)
-          Arg = SI->getOperand(0);
+          Arg = SI->getValueOperand();
       } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
-        Arg = LI->getOperand(0);
+        Arg = LI->getPointerOperand();
       } else {
         continue;
       }
@@ -1494,7 +1547,7 @@ private:
     assert(!Loads.empty());
 
     Type *LoadTy = cast<PointerType>(PN.getType())->getElementType();
-    IRBuilder<> PHIBuilder(&PN);
+    IRBuilderTy PHIBuilder(&PN);
     PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(),
                                           PN.getName() + ".sroa.speculated");
 
@@ -1517,7 +1570,7 @@ private:
       TerminatorInst *TI = Pred->getTerminator();
       Use *InUse = &PN.getOperandUse(PN.getOperandNumForIncomingValue(Idx));
       Value *InVal = PN.getIncomingValue(Idx);
-      IRBuilder<> PredBuilder(TI);
+      IRBuilderTy PredBuilder(TI);
 
       LoadInst *Load
         = PredBuilder.CreateLoad(InVal, (PN.getName() + ".sroa.speculate.load." +
@@ -1598,7 +1651,7 @@ private:
     if (!isSafeSelectToSpeculate(SI, Loads))
       return;
 
-    IRBuilder<> IRB(&SI);
+    IRBuilderTy IRB(&SI);
     Use *Ops[2] = { &SI.getOperandUse(1), &SI.getOperandUse(2) };
     AllocaPartitioning::iterator PIs[2];
     PartitionUse PUs[2];
@@ -1662,9 +1715,8 @@ private:
 ///
 /// This will return the BasePtr if that is valid, or build a new GEP
 /// instruction using the IRBuilder if GEP-ing is needed.
-static Value *buildGEP(IRBuilder<> &IRB, Value *BasePtr,
-                       SmallVectorImpl<Value *> &Indices,
-                       const Twine &Prefix) {
+static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
+                       SmallVectorImpl<Value *> &Indices) {
   if (Indices.empty())
     return BasePtr;
 
@@ -1673,7 +1725,7 @@ static Value *buildGEP(IRBuilder<> &IRB, Value *BasePtr,
   if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero())
     return BasePtr;
 
-  return IRB.CreateInBoundsGEP(BasePtr, Indices, Prefix + ".idx");
+  return IRB.CreateInBoundsGEP(BasePtr, Indices, "idx");
 }
 
 /// \brief Get a natural GEP off of the BasePtr walking through Ty toward
@@ -1685,12 +1737,11 @@ static Value *buildGEP(IRBuilder<> &IRB, Value *BasePtr,
 /// TargetTy. If we can't find one with the same type, we at least try to use
 /// one with the same size. If none of that works, we just produce the GEP as
 /// indicated by Indices to have the correct offset.
-static Value *getNaturalGEPWithType(IRBuilder<> &IRB, const DataLayout &TD,
+static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &TD,
                                     Value *BasePtr, Type *Ty, Type *TargetTy,
-                                    SmallVectorImpl<Value *> &Indices,
-                                    const Twine &Prefix) {
+                                    SmallVectorImpl<Value *> &Indices) {
   if (Ty == TargetTy)
-    return buildGEP(IRB, BasePtr, Indices, Prefix);
+    return buildGEP(IRB, BasePtr, Indices);
 
   // See if we can descend into a struct and locate a field with the correct
   // type.
@@ -1717,20 +1768,19 @@ static Value *getNaturalGEPWithType(IRBuilder<> &IRB, const DataLayout &TD,
   if (ElementTy != TargetTy)
     Indices.erase(Indices.end() - NumLayers, Indices.end());
 
-  return buildGEP(IRB, BasePtr, Indices, Prefix);
+  return buildGEP(IRB, BasePtr, Indices);
 }
 
 /// \brief Recursively compute indices for a natural GEP.
 ///
 /// This is the recursive step for getNaturalGEPWithOffset that walks down the
 /// element types adding appropriate indices for the GEP.
-static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD,
+static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &TD,
                                        Value *Ptr, Type *Ty, APInt &Offset,
                                        Type *TargetTy,
-                                       SmallVectorImpl<Value *> &Indices,
-                                       const Twine &Prefix) {
+                                       SmallVectorImpl<Value *> &Indices) {
   if (Offset == 0)
-    return getNaturalGEPWithType(IRB, TD, Ptr, Ty, TargetTy, Indices, Prefix);
+    return getNaturalGEPWithType(IRB, TD, Ptr, Ty, TargetTy, Indices);
 
   // We can't recurse through pointer types.
   if (Ty->isPointerTy())
@@ -1750,7 +1800,7 @@ static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD,
     Offset -= NumSkippedElements * ElementSize;
     Indices.push_back(IRB.getInt(NumSkippedElements));
     return getNaturalGEPRecursively(IRB, TD, Ptr, VecTy->getElementType(),
-                                    Offset, TargetTy, Indices, Prefix);
+                                    Offset, TargetTy, Indices);
   }
 
   if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
@@ -1763,7 +1813,7 @@ static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD,
     Offset -= NumSkippedElements * ElementSize;
     Indices.push_back(IRB.getInt(NumSkippedElements));
     return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy,
-                                    Indices, Prefix);
+                                    Indices);
   }
 
   StructType *STy = dyn_cast<StructType>(Ty);
@@ -1782,7 +1832,7 @@ static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD,
 
   Indices.push_back(IRB.getInt32(Index));
   return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy,
-                                  Indices, Prefix);
+                                  Indices);
 }
 
 /// \brief Get a natural GEP from a base pointer to a particular offset and
@@ -1795,10 +1845,9 @@ static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD,
 /// Indices, and setting Ty to the result subtype.
 ///
 /// If no natural GEP can be constructed, this function returns null.
-static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const DataLayout &TD,
+static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &TD,
                                       Value *Ptr, APInt Offset, Type *TargetTy,
-                                      SmallVectorImpl<Value *> &Indices,
-                                      const Twine &Prefix) {
+                                      SmallVectorImpl<Value *> &Indices) {
   PointerType *Ty = cast<PointerType>(Ptr->getType());
 
   // Don't consider any GEPs through an i8* as natural unless the TargetTy is
@@ -1817,7 +1866,7 @@ static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const DataLayout &TD,
   Offset -= NumSkippedElements * ElementSize;
   Indices.push_back(IRB.getInt(NumSkippedElements));
   return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy,
-                                  Indices, Prefix);
+                                  Indices);
 }
 
 /// \brief Compute an adjusted pointer from Ptr by Offset bytes where the
@@ -1835,9 +1884,8 @@ static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const DataLayout &TD,
 /// properties. The algorithm tries to fold as many constant indices into
 /// a single GEP as possible, thus making each GEP more independent of the
 /// surrounding code.
-static Value *getAdjustedPtr(IRBuilder<> &IRB, const DataLayout &TD,
-                             Value *Ptr, APInt Offset, Type *PointerTy,
-                             const Twine &Prefix) {
+static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &TD,
+                             Value *Ptr, APInt Offset, Type *PointerTy) {
   // Even though we don't look through PHI nodes, we could be called on an
   // instruction in an unreachable block, which may be on a cycle.
   SmallPtrSet<Value *, 4> Visited;
@@ -1871,7 +1919,7 @@ static Value *getAdjustedPtr(IRBuilder<> &IRB, const DataLayout &TD,
     // See if we can perform a natural GEP here.
     Indices.clear();
     if (Value *P = getNaturalGEPWithOffset(IRB, TD, Ptr, Offset, TargetTy,
-                                           Indices, Prefix)) {
+                                           Indices)) {
       if (P->getType() == PointerTy) {
         // Zap any offset pointer that we ended up computing in previous rounds.
         if (OffsetPtr && OffsetPtr->use_empty())
@@ -1906,19 +1954,19 @@ static Value *getAdjustedPtr(IRBuilder<> &IRB, const DataLayout &TD,
   if (!OffsetPtr) {
     if (!Int8Ptr) {
       Int8Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy(),
-                                  Prefix + ".raw_cast");
+                                  "raw_cast");
       Int8PtrOffset = Offset;
     }
 
     OffsetPtr = Int8PtrOffset == 0 ? Int8Ptr :
       IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset),
-                            Prefix + ".raw_idx");
+                            "raw_idx");
   }
   Ptr = OffsetPtr;
 
   // On the off chance we were targeting i8*, guard the bitcast here.
   if (Ptr->getType() != PointerTy)
-    Ptr = IRB.CreateBitCast(Ptr, PointerTy, Prefix + ".cast");
+    Ptr = IRB.CreateBitCast(Ptr, PointerTy, "cast");
 
   return Ptr;
 }
@@ -1958,7 +2006,7 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
 /// This will try various different casting techniques, such as bitcasts,
 /// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
 /// two types for viability with this routine.
-static Value *convertValue(const DataLayout &DL, IRBuilder<> &IRB, Value *V,
+static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
                            Type *Ty) {
   assert(canConvertValue(DL, V->getType(), Ty) &&
          "Value not convertable to type");
@@ -2156,7 +2204,7 @@ static bool isIntegerWideningViable(const DataLayout &TD,
   return WholeAllocaOp;
 }
 
-static Value *extractInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *V,
+static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
                              IntegerType *Ty, uint64_t Offset,
                              const Twine &Name) {
   DEBUG(dbgs() << "       start: " << *V << "\n");
@@ -2179,7 +2227,7 @@ static Value *extractInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *V,
   return V;
 }
 
-static Value *insertInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *Old,
+static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
                             Value *V, uint64_t Offset, const Twine &Name) {
   IntegerType *IntTy = cast<IntegerType>(Old->getType());
   IntegerType *Ty = cast<IntegerType>(V->getType());
@@ -2210,7 +2258,7 @@ static Value *insertInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *Old,
   return V;
 }
 
-static Value *extractVector(IRBuilder<> &IRB, Value *V,
+static Value *extractVector(IRBuilderTy &IRB, Value *V,
                             unsigned BeginIndex, unsigned EndIndex,
                             const Twine &Name) {
   VectorType *VecTy = cast<VectorType>(V->getType());
@@ -2238,7 +2286,7 @@ static Value *extractVector(IRBuilder<> &IRB, Value *V,
   return V;
 }
 
-static Value *insertVector(IRBuilder<> &IRB, Value *Old, Value *V,
+static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
                            unsigned BeginIndex, const Twine &Name) {
   VectorType *VecTy = cast<VectorType>(Old->getType());
   assert(VecTy && "Can only insert a vector into a vector");
@@ -2274,17 +2322,15 @@ static Value *insertVector(IRBuilder<> &IRB, Value *Old, Value *V,
   V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
                               ConstantVector::get(Mask),
                               Name + ".expand");
-  DEBUG(dbgs() << "    shuffle1: " << *V << "\n");
+  DEBUG(dbgs() << "    shuffle: " << *V << "\n");
 
   Mask.clear();
   for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
-    if (i >= BeginIndex && i < EndIndex)
-      Mask.push_back(IRB.getInt32(i));
-    else
-      Mask.push_back(IRB.getInt32(i + VecTy->getNumElements()));
-  V = IRB.CreateShuffleVector(V, Old, ConstantVector::get(Mask),
-                              Name + "insert");
-  DEBUG(dbgs() << "    shuffle2: " << *V << "\n");
+    Mask.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex));
+
+  V = IRB.CreateSelect(ConstantVector::get(Mask), V, Old, Name + "blend");
+
+  DEBUG(dbgs() << "    blend: " << *V << "\n");
   return V;
 }
 
@@ -2332,8 +2378,9 @@ class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter,
   Use *OldUse;
   Instruction *OldPtr;
 
-  // The name prefix to use when rewriting instructions for this alloca.
-  std::string NamePrefix;
+  // Utility IR builder, whose name prefix is setup for each visited use, and
+  // the insertion point is set to point to the user.
+  IRBuilderTy IRB;
 
 public:
   AllocaPartitionRewriter(const DataLayout &TD, AllocaPartitioning &P,
@@ -2346,7 +2393,8 @@ public:
       NewAllocaEndOffset(NewEndOffset),
       NewAllocaTy(NewAI.getAllocatedType()),
       VecTy(), ElementTy(), ElementSize(), IntTy(),
-      BeginOffset(), EndOffset(), IsSplit(), OldUse(), OldPtr() {
+      BeginOffset(), EndOffset(), IsSplit(), OldUse(), OldPtr(),
+      IRB(NewAI.getContext(), ConstantFolder()) {
   }
 
   /// \brief Visit the users of the alloca partition and rewrite them.
@@ -2375,7 +2423,13 @@ public:
       IsSplit = I->isSplit();
       OldUse = I->getUse();
       OldPtr = cast<Instruction>(OldUse->get());
-      NamePrefix = (Twine(NewAI.getName()) + "." + Twine(BeginOffset)).str();
+
+      Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
+      IRB.SetInsertPoint(OldUserI);
+      IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
+      IRB.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) +
+                        ".");
+
       CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
     }
     if (VecTy) {
@@ -2398,14 +2452,10 @@ private:
     llvm_unreachable("No rewrite rule for this instruction!");
   }
 
-  Twine getName(const Twine &Suffix) {
-    return NamePrefix + Suffix;
-  }
-
-  Value *getAdjustedAllocaPtr(IRBuilder<> &IRB, Type *PointerTy) {
+  Value *getAdjustedAllocaPtr(IRBuilderTy &IRB, Type *PointerTy) {
     assert(BeginOffset >= NewAllocaBeginOffset);
     APInt Offset(TD.getPointerSizeInBits(), BeginOffset - NewAllocaBeginOffset);
-    return getAdjustedPtr(IRB, TD, &NewAI, Offset, PointerTy, getName(""));
+    return getAdjustedPtr(IRB, TD, &NewAI, Offset, PointerTy);
   }
 
   /// \brief Compute suitable alignment to access an offset into the new alloca.
@@ -2455,27 +2505,27 @@ private:
       Pass.DeadInsts.insert(I);
   }
 
-  Value *rewriteVectorizedLoadInst(IRBuilder<> &IRB) {
+  Value *rewriteVectorizedLoadInst() {
     unsigned BeginIndex = getIndex(BeginOffset);
     unsigned EndIndex = getIndex(EndOffset);
     assert(EndIndex > BeginIndex && "Empty vector!");
 
     Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
-                                     getName(".load"));
-    return extractVector(IRB, V, BeginIndex, EndIndex, getName(".vec"));
+                                     "load");
+    return extractVector(IRB, V, BeginIndex, EndIndex, "vec");
   }
 
-  Value *rewriteIntegerLoad(IRBuilder<> &IRB, LoadInst &LI) {
+  Value *rewriteIntegerLoad(LoadInst &LI) {
     assert(IntTy && "We cannot insert an integer to the alloca");
     assert(!LI.isVolatile());
     Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
-                                     getName(".load"));
+                                     "load");
     V = convertValue(TD, IRB, V, IntTy);
     assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
     uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
     if (Offset > 0 || EndOffset < NewAllocaEndOffset)
       V = extractInteger(TD, IRB, V, cast<IntegerType>(LI.getType()), Offset,
-                         getName(".extract"));
+                         "extract");
     return V;
   }
 
@@ -2486,24 +2536,23 @@ private:
 
     uint64_t Size = EndOffset - BeginOffset;
 
-    IRBuilder<> IRB(&LI);
     Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), Size * 8)
                              : LI.getType();
     bool IsPtrAdjusted = false;
     Value *V;
     if (VecTy) {
-      V = rewriteVectorizedLoadInst(IRB);
+      V = rewriteVectorizedLoadInst();
     } else if (IntTy && LI.getType()->isIntegerTy()) {
-      V = rewriteIntegerLoad(IRB, LI);
+      V = rewriteIntegerLoad(LI);
     } else if (BeginOffset == NewAllocaBeginOffset &&
                canConvertValue(TD, NewAllocaTy, LI.getType())) {
       V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
-                                LI.isVolatile(), getName(".load"));
+                                LI.isVolatile(), "load");
     } else {
       Type *LTy = TargetTy->getPointerTo();
       V = IRB.CreateAlignedLoad(getAdjustedAllocaPtr(IRB, LTy),
                                 getPartitionTypeAlign(TargetTy),
-                                LI.isVolatile(), getName(".load"));
+                                LI.isVolatile(), "load");
       IsPtrAdjusted = true;
     }
     V = convertValue(TD, IRB, V, TargetTy);
@@ -2526,7 +2575,7 @@ private:
       Value *Placeholder
         = new LoadInst(UndefValue::get(LI.getType()->getPointerTo()));
       V = insertInteger(TD, IRB, Placeholder, V, BeginOffset,
-                        getName(".insert"));
+                        "insert");
       LI.replaceAllUsesWith(V);
       Placeholder->replaceAllUsesWith(&LI);
       delete Placeholder;
@@ -2540,7 +2589,7 @@ private:
     return !LI.isVolatile() && !IsPtrAdjusted;
   }
 
-  bool rewriteVectorizedStoreInst(IRBuilder<> &IRB, Value *V,
+  bool rewriteVectorizedStoreInst(Value *V,
                                   StoreInst &SI, Value *OldOp) {
     unsigned BeginIndex = getIndex(BeginOffset);
     unsigned EndIndex = getIndex(EndOffset);
@@ -2555,8 +2604,8 @@ private:
 
     // Mix in the existing elements.
     Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
-                                       getName(".load"));
-    V = insertVector(IRB, Old, V, BeginIndex, getName(".vec"));
+                                       "load");
+    V = insertVector(IRB, Old, V, BeginIndex, "vec");
 
     StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
     Pass.DeadInsts.insert(&SI);
@@ -2566,17 +2615,17 @@ private:
     return true;
   }
 
-  bool rewriteIntegerStore(IRBuilder<> &IRB, Value *V, StoreInst &SI) {
+  bool rewriteIntegerStore(Value *V, StoreInst &SI) {
     assert(IntTy && "We cannot extract an integer from the alloca");
     assert(!SI.isVolatile());
     if (TD.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) {
       Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
-                                         getName(".oldload"));
+                                         "oldload");
       Old = convertValue(TD, IRB, Old, IntTy);
       assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
       uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
       V = insertInteger(TD, IRB, Old, SI.getValueOperand(), Offset,
-                        getName(".insert"));
+                        "insert");
     }
     V = convertValue(TD, IRB, V, NewAllocaTy);
     StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
@@ -2590,7 +2639,6 @@ private:
     DEBUG(dbgs() << "    original: " << SI << "\n");
     Value *OldOp = SI.getOperand(1);
     assert(OldOp == OldPtr);
-    IRBuilder<> IRB(&SI);
 
     Value *V = SI.getValueOperand();
 
@@ -2611,16 +2659,17 @@ private:
              "Non-byte-multiple bit width");
       IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), Size * 8);
       V = extractInteger(TD, IRB, V, NarrowTy, BeginOffset,
-                         getName(".extract"));
+                         "extract");
     }
 
     if (VecTy)
-      return rewriteVectorizedStoreInst(IRB, V, SI, OldOp);
+      return rewriteVectorizedStoreInst(V, SI, OldOp);
     if (IntTy && V->getType()->isIntegerTy())
-      return rewriteIntegerStore(IRB, V, SI);
+      return rewriteIntegerStore(V, SI);
 
     StoreInst *NewSI;
     if (BeginOffset == NewAllocaBeginOffset &&
+        EndOffset == NewAllocaEndOffset &&
         canConvertValue(TD, V->getType(), NewAllocaTy)) {
       V = convertValue(TD, IRB, V, NewAllocaTy);
       NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
@@ -2648,7 +2697,7 @@ private:
   ///
   /// \param V The i8 value to splat.
   /// \param Size The number of bytes in the output (assuming i8 is one byte)
-  Value *getIntegerSplat(IRBuilder<> &IRB, Value *V, unsigned Size) {
+  Value *getIntegerSplat(Value *V, unsigned Size) {
     assert(Size > 0 && "Expected a positive number of bytes.");
     IntegerType *VTy = cast<IntegerType>(V->getType());
     assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
@@ -2656,26 +2705,25 @@ private:
       return V;
 
     Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size*8);
-    V = IRB.CreateMul(IRB.CreateZExt(V, SplatIntTy, getName(".zext")),
+    V = IRB.CreateMul(IRB.CreateZExt(V, SplatIntTy, "zext"),
                       ConstantExpr::getUDiv(
                         Constant::getAllOnesValue(SplatIntTy),
                         ConstantExpr::getZExt(
                           Constant::getAllOnesValue(V->getType()),
                           SplatIntTy)),
-                      getName(".isplat"));
+                      "isplat");
     return V;
   }
 
   /// \brief Compute a vector splat for a given element value.
-  Value *getVectorSplat(IRBuilder<> &IRB, Value *V, unsigned NumElements) {
-    V = IRB.CreateVectorSplat(NumElements, V, NamePrefix);
+  Value *getVectorSplat(Value *V, unsigned NumElements) {
+    V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
     DEBUG(dbgs() << "       splat: " << *V << "\n");
     return V;
   }
 
   bool visitMemSetInst(MemSetInst &II) {
     DEBUG(dbgs() << "    original: " << II << "\n");
-    IRBuilder<> IRB(&II);
     assert(II.getRawDest() == OldPtr);
 
     // If the memset has a variable size, it cannot be split, just adjust the
@@ -2732,31 +2780,31 @@ private:
       unsigned NumElements = EndIndex - BeginIndex;
       assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
 
-      Value *Splat = getIntegerSplat(IRB, II.getValue(),
-                                     TD.getTypeSizeInBits(ElementTy)/8);
+      Value *Splat =
+          getIntegerSplat(II.getValue(), TD.getTypeSizeInBits(ElementTy) / 8);
       Splat = convertValue(TD, IRB, Splat, ElementTy);
       if (NumElements > 1)
-        Splat = getVectorSplat(IRB, Splat, NumElements);
+        Splat = getVectorSplat(Splat, NumElements);
 
       Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
-                                         getName(".oldload"));
-      V = insertVector(IRB, Old, Splat, BeginIndex, getName(".vec"));
+                                         "oldload");
+      V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
     } else if (IntTy) {
       // If this is a memset on an alloca where we can widen stores, insert the
       // set integer.
       assert(!II.isVolatile());
 
       uint64_t Size = EndOffset - BeginOffset;
-      V = getIntegerSplat(IRB, II.getValue(), Size);
+      V = getIntegerSplat(II.getValue(), Size);
 
       if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
                     EndOffset != NewAllocaBeginOffset)) {
         Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
-                                           getName(".oldload"));
+                                           "oldload");
         Old = convertValue(TD, IRB, Old, IntTy);
         assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
         uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
-        V = insertInteger(TD, IRB, Old, V, Offset, getName(".insert"));
+        V = insertInteger(TD, IRB, Old, V, Offset, "insert");
       } else {
         assert(V->getType() == IntTy &&
                "Wrong type for an alloca wide integer!");
@@ -2767,10 +2815,9 @@ private:
       assert(BeginOffset == NewAllocaBeginOffset);
       assert(EndOffset == NewAllocaEndOffset);
 
-      V = getIntegerSplat(IRB, II.getValue(),
-                          TD.getTypeSizeInBits(ScalarTy)/8);
+      V = getIntegerSplat(II.getValue(), TD.getTypeSizeInBits(ScalarTy) / 8);
       if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
-        V = getVectorSplat(IRB, V, AllocaVecTy->getNumElements());
+        V = getVectorSplat(V, AllocaVecTy->getNumElements());
 
       V = convertValue(TD, IRB, V, AllocaTy);
     }
@@ -2787,7 +2834,6 @@ private:
     // them into two categories: split intrinsics and unsplit intrinsics.
 
     DEBUG(dbgs() << "    original: " << II << "\n");
-    IRBuilder<> IRB(&II);
 
     assert(II.getRawSource() == OldPtr || II.getRawDest() == OldPtr);
     bool IsDest = II.getRawDest() == OldPtr;
@@ -2871,8 +2917,7 @@ private:
 
       // Compute the other pointer, folding as much as possible to produce
       // a single, simple GEP in most cases.
-      OtherPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy,
-                                getName("." + OtherPtr->getName()));
+      OtherPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy);
 
       Value *OurPtr
         = getAdjustedAllocaPtr(IRB, IsDest ? II.getRawDest()->getType()
@@ -2915,8 +2960,7 @@ private:
       OtherPtrTy = SubIntTy->getPointerTo();
     }
 
-    Value *SrcPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy,
-                                   getName("." + OtherPtr->getName()));
+    Value *SrcPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy);
     Value *DstPtr = &NewAI;
     if (!IsDest)
       std::swap(SrcPtr, DstPtr);
@@ -2924,31 +2968,31 @@ private:
     Value *Src;
     if (VecTy && !IsWholeAlloca && !IsDest) {
       Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
-                                  getName(".load"));
-      Src = extractVector(IRB, Src, BeginIndex, EndIndex, getName(".vec"));
+                                  "load");
+      Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
     } else if (IntTy && !IsWholeAlloca && !IsDest) {
       Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
-                                  getName(".load"));
+                                  "load");
       Src = convertValue(TD, IRB, Src, IntTy);
       assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
       uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
-      Src = extractInteger(TD, IRB, Src, SubIntTy, Offset, getName(".extract"));
+      Src = extractInteger(TD, IRB, Src, SubIntTy, Offset, "extract");
     } else {
       Src = IRB.CreateAlignedLoad(SrcPtr, Align, II.isVolatile(),
-                                  getName(".copyload"));
+                                  "copyload");
     }
 
     if (VecTy && !IsWholeAlloca && IsDest) {
       Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
-                                         getName(".oldload"));
-      Src = insertVector(IRB, Old, Src, BeginIndex, getName(".vec"));
+                                         "oldload");
+      Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
     } else if (IntTy && !IsWholeAlloca && IsDest) {
       Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
-                                         getName(".oldload"));
+                                         "oldload");
       Old = convertValue(TD, IRB, Old, IntTy);
       assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
       uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
-      Src = insertInteger(TD, IRB, Old, Src, Offset, getName(".insert"));
+      Src = insertInteger(TD, IRB, Old, Src, Offset, "insert");
       Src = convertValue(TD, IRB, Src, NewAllocaTy);
     }
 
@@ -2963,7 +3007,6 @@ private:
     assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
            II.getIntrinsicID() == Intrinsic::lifetime_end);
     DEBUG(dbgs() << "    original: " << II << "\n");
-    IRBuilder<> IRB(&II);
     assert(II.getArgOperand(1) == OldPtr);
 
     // Record this instruction for deletion.
@@ -2991,7 +3034,9 @@ private:
     // as local as possible to the PHI. To do that, we re-use the location of
     // the old pointer, which necessarily must be in the right position to
     // dominate the PHI.
-    IRBuilder<> PtrBuilder(cast<Instruction>(OldPtr));
+    IRBuilderTy PtrBuilder(cast<Instruction>(OldPtr));
+    PtrBuilder.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) +
+                             ".");
 
     Value *NewPtr = getAdjustedAllocaPtr(PtrBuilder, OldPtr->getType());
     // Replace the operands which were using the old pointer.
@@ -3004,17 +3049,16 @@ private:
 
   bool visitSelectInst(SelectInst &SI) {
     DEBUG(dbgs() << "    original: " << SI << "\n");
-    IRBuilder<> IRB(&SI);
-
-    // Find the operand we need to rewrite here.
-    bool IsTrueVal = SI.getTrueValue() == OldPtr;
-    if (IsTrueVal)
-      assert(SI.getFalseValue() != OldPtr && "Pointer is both operands!");
-    else
-      assert(SI.getFalseValue() == OldPtr && "Pointer isn't an operand!");
+    assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
+           "Pointer isn't an operand!");
 
     Value *NewPtr = getAdjustedAllocaPtr(IRB, OldPtr->getType());
-    SI.setOperand(IsTrueVal ? 1 : 2, NewPtr);
+    // Replace the operands which were using the old pointer.
+    if (SI.getOperand(1) == OldPtr)
+      SI.setOperand(1, NewPtr);
+    if (SI.getOperand(2) == OldPtr)
+      SI.setOperand(2, NewPtr);
+
     DEBUG(dbgs() << "          to: " << SI << "\n");
     deleteIfTriviallyDead(OldPtr);
     return false;
@@ -3079,7 +3123,7 @@ private:
   class OpSplitter {
   protected:
     /// The builder used to form new instructions.
-    IRBuilder<> IRB;
+    IRBuilderTy IRB;
     /// The indices which to be used with insert- or extractvalue to select the
     /// appropriate value within the aggregate.
     SmallVector<unsigned, 4> Indices;
@@ -3291,12 +3335,13 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty,
     Type *ElementTy = SeqTy->getElementType();
     uint64_t ElementSize = TD.getTypeAllocSize(ElementTy);
     uint64_t NumSkippedElements = Offset / ElementSize;
-    if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy))
+    if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy)) {
       if (NumSkippedElements >= ArrTy->getNumElements())
         return 0;
-    if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy))
+    } else if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy)) {
       if (NumSkippedElements >= VecTy->getNumElements())
         return 0;
+    }
     Offset -= NumSkippedElements * ElementSize;
 
     // First check if we need to recurse.
diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
index e590a37..bfde334 100644
--- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp
+++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -1462,8 +1462,8 @@ bool SROA::ShouldAttemptScalarRepl(AllocaInst *AI) {
 }
 
 // performScalarRepl - This algorithm is a simple worklist driven algorithm,
-// which runs on all of the alloca instructions in the function, removing them
-// if they are only used by getelementptr instructions.
+// which runs on all of the alloca instructions in the entry block, removing
+// them if they are only used by getelementptr instructions.
 //
 bool SROA::performScalarRepl(Function &F) {
   std::vector<AllocaInst*> WorkList;
@@ -1724,17 +1724,8 @@ void SROA::isSafeGEP(GetElementPtrInst *GEPI,
       continue;
 
     ConstantInt *IdxVal = dyn_cast<ConstantInt>(GEPIt.getOperand());
-    if (!IdxVal) {
-      // Non constant GEPs are only a problem on arrays, structs, and pointers
-      // Vectors can be dynamically indexed.
-      // FIXME: Add support for dynamic indexing on arrays.  This should be
-      // ok on any subarrays of the alloca array, eg, a[0][i] is ok, but a[i][0]
-      // isn't.
-      if (!(*GEPIt)->isVectorTy())
-        return MarkUnsafe(Info, GEPI);
-      NonConstant = true;
-      NonConstantIdxSize = TD->getTypeAllocSize(*GEPIt);
-    }
+    if (!IdxVal)
+      return MarkUnsafe(Info, GEPI);
   }
 
   // Compute the offset due to this GEP and check if the alloca has a
diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp
index 916b37d..3514e6c 100644
--- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp
@@ -19,7 +19,6 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Config/config.h"            // FIXME: Shouldn't depend on host!
@@ -35,7 +34,6 @@
 #include "llvm/Transforms/Utils/BuildLibCalls.h"
 using namespace llvm;
 
-STATISTIC(NumAnnotated, "Number of attributes added to library functions");
 
 //===----------------------------------------------------------------------===//
 // Optimizer Base Class
@@ -91,8 +89,6 @@ namespace {
     TargetLibraryInfo *TLI;
 
     StringMap<LibCallOptimization*> Optimizations;
-
-    bool Modified;  // This is only used by doInitialization.
   public:
     static char ID; // Pass identification
     SimplifyLibCalls() : FunctionPass(ID) {
@@ -104,14 +100,6 @@ namespace {
     void InitOptimizations();
     bool runOnFunction(Function &F);
 
-    void setDoesNotAccessMemory(Function &F);
-    void setOnlyReadsMemory(Function &F);
-    void setDoesNotThrow(Function &F);
-    void setDoesNotCapture(Function &F, unsigned n);
-    void setDoesNotAlias(Function &F, unsigned n);
-    bool doInitialization(Module &M);
-
-    void inferPrototypeAttributes(Function &F);
     virtual void getAnalysisUsage(AnalysisUsage &AU) const {
       AU.addRequired<TargetLibraryInfo>();
     }
@@ -208,697 +196,6 @@ bool SimplifyLibCalls::runOnFunction(Function &F) {
   return Changed;
 }
 
-// Utility methods for doInitialization.
-
-void SimplifyLibCalls::setDoesNotAccessMemory(Function &F) {
-  if (!F.doesNotAccessMemory()) {
-    F.setDoesNotAccessMemory();
-    ++NumAnnotated;
-    Modified = true;
-  }
-}
-void SimplifyLibCalls::setOnlyReadsMemory(Function &F) {
-  if (!F.onlyReadsMemory()) {
-    F.setOnlyReadsMemory();
-    ++NumAnnotated;
-    Modified = true;
-  }
-}
-void SimplifyLibCalls::setDoesNotThrow(Function &F) {
-  if (!F.doesNotThrow()) {
-    F.setDoesNotThrow();
-    ++NumAnnotated;
-    Modified = true;
-  }
-}
-void SimplifyLibCalls::setDoesNotCapture(Function &F, unsigned n) {
-  if (!F.doesNotCapture(n)) {
-    F.setDoesNotCapture(n);
-    ++NumAnnotated;
-    Modified = true;
-  }
-}
-void SimplifyLibCalls::setDoesNotAlias(Function &F, unsigned n) {
-  if (!F.doesNotAlias(n)) {
-    F.setDoesNotAlias(n);
-    ++NumAnnotated;
-    Modified = true;
-  }
-}
-
-
-void SimplifyLibCalls::inferPrototypeAttributes(Function &F) {
-  FunctionType *FTy = F.getFunctionType();
-
-  StringRef Name = F.getName();
-  switch (Name[0]) {
-  case 's':
-    if (Name == "strlen") {
-      if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
-        return;
-      setOnlyReadsMemory(F);
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-    } else if (Name == "strchr" ||
-               Name == "strrchr") {
-      if (FTy->getNumParams() != 2 ||
-          !FTy->getParamType(0)->isPointerTy() ||
-          !FTy->getParamType(1)->isIntegerTy())
-        return;
-      setOnlyReadsMemory(F);
-      setDoesNotThrow(F);
-    } else if (Name == "strcpy" ||
-               Name == "stpcpy" ||
-               Name == "strcat" ||
-               Name == "strtol" ||
-               Name == "strtod" ||
-               Name == "strtof" ||
-               Name == "strtoul" ||
-               Name == "strtoll" ||
-               Name == "strtold" ||
-               Name == "strncat" ||
-               Name == "strncpy" ||
-               Name == "stpncpy" ||
-               Name == "strtoull") {
-      if (FTy->getNumParams() < 2 ||
-          !FTy->getParamType(1)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 2);
-    } else if (Name == "strxfrm") {
-      if (FTy->getNumParams() != 3 ||
-          !FTy->getParamType(0)->isPointerTy() ||
-          !FTy->getParamType(1)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-      setDoesNotCapture(F, 2);
-    } else if (Name == "strcmp" ||
-               Name == "strspn" ||
-               Name == "strncmp" ||
-               Name == "strcspn" ||
-               Name == "strcoll" ||
-               Name == "strcasecmp" ||
-               Name == "strncasecmp") {
-      if (FTy->getNumParams() < 2 ||
-          !FTy->getParamType(0)->isPointerTy() ||
-          !FTy->getParamType(1)->isPointerTy())
-        return;
-      setOnlyReadsMemory(F);
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-      setDoesNotCapture(F, 2);
-    } else if (Name == "strstr" ||
-               Name == "strpbrk") {
-      if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
-        return;
-      setOnlyReadsMemory(F);
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 2);
-    } else if (Name == "strtok" ||
-               Name == "strtok_r") {
-      if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 2);
-    } else if (Name == "scanf" ||
-               Name == "setbuf" ||
-               Name == "setvbuf") {
-      if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-    } else if (Name == "strdup" ||
-               Name == "strndup") {
-      if (FTy->getNumParams() < 1 || !FTy->getReturnType()->isPointerTy() ||
-          !FTy->getParamType(0)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotAlias(F, 0);
-      setDoesNotCapture(F, 1);
-    } else if (Name == "stat" ||
-               Name == "sscanf" ||
-               Name == "sprintf" ||
-               Name == "statvfs") {
-      if (FTy->getNumParams() < 2 ||
-          !FTy->getParamType(0)->isPointerTy() ||
-          !FTy->getParamType(1)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-      setDoesNotCapture(F, 2);
-    } else if (Name == "snprintf") {
-      if (FTy->getNumParams() != 3 ||
-          !FTy->getParamType(0)->isPointerTy() ||
-          !FTy->getParamType(2)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-      setDoesNotCapture(F, 3);
-    } else if (Name == "setitimer") {
-      if (FTy->getNumParams() != 3 ||
-          !FTy->getParamType(1)->isPointerTy() ||
-          !FTy->getParamType(2)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 2);
-      setDoesNotCapture(F, 3);
-    } else if (Name == "system") {
-      if (FTy->getNumParams() != 1 ||
-          !FTy->getParamType(0)->isPointerTy())
-        return;
-      // May throw; "system" is a valid pthread cancellation point.
-      setDoesNotCapture(F, 1);
-    }
-    break;
-  case 'm':
-    if (Name == "malloc") {
-      if (FTy->getNumParams() != 1 ||
-          !FTy->getReturnType()->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotAlias(F, 0);
-    } else if (Name == "memcmp") {
-      if (FTy->getNumParams() != 3 ||
-          !FTy->getParamType(0)->isPointerTy() ||
-          !FTy->getParamType(1)->isPointerTy())
-        return;
-      setOnlyReadsMemory(F);
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-      setDoesNotCapture(F, 2);
-    } else if (Name == "memchr" ||
-               Name == "memrchr") {
-      if (FTy->getNumParams() != 3)
-        return;
-      setOnlyReadsMemory(F);
-      setDoesNotThrow(F);
-    } else if (Name == "modf" ||
-               Name == "modff" ||
-               Name == "modfl" ||
-               Name == "memcpy" ||
-               Name == "memccpy" ||
-               Name == "memmove") {
-      if (FTy->getNumParams() < 2 ||
-          !FTy->getParamType(1)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 2);
-    } else if (Name == "memalign") {
-      if (!FTy->getReturnType()->isPointerTy())
-        return;
-      setDoesNotAlias(F, 0);
-    } else if (Name == "mkdir" ||
-               Name == "mktime") {
-      if (FTy->getNumParams() == 0 ||
-          !FTy->getParamType(0)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-    }
-    break;
-  case 'r':
-    if (Name == "realloc") {
-      if (FTy->getNumParams() != 2 ||
-          !FTy->getParamType(0)->isPointerTy() ||
-          !FTy->getReturnType()->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotAlias(F, 0);
-      setDoesNotCapture(F, 1);
-    } else if (Name == "read") {
-      if (FTy->getNumParams() != 3 ||
-          !FTy->getParamType(1)->isPointerTy())
-        return;
-      // May throw; "read" is a valid pthread cancellation point.
-      setDoesNotCapture(F, 2);
-    } else if (Name == "rmdir" ||
-               Name == "rewind" ||
-               Name == "remove" ||
-               Name == "realpath") {
-      if (FTy->getNumParams() < 1 ||
-          !FTy->getParamType(0)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-    } else if (Name == "rename" ||
-               Name == "readlink") {
-      if (FTy->getNumParams() < 2 ||
-          !FTy->getParamType(0)->isPointerTy() ||
-          !FTy->getParamType(1)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-      setDoesNotCapture(F, 2);
-    }
-    break;
-  case 'w':
-    if (Name == "write") {
-      if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy())
-        return;
-      // May throw; "write" is a valid pthread cancellation point.
-      setDoesNotCapture(F, 2);
-    }
-    break;
-  case 'b':
-    if (Name == "bcopy") {
-      if (FTy->getNumParams() != 3 ||
-          !FTy->getParamType(0)->isPointerTy() ||
-          !FTy->getParamType(1)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-      setDoesNotCapture(F, 2);
-    } else if (Name == "bcmp") {
-      if (FTy->getNumParams() != 3 ||
-          !FTy->getParamType(0)->isPointerTy() ||
-          !FTy->getParamType(1)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setOnlyReadsMemory(F);
-      setDoesNotCapture(F, 1);
-      setDoesNotCapture(F, 2);
-    } else if (Name == "bzero") {
-      if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-    }
-    break;
-  case 'c':
-    if (Name == "calloc") {
-      if (FTy->getNumParams() != 2 ||
-          !FTy->getReturnType()->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotAlias(F, 0);
-    } else if (Name == "chmod" ||
-               Name == "chown" ||
-               Name == "ctermid" ||
-               Name == "clearerr" ||
-               Name == "closedir") {
-      if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-    }
-    break;
-  case 'a':
-    if (Name == "atoi" ||
-        Name == "atol" ||
-        Name == "atof" ||
-        Name == "atoll") {
-      if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setOnlyReadsMemory(F);
-      setDoesNotCapture(F, 1);
-    } else if (Name == "access") {
-      if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-    }
-    break;
-  case 'f':
-    if (Name == "fopen") {
-      if (FTy->getNumParams() != 2 ||
-          !FTy->getReturnType()->isPointerTy() ||
-          !FTy->getParamType(0)->isPointerTy() ||
-          !FTy->getParamType(1)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotAlias(F, 0);
-      setDoesNotCapture(F, 1);
-      setDoesNotCapture(F, 2);
-    } else if (Name == "fdopen") {
-      if (FTy->getNumParams() != 2 ||
-          !FTy->getReturnType()->isPointerTy() ||
-          !FTy->getParamType(1)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotAlias(F, 0);
-      setDoesNotCapture(F, 2);
-    } else if (Name == "feof" ||
-               Name == "free" ||
-               Name == "fseek" ||
-               Name == "ftell" ||
-               Name == "fgetc" ||
-               Name == "fseeko" ||
-               Name == "ftello" ||
-               Name == "fileno" ||
-               Name == "fflush" ||
-               Name == "fclose" ||
-               Name == "fsetpos" ||
-               Name == "flockfile" ||
-               Name == "funlockfile" ||
-               Name == "ftrylockfile") {
-      if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-    } else if (Name == "ferror") {
-      if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-      setOnlyReadsMemory(F);
-    } else if (Name == "fputc" ||
-               Name == "fstat" ||
-               Name == "frexp" ||
-               Name == "frexpf" ||
-               Name == "frexpl" ||
-               Name == "fstatvfs") {
-      if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 2);
-    } else if (Name == "fgets") {
-      if (FTy->getNumParams() != 3 ||
-          !FTy->getParamType(0)->isPointerTy() ||
-          !FTy->getParamType(2)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 3);
-    } else if (Name == "fread" ||
-               Name == "fwrite") {
-      if (FTy->getNumParams() != 4 ||
-          !FTy->getParamType(0)->isPointerTy() ||
-          !FTy->getParamType(3)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-      setDoesNotCapture(F, 4);
-    } else if (Name == "fputs" ||
-               Name == "fscanf" ||
-               Name == "fprintf" ||
-               Name == "fgetpos") {
-      if (FTy->getNumParams() < 2 ||
-          !FTy->getParamType(0)->isPointerTy() ||
-          !FTy->getParamType(1)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-      setDoesNotCapture(F, 2);
-    }
-    break;
-  case 'g':
-    if (Name == "getc" ||
-        Name == "getlogin_r" ||
-        Name == "getc_unlocked") {
-      if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-    } else if (Name == "getenv") {
-      if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setOnlyReadsMemory(F);
-      setDoesNotCapture(F, 1);
-    } else if (Name == "gets" ||
-               Name == "getchar") {
-      setDoesNotThrow(F);
-    } else if (Name == "getitimer") {
-      if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 2);
-    } else if (Name == "getpwnam") {
-      if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-    }
-    break;
-  case 'u':
-    if (Name == "ungetc") {
-      if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 2);
-    } else if (Name == "uname" ||
-               Name == "unlink" ||
-               Name == "unsetenv") {
-      if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-    } else if (Name == "utime" ||
-               Name == "utimes") {
-      if (FTy->getNumParams() != 2 ||
-          !FTy->getParamType(0)->isPointerTy() ||
-          !FTy->getParamType(1)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-      setDoesNotCapture(F, 2);
-    }
-    break;
-  case 'p':
-    if (Name == "putc") {
-      if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 2);
-    } else if (Name == "puts" ||
-               Name == "printf" ||
-               Name == "perror") {
-      if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-    } else if (Name == "pread" ||
-               Name == "pwrite") {
-      if (FTy->getNumParams() != 4 || !FTy->getParamType(1)->isPointerTy())
-        return;
-      // May throw; these are valid pthread cancellation points.
-      setDoesNotCapture(F, 2);
-    } else if (Name == "putchar") {
-      setDoesNotThrow(F);
-    } else if (Name == "popen") {
-      if (FTy->getNumParams() != 2 ||
-          !FTy->getReturnType()->isPointerTy() ||
-          !FTy->getParamType(0)->isPointerTy() ||
-          !FTy->getParamType(1)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotAlias(F, 0);
-      setDoesNotCapture(F, 1);
-      setDoesNotCapture(F, 2);
-    } else if (Name == "pclose") {
-      if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-    }
-    break;
-  case 'v':
-    if (Name == "vscanf") {
-      if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-    } else if (Name == "vsscanf" ||
-               Name == "vfscanf") {
-      if (FTy->getNumParams() != 3 ||
-          !FTy->getParamType(1)->isPointerTy() ||
-          !FTy->getParamType(2)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-      setDoesNotCapture(F, 2);
-    } else if (Name == "valloc") {
-      if (!FTy->getReturnType()->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotAlias(F, 0);
-    } else if (Name == "vprintf") {
-      if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-    } else if (Name == "vfprintf" ||
-               Name == "vsprintf") {
-      if (FTy->getNumParams() != 3 ||
-          !FTy->getParamType(0)->isPointerTy() ||
-          !FTy->getParamType(1)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-      setDoesNotCapture(F, 2);
-    } else if (Name == "vsnprintf") {
-      if (FTy->getNumParams() != 4 ||
-          !FTy->getParamType(0)->isPointerTy() ||
-          !FTy->getParamType(2)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-      setDoesNotCapture(F, 3);
-    }
-    break;
-  case 'o':
-    if (Name == "open") {
-      if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy())
-        return;
-      // May throw; "open" is a valid pthread cancellation point.
-      setDoesNotCapture(F, 1);
-    } else if (Name == "opendir") {
-      if (FTy->getNumParams() != 1 ||
-          !FTy->getReturnType()->isPointerTy() ||
-          !FTy->getParamType(0)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotAlias(F, 0);
-      setDoesNotCapture(F, 1);
-    }
-    break;
-  case 't':
-    if (Name == "tmpfile") {
-      if (!FTy->getReturnType()->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotAlias(F, 0);
-    } else if (Name == "times") {
-      if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-    }
-    break;
-  case 'h':
-    if (Name == "htonl" ||
-        Name == "htons") {
-      setDoesNotThrow(F);
-      setDoesNotAccessMemory(F);
-    }
-    break;
-  case 'n':
-    if (Name == "ntohl" ||
-        Name == "ntohs") {
-      setDoesNotThrow(F);
-      setDoesNotAccessMemory(F);
-    }
-    break;
-  case 'l':
-    if (Name == "lstat") {
-      if (FTy->getNumParams() != 2 ||
-          !FTy->getParamType(0)->isPointerTy() ||
-          !FTy->getParamType(1)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-      setDoesNotCapture(F, 2);
-    } else if (Name == "lchown") {
-      if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-    }
-    break;
-  case 'q':
-    if (Name == "qsort") {
-      if (FTy->getNumParams() != 4 || !FTy->getParamType(3)->isPointerTy())
-        return;
-      // May throw; places call through function pointer.
-      setDoesNotCapture(F, 4);
-    }
-    break;
-  case '_':
-    if (Name == "__strdup" ||
-        Name == "__strndup") {
-      if (FTy->getNumParams() < 1 ||
-          !FTy->getReturnType()->isPointerTy() ||
-          !FTy->getParamType(0)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotAlias(F, 0);
-      setDoesNotCapture(F, 1);
-    } else if (Name == "__strtok_r") {
-      if (FTy->getNumParams() != 3 ||
-          !FTy->getParamType(1)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 2);
-    } else if (Name == "_IO_getc") {
-      if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-    } else if (Name == "_IO_putc") {
-      if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 2);
-    }
-    break;
-  case 1:
-    if (Name == "\1__isoc99_scanf") {
-      if (FTy->getNumParams() < 1 ||
-          !FTy->getParamType(0)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-    } else if (Name == "\1stat64" ||
-               Name == "\1lstat64" ||
-               Name == "\1statvfs64" ||
-               Name == "\1__isoc99_sscanf") {
-      if (FTy->getNumParams() < 1 ||
-          !FTy->getParamType(0)->isPointerTy() ||
-          !FTy->getParamType(1)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-      setDoesNotCapture(F, 2);
-    } else if (Name == "\1fopen64") {
-      if (FTy->getNumParams() != 2 ||
-          !FTy->getReturnType()->isPointerTy() ||
-          !FTy->getParamType(0)->isPointerTy() ||
-          !FTy->getParamType(1)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotAlias(F, 0);
-      setDoesNotCapture(F, 1);
-      setDoesNotCapture(F, 2);
-    } else if (Name == "\1fseeko64" ||
-               Name == "\1ftello64") {
-      if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 1);
-    } else if (Name == "\1tmpfile64") {
-      if (!FTy->getReturnType()->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotAlias(F, 0);
-    } else if (Name == "\1fstat64" ||
-               Name == "\1fstatvfs64") {
-      if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
-        return;
-      setDoesNotThrow(F);
-      setDoesNotCapture(F, 2);
-    } else if (Name == "\1open64") {
-      if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy())
-        return;
-      // May throw; "open" is a valid pthread cancellation point.
-      setDoesNotCapture(F, 1);
-    }
-    break;
-  }
-}
-
-/// doInitialization - Add attributes to well-known functions.
-///
-bool SimplifyLibCalls::doInitialization(Module &M) {
-  Modified = false;
-  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
-    Function &F = *I;
-    if (F.isDeclaration() && F.hasName())
-      inferPrototypeAttributes(F);
-  }
-  return Modified;
-}
-
 // TODO:
 //   Additional cases that we need to add to this file:
 //
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index 0d2598a..dabb67b 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -82,7 +82,8 @@ namespace {
     /// a simple branch. When there is more than one predecessor, we need to
     /// split the landing pad block after the landingpad instruction and jump
     /// to there.
-    void forwardResume(ResumeInst *RI);
+    void forwardResume(ResumeInst *RI,
+                       SmallPtrSet<LandingPadInst*, 16> &InlinedLPads);
 
     /// addIncomingPHIValuesFor - Add incoming-PHI values to the unwind
     /// destination block for the given basic block, using the values for the
@@ -140,8 +141,10 @@ BasicBlock *InvokeInliningInfo::getInnerResumeDest() {
 /// block. When the landing pad block has only one predecessor, this is a simple
 /// branch. When there is more than one predecessor, we need to split the
 /// landing pad block after the landingpad instruction and jump to there.
-void InvokeInliningInfo::forwardResume(ResumeInst *RI) {
+void InvokeInliningInfo::forwardResume(ResumeInst *RI,
+                               SmallPtrSet<LandingPadInst*, 16> &InlinedLPads) {
   BasicBlock *Dest = getInnerResumeDest();
+  LandingPadInst *OuterLPad = getLandingPadInst();
   BasicBlock *Src = RI->getParent();
 
   BranchInst::Create(Dest, Src);
@@ -152,6 +155,16 @@ void InvokeInliningInfo::forwardResume(ResumeInst *RI) {
 
   InnerEHValuesPHI->addIncoming(RI->getOperand(0), Src);
   RI->eraseFromParent();
+
+  // Append the clauses from the outer landing pad instruction into the inlined
+  // landing pad instructions.
+  for (SmallPtrSet<LandingPadInst*, 16>::iterator I = InlinedLPads.begin(),
+         E = InlinedLPads.end(); I != E; ++I) {
+    LandingPadInst *InlinedLPad = *I;
+    for (unsigned OuterIdx = 0, OuterNum = OuterLPad->getNumClauses();
+         OuterIdx != OuterNum; ++OuterIdx)
+      InlinedLPad->addClause(OuterLPad->getClause(OuterIdx));
+  }
 }
 
 /// HandleCallsInBlockInlinedThroughInvoke - When we inline a basic block into
@@ -229,19 +242,15 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock,
 
   // The inlined code is currently at the end of the function, scan from the
   // start of the inlined code to its end, checking for stuff we need to
-  // rewrite.  If the code doesn't have calls or unwinds, we know there is
-  // nothing to rewrite.
-  if (!InlinedCodeInfo.ContainsCalls) {
-    // Now that everything is happy, we have one final detail.  The PHI nodes in
-    // the exception destination block still have entries due to the original
-    // invoke instruction.  Eliminate these entries (which might even delete the
-    // PHI node) now.
-    InvokeDest->removePredecessor(II->getParent());
-    return;
-  }
-
+  // rewrite.
   InvokeInliningInfo Invoke(II);
-  
+
+  // Get all of the inlined landing pad instructions.
+  SmallPtrSet<LandingPadInst*, 16> InlinedLPads;
+  for (Function::iterator I = FirstNewBlock, E = Caller->end(); I != E; ++I)
+    if (InvokeInst *II = dyn_cast<InvokeInst>(I->getTerminator()))
+      InlinedLPads.insert(II->getLandingPadInst());
+
   for (Function::iterator BB = FirstNewBlock, E = Caller->end(); BB != E; ++BB){
     if (InlinedCodeInfo.ContainsCalls)
       if (HandleCallsInBlockInlinedThroughInvoke(BB, Invoke)) {
@@ -250,13 +259,14 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock,
         continue;
       }
 
+    // Forward any resumes that are remaining here.
     if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator()))
-      Invoke.forwardResume(RI);
+      Invoke.forwardResume(RI, InlinedLPads);
   }
 
   // Now that everything is happy, we have one final detail.  The PHI nodes in
   // the exception destination block still have entries due to the original
-  // invoke instruction.  Eliminate these entries (which might even delete the
+  // invoke instruction. Eliminate these entries (which might even delete the
   // PHI node) now.
   InvokeDest->removePredecessor(II->getParent());
 }
@@ -748,8 +758,10 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
 
     // If the call site was an invoke instruction, add a branch to the normal
     // destination.
-    if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall))
-      BranchInst::Create(II->getNormalDest(), TheCall);
+    if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall)) {
+      BranchInst *NewBr = BranchInst::Create(II->getNormalDest(), TheCall);
+      NewBr->setDebugLoc(Returns[0]->getDebugLoc());
+    }
 
     // If the return instruction returned a value, replace uses of the call with
     // uses of the returned value.
@@ -777,15 +789,16 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
   // "starter" and "ender" blocks.  How we accomplish this depends on whether
   // this is an invoke instruction or a call instruction.
   BasicBlock *AfterCallBB;
+  BranchInst *CreatedBranchToNormalDest = NULL;
   if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall)) {
 
     // Add an unconditional branch to make this look like the CallInst case...
-    BranchInst *NewBr = BranchInst::Create(II->getNormalDest(), TheCall);
+    CreatedBranchToNormalDest = BranchInst::Create(II->getNormalDest(), TheCall);
 
     // Split the basic block.  This guarantees that no PHI nodes will have to be
     // updated due to new incoming edges, and make the invoke case more
     // symmetric to the call case.
-    AfterCallBB = OrigBB->splitBasicBlock(NewBr,
+    AfterCallBB = OrigBB->splitBasicBlock(CreatedBranchToNormalDest,
                                           CalledFunc->getName()+".exit");
 
   } else {  // It's a call
@@ -840,11 +853,20 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
 
 
     // Add a branch to the merge points and remove return instructions.
+    DebugLoc Loc;
     for (unsigned i = 0, e = Returns.size(); i != e; ++i) {
       ReturnInst *RI = Returns[i];
-      BranchInst::Create(AfterCallBB, RI);
+      BranchInst* BI = BranchInst::Create(AfterCallBB, RI);
+      Loc = RI->getDebugLoc();
+      BI->setDebugLoc(Loc);
       RI->eraseFromParent();
     }
+    // We need to set the debug location to *somewhere* inside the
+    // inlined function. The line number may be nonsensical, but the
+    // instruction will at least be associated with the right
+    // function.
+    if (CreatedBranchToNormalDest)
+      CreatedBranchToNormalDest->setDebugLoc(Loc);
   } else if (!Returns.empty()) {
     // Otherwise, if there is exactly one return value, just replace anything
     // using the return value of the call with the computed value.
@@ -864,6 +886,9 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
     AfterCallBB->getInstList().splice(AfterCallBB->begin(),
                                       ReturnBB->getInstList());
 
+    if (CreatedBranchToNormalDest)
+      CreatedBranchToNormalDest->setDebugLoc(Returns[0]->getDebugLoc());
+
     // Delete the return instruction now and empty ReturnBB now.
     Returns[0]->eraseFromParent();
     ReturnBB->eraseFromParent();
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index a54ee08..12e5b3e 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -832,7 +832,24 @@ unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign,
 ///  Dbg Intrinsic utilities
 ///
 
-/// Inserts a llvm.dbg.value instrinsic before the stores to an alloca'd value
+/// See if there is a dbg.value intrinsic for DIVar before I.
+static bool LdStHasDebugValue(DIVariable &DIVar, Instruction *I) {
+  // Since we can't guarantee that the original dbg.declare instrinsic
+  // is removed by LowerDbgDeclare(), we need to make sure that we are
+  // not inserting the same dbg.value intrinsic over and over.
+  llvm::BasicBlock::InstListType::iterator PrevI(I);
+  if (PrevI != I->getParent()->getInstList().begin()) {
+    --PrevI;
+    if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(PrevI))
+      if (DVI->getValue() == I->getOperand(0) &&
+          DVI->getOffset() == 0 &&
+          DVI->getVariable() == DIVar)
+        return true;
+  }
+  return false;
+}
+
+/// Inserts a llvm.dbg.value intrinsic before a store to an alloca'd value
 /// that has an associated llvm.dbg.decl intrinsic.
 bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
                                            StoreInst *SI, DIBuilder &Builder) {
@@ -840,6 +857,9 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
   if (!DIVar.Verify())
     return false;
 
+  if (LdStHasDebugValue(DIVar, SI))
+    return true;
+
   Instruction *DbgVal = NULL;
   // If an argument is zero extended then use argument directly. The ZExt
   // may be zapped by an optimization pass in future.
@@ -863,7 +883,7 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
   return true;
 }
 
-/// Inserts a llvm.dbg.value instrinsic before the stores to an alloca'd value
+/// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value
 /// that has an associated llvm.dbg.decl intrinsic.
 bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
                                            LoadInst *LI, DIBuilder &Builder) {
@@ -871,6 +891,9 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
   if (!DIVar.Verify())
     return false;
 
+  if (LdStHasDebugValue(DIVar, LI))
+    return true;
+
   Instruction *DbgVal = 
     Builder.insertDbgValueIntrinsic(LI->getOperand(0), 0,
                                     DIVar, LI);
@@ -902,6 +925,8 @@ bool llvm::LowerDbgDeclare(Function &F) {
          E = Dbgs.end(); I != E; ++I) {
     DbgDeclareInst *DDI = *I;
     if (AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress())) {
+      // We only remove the dbg.declare intrinsic if all uses are
+      // converted to dbg.value intrinsics.
       bool RemoveDDI = true;
       for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end();
            UI != E; ++UI)
@@ -985,22 +1010,17 @@ bool llvm::removeUnreachableBlocks(Function &F) {
     if (Reachable.count(I))
       continue;
 
-    // Remove the block as predecessor of all its reachable successors.
-    // Unreachable successors don't matter as they'll soon be removed, too.
     for (succ_iterator SI = succ_begin(I), SE = succ_end(I); SI != SE; ++SI)
       if (Reachable.count(*SI))
         (*SI)->removePredecessor(I);
+    I->dropAllReferences();
+  }
 
-    // Zap all instructions in this basic block.
-    while (!I->empty()) {
-      Instruction &Inst = I->back();
-      if (!Inst.use_empty())
-        Inst.replaceAllUsesWith(UndefValue::get(Inst.getType()));
-      I->getInstList().pop_back();
-    }
+  for (Function::iterator I = llvm::next(F.begin()), E=F.end(); I != E;)
+    if (!Reachable.count(I))
+      I = F.getBasicBlockList().erase(I);
+    else
+      ++I;
 
-    --I;
-    llvm::next(I)->eraseFromParent();
-  }
   return true;
 }
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index 681bf9c..052ad85 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -59,6 +59,10 @@ static cl::opt<bool>
 SinkCommon("simplifycfg-sink-common", cl::Hidden, cl::init(true),
        cl::desc("Sink common instructions down to the end block"));
 
+static cl::opt<bool>
+HoistCondStores("simplifycfg-hoist-cond-stores", cl::Hidden, cl::init(true),
+       cl::desc("Hoist conditional stores if an unconditional store preceeds"));
+
 STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");
 STATISTIC(NumLookupTables, "Number of switch instructions turned into lookup tables");
 STATISTIC(NumSinkCommons, "Number of common instructions sunk down to the end block");
@@ -1332,6 +1336,66 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
   return Changed;
 }
 
+/// \brief Determine if we can hoist sink a sole store instruction out of a
+/// conditional block.
+///
+/// We are looking for code like the following:
+///   BrBB:
+///     store i32 %add, i32* %arrayidx2
+///     ... // No other stores or function calls (we could be calling a memory
+///     ... // function).
+///     %cmp = icmp ult %x, %y
+///     br i1 %cmp, label %EndBB, label %ThenBB
+///   ThenBB:
+///     store i32 %add5, i32* %arrayidx2
+///     br label EndBB
+///   EndBB:
+///     ...
+///   We are going to transform this into:
+///   BrBB:
+///     store i32 %add, i32* %arrayidx2
+///     ... //
+///     %cmp = icmp ult %x, %y
+///     %add.add5 = select i1 %cmp, i32 %add, %add5
+///     store i32 %add.add5, i32* %arrayidx2
+///     ...
+///
+/// \return The pointer to the value of the previous store if the store can be
+///         hoisted into the predecessor block. 0 otherwise.
+Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB,
+                              BasicBlock *StoreBB, BasicBlock *EndBB) {
+  StoreInst *StoreToHoist = dyn_cast<StoreInst>(I);
+  if (!StoreToHoist)
+    return 0;
+
+  // Volatile or atomic.
+  if (!StoreToHoist->isSimple())
+    return 0;
+
+  Value *StorePtr = StoreToHoist->getPointerOperand();
+
+  // Look for a store to the same pointer in BrBB.
+  unsigned MaxNumInstToLookAt = 10;
+  for (BasicBlock::reverse_iterator RI = BrBB->rbegin(),
+       RE = BrBB->rend(); RI != RE && (--MaxNumInstToLookAt); ++RI) {
+    Instruction *CurI = &*RI;
+
+    // Could be calling an instruction that effects memory like free().
+    if (CurI->mayHaveSideEffects() && !isa<StoreInst>(CurI))
+      return 0;
+
+    StoreInst *SI = dyn_cast<StoreInst>(CurI);
+    // Found the previous store make sure it stores to the same location.
+    if (SI && SI->getPointerOperand() == StorePtr)
+      // Found the previous store, return its value operand.
+      return SI->getValueOperand();
+    else if (SI)
+      return 0; // Unknown store.
+  }
+
+  return 0;
+}
+
 /// \brief Speculate a conditional basic block flattening the CFG.
 ///
 /// Note that this is a very risky transform currently. Speculating
@@ -1395,6 +1459,8 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB) {
   SmallDenseMap<Instruction *, unsigned, 4> SinkCandidateUseCounts;
 
   unsigned SpeculationCost = 0;
+  Value *SpeculatedStoreValue = 0;
+  StoreInst *SpeculatedStore = 0;
   for (BasicBlock::iterator BBI = ThenBB->begin(),
                             BBE = llvm::prior(ThenBB->end());
        BBI != BBE; ++BBI) {
@@ -1410,13 +1476,21 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB) {
       return false;
 
     // Don't hoist the instruction if it's unsafe or expensive.
-    if (!isSafeToSpeculativelyExecute(I))
+    if (!isSafeToSpeculativelyExecute(I) &&
+        !(HoistCondStores &&
+          (SpeculatedStoreValue = isSafeToSpeculateStore(I, BB, ThenBB,
+                                                         EndBB))))
       return false;
-    if (ComputeSpeculationCost(I) > PHINodeFoldingThreshold)
+    if (!SpeculatedStoreValue &&
+        ComputeSpeculationCost(I) > PHINodeFoldingThreshold)
       return false;
 
+    // Store the store speculation candidate.
+    if (SpeculatedStoreValue)
+      SpeculatedStore = cast<StoreInst>(I);
+
     // Do not hoist the instruction if any of its operands are defined but not
-    // used in this BB. The transformation will prevent the operand from
+    // used in BB. The transformation will prevent the operand from
     // being sunk into the use block.
     for (User::op_iterator i = I->op_begin(), e = I->op_end();
          i != e; ++i) {
@@ -1473,12 +1547,24 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB) {
 
   // If there are no PHIs to process, bail early. This helps ensure idempotence
   // as well.
-  if (!HaveRewritablePHIs)
+  if (!HaveRewritablePHIs && !(HoistCondStores && SpeculatedStoreValue))
     return false;
 
   // If we get here, we can hoist the instruction and if-convert.
   DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *ThenBB << "\n";);
 
+  // Insert a select of the value of the speculated store.
+  if (SpeculatedStoreValue) {
+    IRBuilder<true, NoFolder> Builder(BI);
+    Value *TrueV = SpeculatedStore->getValueOperand();
+    Value *FalseV = SpeculatedStoreValue;
+    if (Invert)
+      std::swap(TrueV, FalseV);
+    Value *S = Builder.CreateSelect(BrCond, TrueV, FalseV, TrueV->getName() +
+                                    "." + FalseV->getName());
+    SpeculatedStore->setOperand(0, S);
+  }
+
   // Hoist the instructions.
   BB->getInstList().splice(BI, ThenBB->getInstList(), ThenBB->begin(),
                            llvm::prior(ThenBB->end()));
@@ -3073,7 +3159,12 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) {
   Value *Sub = SI->getCondition();
   if (!Offset->isNullValue())
     Sub = Builder.CreateAdd(Sub, Offset, Sub->getName()+".off");
-  Value *Cmp = Builder.CreateICmpULT(Sub, NumCases, "switch");
+  Value *Cmp;
+  // If NumCases overflowed, then all possible values jump to the successor.
+  if (NumCases->isNullValue() && SI->getNumCases() != 0)
+    Cmp = ConstantInt::getTrue(SI->getContext());
+  else
+    Cmp = Builder.CreateICmpULT(Sub, NumCases, "switch");
   BranchInst *NewBI = Builder.CreateCondBr(
       Cmp, SI->case_begin().getCaseSuccessor(), SI->getDefaultDest());
 
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index c231704..6bea2dd 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -1518,6 +1518,12 @@ struct FPrintFOpt : public LibCallOptimization {
     if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr))
       return 0;
 
+    // Do not do any of the following transformations if the fprintf return
+    // value is used, in general the fprintf return value is not compatible
+    // with fwrite(), fputc() or fputs().
+    if (!CI->use_empty())
+      return 0;
+
     // fprintf(F, "foo") --> fwrite("foo", 3, 1, F)
     if (CI->getNumArgOperands() == 2) {
       for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
@@ -1527,11 +1533,10 @@ struct FPrintFOpt : public LibCallOptimization {
       // These optimizations require DataLayout.
       if (!TD) return 0;
 
-      Value *NewCI = EmitFWrite(CI->getArgOperand(1),
-                                ConstantInt::get(TD->getIntPtrType(*Context),
-                                                 FormatStr.size()),
-                                CI->getArgOperand(0), B, TD, TLI);
-      return NewCI ? ConstantInt::get(CI->getType(), FormatStr.size()) : 0;
+      return EmitFWrite(CI->getArgOperand(1),
+                        ConstantInt::get(TD->getIntPtrType(*Context),
+                                         FormatStr.size()),
+                        CI->getArgOperand(0), B, TD, TLI);
     }
 
     // The remaining optimizations require the format string to be "%s" or "%c"
@@ -1544,14 +1549,12 @@ struct FPrintFOpt : public LibCallOptimization {
     if (FormatStr[1] == 'c') {
       // fprintf(F, "%c", chr) --> fputc(chr, F)
       if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0;
-      Value *NewCI = EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B,
-                               TD, TLI);
-      return NewCI ? ConstantInt::get(CI->getType(), 1) : 0;
+      return EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, TD, TLI);
     }
 
     if (FormatStr[1] == 's') {
       // fprintf(F, "%s", str) --> fputs(str, F)
-      if (!CI->getArgOperand(2)->getType()->isPointerTy() || !CI->use_empty())
+      if (!CI->getArgOperand(2)->getType()->isPointerTy())
         return 0;
       return EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TD, TLI);
     }
diff --git a/lib/Transforms/Utils/Utils.cpp b/lib/Transforms/Utils/Utils.cpp
index 5812d46..c3df215 100644
--- a/lib/Transforms/Utils/Utils.cpp
+++ b/lib/Transforms/Utils/Utils.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/InitializePasses.h"
+#include "llvm/PassRegistry.h"
 #include "llvm-c/Initialization.h"
 
 using namespace llvm;
diff --git a/lib/Transforms/Vectorize/Android.mk b/lib/Transforms/Vectorize/Android.mk
index de03793..58698fe 100644
--- a/lib/Transforms/Vectorize/Android.mk
+++ b/lib/Transforms/Vectorize/Android.mk
@@ -3,7 +3,9 @@ LOCAL_PATH:= $(call my-dir)
 transforms_vectorize_SRC_FILES := \
   BBVectorize.cpp \
   LoopVectorize.cpp \
-  Vectorize.cpp
+  SLPVectorizer.cpp \
+  Vectorize.cpp \
+  VecUtils.cpp
 
 # For the host
 # =====================================================
diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt
index e64034a..7ae082f 100644
--- a/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/lib/Transforms/Vectorize/CMakeLists.txt
@@ -2,6 +2,8 @@ add_llvm_library(LLVMVectorize
   BBVectorize.cpp
   Vectorize.cpp
   LoopVectorize.cpp
+  SLPVectorizer.cpp
+  VecUtils.cpp
   )
 
 add_dependencies(LLVMVectorize intrinsics_gen)
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 930d9c4..9a832f7 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -78,6 +78,7 @@
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/PatternMatch.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/Scalar.h"
@@ -87,6 +88,7 @@
 #include <map>
 
 using namespace llvm;
+using namespace llvm::PatternMatch;
 
 static cl::opt<unsigned>
 VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
@@ -112,9 +114,9 @@ TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
 /// We don't unroll loops with a known constant trip count below this number.
 static const unsigned TinyTripCountUnrollThreshold = 128;
 
-/// When performing a runtime memory check, do not check more than this
-/// number of pointers. Notice that the check is quadratic!
-static const unsigned RuntimeMemoryCheckThreshold = 4;
+/// When performing memory disambiguation checks at runtime do not make more
+/// than this number of comparisons.
+static const unsigned RuntimeMemoryCheckThreshold = 8;
 
 /// We use a metadata with this name  to indicate that a scalar loop was
 /// vectorized and that we don't need to re-vectorize it if we run into it
@@ -343,6 +345,7 @@ public:
     RK_IntegerOr,   ///< Bitwise or logical OR of numbers.
     RK_IntegerAnd,  ///< Bitwise or logical AND of numbers.
     RK_IntegerXor,  ///< Bitwise or logical XOR of numbers.
+    RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()).
     RK_FloatAdd,    ///< Sum of floats.
     RK_FloatMult    ///< Product of floats.
   };
@@ -356,13 +359,23 @@ public:
     IK_ReversePtrInduction  ///< Reverse ptr indvar. Step = - sizeof(elem).
   };
 
+  // This enum represents the kind of minmax reduction.
+  enum MinMaxReductionKind {
+    MRK_Invalid,
+    MRK_UIntMin,
+    MRK_UIntMax,
+    MRK_SIntMin,
+    MRK_SIntMax
+  };
+
   /// This POD struct holds information about reduction variables.
   struct ReductionDescriptor {
     ReductionDescriptor() : StartValue(0), LoopExitInstr(0),
-      Kind(RK_NoReduction) {}
+      Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {}
 
-    ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K)
-        : StartValue(Start), LoopExitInstr(Exit), Kind(K) {}
+    ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K,
+                        MinMaxReductionKind MK)
+        : StartValue(Start), LoopExitInstr(Exit), Kind(K), MinMaxKind(MK) {}
 
     // The starting value of the reduction.
     // It does not have to be zero!
@@ -371,6 +384,25 @@ public:
     Instruction *LoopExitInstr;
     // The kind of the reduction.
     ReductionKind Kind;
+    // If this a min/max reduction the kind of reduction.
+    MinMaxReductionKind MinMaxKind;
+  };
+
+  /// This POD struct holds information about a potential reduction operation.
+  struct ReductionInstDesc {
+    ReductionInstDesc(bool IsRedux, Instruction *I) :
+      IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid) {}
+
+    ReductionInstDesc(Instruction *I, MinMaxReductionKind K) :
+      IsReduction(true), PatternLastInst(I), MinMaxKind(K) {}
+
+    // Is this instruction a reduction candidate.
+    bool IsReduction;
+    // The last instruction in a min/max pattern (select of the select(icmp())
+    // pattern), or the current reduction instruction otherwise.
+    Instruction *PatternLastInst;
+    // If this is a min/max pattern the comparison predicate.
+    MinMaxReductionKind MinMaxKind;
   };
 
   // This POD struct holds information about the memory runtime legality
@@ -387,7 +419,7 @@ public:
     }
 
     /// Insert a pointer and calculate the start and end SCEVs.
-    void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr);
+    void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr);
 
     /// This flag indicates if we need to add the runtime check.
     bool Need;
@@ -397,6 +429,8 @@ public:
     SmallVector<const SCEV*, 2> Starts;
     /// Holds the pointer value at the end of the loop.
     SmallVector<const SCEV*, 2> Ends;
+    /// Holds the information if this pointer is used for writing to memory.
+    SmallVector<bool, 2> IsWritePtr;
   };
 
   /// A POD for saving information about induction variables.
@@ -461,6 +495,11 @@ public:
 
   /// Returns the information that we collected about runtime memory check.
   RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; }
+
+  /// This function returns the identity element (or neutral element) for
+  /// the operation K.
+  static Constant *getReductionIdentity(ReductionKind K, Type *Tp,
+                                        MinMaxReductionKind MinMaxK);
 private:
   /// Check if a single basic block loop is vectorizable.
   /// At this point we know that this is a loop with a constant trip count
@@ -487,9 +526,17 @@ private:
   /// Returns True, if 'Phi' is the kind of reduction variable for type
   /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
   bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
-  /// Returns true if the instruction I can be a reduction variable of type
-  /// 'Kind'.
-  bool isReductionInstr(Instruction *I, ReductionKind Kind);
+  /// Returns a struct describing if the instruction 'I' can be a reduction
+  /// variable of type 'Kind'. If the reduction is a min/max pattern of
+  /// select(icmp()) this function advances the instruction pointer 'I' from the
+  /// compare instruction to the select instruction and stores this pointer in
+  /// 'PatternLastInst' member of the returned struct.
+  ReductionInstDesc isReductionInstr(Instruction *I, ReductionKind Kind,
+                                     ReductionInstDesc &Desc);
+  /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction
+  /// pattern corresponding to a min(X, Y) or max(X, Y).
+  static ReductionInstDesc isMinMaxSelectCmpPattern(Instruction *I,
+                                                    ReductionInstDesc &Prev);
   /// Returns the induction kind of Phi. This function may return NoInduction
   /// if the PHI is not an induction variable.
   InductionKind isInductionVariable(PHINode *Phi);
@@ -662,6 +709,11 @@ struct LoopVectorize : public LoopPass {
     AA = getAnalysisIfAvailable<AliasAnalysis>();
     TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
 
+    if (DL == NULL) {
+      DEBUG(dbgs() << "LV: Not vectorizing because of missing data layout");
+      return false;
+    }
+
     DEBUG(dbgs() << "LV: Checking a loop in \"" <<
           L->getHeader()->getParent()->getName() << "\"\n");
 
@@ -737,7 +789,8 @@ struct LoopVectorize : public LoopPass {
 
 void
 LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE,
-                                                       Loop *Lp, Value *Ptr) {
+                                                       Loop *Lp, Value *Ptr,
+                                                       bool WritePtr) {
   const SCEV *Sc = SE->getSCEV(Ptr);
   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
   assert(AR && "Invalid addrec expression");
@@ -746,6 +799,7 @@ LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE,
   Pointers.push_back(Ptr);
   Starts.push_back(AR->getStart());
   Ends.push_back(ScEnd);
+  IsWritePtr.push_back(WritePtr);
 }
 
 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
@@ -906,12 +960,18 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
   Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
   unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
 
+  unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
+  unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;
+
+  if (ScalarAllocatedSize != VectorElementSize)
+    return scalarizeInstruction(Instr);
+
   // If the pointer is loop invariant or if it is non consecutive,
   // scalarize the load.
-  int Stride = Legal->isConsecutivePtr(Ptr);
-  bool Reverse = Stride < 0;
+  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
+  bool Reverse = ConsecutiveStride < 0;
   bool UniformLoad = LI && Legal->isUniform(Ptr);
-  if (Stride == 0 || UniformLoad)
+  if (!ConsecutiveStride || UniformLoad)
     return scalarizeInstruction(Instr);
 
   Constant *Zero = Builder.getInt32(0);
@@ -1040,10 +1100,10 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
   // Create a new entry in the WidenMap and initialize it to Undef or Null.
   VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
 
-  // For each scalar that we create:
-  for (unsigned Width = 0; Width < VF; ++Width) {
-    // For each vector unroll 'part':
-    for (unsigned Part = 0; Part < UF; ++Part) {
+  // For each vector unroll 'part':
+  for (unsigned Part = 0; Part < UF; ++Part) {
+    // For each scalar that we create:
+    for (unsigned Width = 0; Width < VF; ++Width) {
       Instruction *Cloned = Instr->clone();
       if (!IsVoidRetTy)
         Cloned->setName(Instr->getName() + ".cloned");
@@ -1110,6 +1170,10 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
 
   for (unsigned i = 0; i < NumPointers; ++i) {
     for (unsigned j = i+1; j < NumPointers; ++j) {
+      // No need to check if two readonly pointers intersect.
+      if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j])
+        continue;
+
       Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc");
       Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc");
       Value *End0 =   ChkBuilder.CreateBitCast(Ends[i],   PtrArithTy, "bc");
@@ -1436,26 +1500,45 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
 
 /// This function returns the identity element (or neutral element) for
 /// the operation K.
-static Constant*
-getReductionIdentity(LoopVectorizationLegality::ReductionKind K, Type *Tp) {
+Constant*
+LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp,
+                                                MinMaxReductionKind MinMaxK) {
   switch (K) {
-  case LoopVectorizationLegality:: RK_IntegerXor:
-  case LoopVectorizationLegality:: RK_IntegerAdd:
-  case LoopVectorizationLegality:: RK_IntegerOr:
+  case RK_IntegerXor:
+  case RK_IntegerAdd:
+  case RK_IntegerOr:
     // Adding, Xoring, Oring zero to a number does not change it.
     return ConstantInt::get(Tp, 0);
-  case LoopVectorizationLegality:: RK_IntegerMult:
+  case RK_IntegerMult:
     // Multiplying a number by 1 does not change it.
     return ConstantInt::get(Tp, 1);
-  case LoopVectorizationLegality:: RK_IntegerAnd:
+  case RK_IntegerAnd:
     // AND-ing a number with an all-1 value does not change it.
     return ConstantInt::get(Tp, -1, true);
-  case LoopVectorizationLegality:: RK_FloatMult:
+  case  RK_FloatMult:
     // Multiplying a number by 1 does not change it.
     return ConstantFP::get(Tp, 1.0L);
-  case LoopVectorizationLegality:: RK_FloatAdd:
+  case  RK_FloatAdd:
     // Adding zero to a number does not change it.
     return ConstantFP::get(Tp, 0.0L);
+  case  RK_IntegerMinMax:
+    switch(MinMaxK) {
+    default: llvm_unreachable("Unknown min/max predicate");
+    case MRK_UIntMin:
+      return ConstantInt::getAllOnesValue(Tp);
+    case MRK_UIntMax:
+      return ConstantInt::get(Tp, 0);
+    case MRK_SIntMin: {
+      unsigned BitWidth = Tp->getPrimitiveSizeInBits();
+      return ConstantInt::get(Tp->getContext(),
+                              APInt::getSignedMaxValue(BitWidth));
+    }
+    case LoopVectorizationLegality::MRK_SIntMax: {
+      unsigned BitWidth = Tp->getPrimitiveSizeInBits();
+      return ConstantInt::get(Tp->getContext(),
+                              APInt::getSignedMinValue(BitWidth));
+    }
+    }
   default:
     llvm_unreachable("Unknown reduction kind");
   }
@@ -1566,7 +1649,7 @@ getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
 }
 
 /// This function translates the reduction kind to an LLVM binary operator.
-static Instruction::BinaryOps
+static unsigned
 getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) {
   switch (Kind) {
     case LoopVectorizationLegality::RK_IntegerAdd:
@@ -1583,11 +1666,38 @@ getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) {
       return Instruction::FMul;
     case LoopVectorizationLegality::RK_FloatAdd:
       return Instruction::FAdd;
+    case LoopVectorizationLegality::RK_IntegerMinMax:
+      return Instruction::ICmp;
     default:
       llvm_unreachable("Unknown reduction operation");
   }
 }
 
+Value *createMinMaxOp(IRBuilder<> &Builder,
+                      LoopVectorizationLegality::MinMaxReductionKind RK,
+                      Value *Left,
+                      Value *Right) {
+  CmpInst::Predicate P = CmpInst::ICMP_NE;
+  switch (RK) {
+  default:
+    llvm_unreachable("Unknown min/max reduction kind");
+  case LoopVectorizationLegality::MRK_UIntMin:
+    P = CmpInst::ICMP_ULT;
+    break;
+  case LoopVectorizationLegality::MRK_UIntMax:
+    P = CmpInst::ICMP_UGT;
+    break;
+  case LoopVectorizationLegality::MRK_SIntMin:
+    P = CmpInst::ICMP_SLT;
+    break;
+  case LoopVectorizationLegality::MRK_SIntMax:
+    P = CmpInst::ICMP_SGT;
+  }
+  Value *Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp");
+  Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select");
+  return Select;
+}
+
 void
 InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
   //===------------------------------------------------===//
@@ -1651,7 +1761,10 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
 
     // Find the reduction identity variable. Zero for addition, or, xor,
     // one for multiplication, -1 for And.
-    Constant *Iden = getReductionIdentity(RdxDesc.Kind, VecTy->getScalarType());
+    Constant *Iden =
+      LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind,
+                                                      VecTy->getScalarType(),
+                                                      RdxDesc.MinMaxKind);
     Constant *Identity = ConstantVector::getSplat(VF, Iden);
 
     // This vector is the Identity vector where the first element is the
@@ -1699,10 +1812,15 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
 
     // Reduce all of the unrolled parts into a single vector.
     Value *ReducedPartRdx = RdxParts[0];
+    unsigned Op = getReductionBinOp(RdxDesc.Kind);
     for (unsigned part = 1; part < UF; ++part) {
-      Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind);
-      ReducedPartRdx = Builder.CreateBinOp(Op, RdxParts[part], ReducedPartRdx,
-                                           "bin.rdx");
+      if (Op != Instruction::ICmp)
+        ReducedPartRdx = Builder.CreateBinOp((Instruction::BinaryOps)Op,
+                                             RdxParts[part], ReducedPartRdx,
+                                             "bin.rdx");
+      else
+        ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind,
+                                        ReducedPartRdx, RdxParts[part]);
     }
 
     // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
@@ -1727,8 +1845,11 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
                                     ConstantVector::get(ShuffleMask),
                                     "rdx.shuf");
 
-      Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind);
-      TmpVec = Builder.CreateBinOp(Op, TmpVec, Shuf, "bin.rdx");
+      if (Op != Instruction::ICmp)
+        TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf,
+                                     "bin.rdx");
+      else
+        TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf);
     }
 
     // The result is in the first element of the vector.
@@ -2315,6 +2436,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
           DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
           continue;
         }
+        if (AddReductionVar(Phi, RK_IntegerMinMax)) {
+          DEBUG(dbgs() << "LV: Found a MINMAX reduction PHI."<< *Phi <<"\n");
+          continue;
+        }
         if (AddReductionVar(Phi, RK_FloatMult)) {
           DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n");
           continue;
@@ -2442,13 +2567,6 @@ LoopVectorizationLegality::hasPossibleGlobalWriteReorder(
 
 bool LoopVectorizationLegality::canVectorizeMemory() {
 
-  if (TheLoop->isAnnotatedParallel()) {
-    DEBUG(dbgs()
-          << "LV: A loop annotated parallel, ignore memory dependency "
-          << "checks.\n");
-    return true;
-  }
-
   typedef SmallVector<Value*, 16> ValueVector;
   typedef SmallPtrSet<Value*, 16> ValueSet;
   // Holds the Load and Store *instructions*.
@@ -2457,6 +2575,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
   PtrRtCheck.Pointers.clear();
   PtrRtCheck.Need = false;
 
+  const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
+
   // For each block.
   for (Loop::block_iterator bb = TheLoop->block_begin(),
        be = TheLoop->block_end(); bb != be; ++bb) {
@@ -2471,7 +2591,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
       if (it->mayReadFromMemory()) {
         LoadInst *Ld = dyn_cast<LoadInst>(it);
         if (!Ld) return false;
-        if (!Ld->isSimple()) {
+        if (!Ld->isSimple() && !IsAnnotatedParallel) {
           DEBUG(dbgs() << "LV: Found a non-simple load.\n");
           return false;
         }
@@ -2483,7 +2603,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
       if (it->mayWriteToMemory()) {
         StoreInst *St = dyn_cast<StoreInst>(it);
         if (!St) return false;
-        if (!St->isSimple()) {
+        if (!St->isSimple() && !IsAnnotatedParallel) {
           DEBUG(dbgs() << "LV: Found a non-simple store.\n");
           return false;
         }
@@ -2530,6 +2650,13 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
       ReadWrites.insert(std::make_pair(Ptr, ST));
   }
 
+  if (IsAnnotatedParallel) {
+    DEBUG(dbgs()
+          << "LV: A loop annotated parallel, ignore memory dependency "
+          << "checks.\n");
+    return true;
+  }
+
   for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
     LoadInst *LD = cast<LoadInst>(*I);
     Value* Ptr = LD->getPointerOperand();
@@ -2552,6 +2679,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     return true;
   }
 
+  unsigned NumReadPtrs = 0;
+  unsigned NumWritePtrs = 0;
+
   // Find pointers with computable bounds. We are going to use this information
   // to place a runtime bound check.
   bool CanDoRT = true;
@@ -2559,7 +2689,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
   for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) {
     Value *V = (*MI).first;
     if (hasComputableBounds(V)) {
-      PtrRtCheck.insert(SE, TheLoop, V);
+      PtrRtCheck.insert(SE, TheLoop, V, true);
+      NumWritePtrs++;
       DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
     } else {
       CanDoRT = false;
@@ -2569,7 +2700,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
   for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) {
     Value *V = (*MI).first;
     if (hasComputableBounds(V)) {
-      PtrRtCheck.insert(SE, TheLoop, V);
+      PtrRtCheck.insert(SE, TheLoop, V, false);
+      NumReadPtrs++;
       DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
     } else {
       CanDoRT = false;
@@ -2579,7 +2711,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
 
   // Check that we did not collect too many pointers or found a
   // unsizeable pointer.
-  if (!CanDoRT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) {
+  unsigned NumComparisons = (NumWritePtrs * (NumReadPtrs + NumWritePtrs - 1));
+  DEBUG(dbgs() << "LV: We need to compare " << NumComparisons << " ptrs.\n");
+  if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
     PtrRtCheck.reset();
     CanDoRT = false;
   }
@@ -2733,7 +2867,18 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
   // used as reduction variables (such as ADD). We may have a single
   // out-of-block user. The cycle must end with the original PHI.
   Instruction *Iter = Phi;
-  while (true) {
+
+  // To recognize min/max patterns formed by a icmp select sequence, we store
+  // the number of instruction we saw from the recognized min/max pattern,
+  // such that we don't stop when we see the phi has two uses (one by the select
+  // and one by the icmp) and to make sure we only see exactly the two
+  // instructions.
+  unsigned NumICmpSelectPatternInst = 0;
+  ReductionInstDesc ReduxDesc(false, 0);
+
+  // Avoid cycles in the chain.
+  SmallPtrSet<Instruction *, 8> VisitedInsts;
+  while (VisitedInsts.insert(Iter)) {
     // If the instruction has no users then this is a broken
     // chain and can't be a reduction variable.
     if (Iter->use_empty())
@@ -2747,9 +2892,6 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
     // Is this a bin op ?
     FoundBinOp |= !isa<PHINode>(Iter);
 
-    // Remember the current instruction.
-    Instruction *OldIter = Iter;
-
     // For each of the *users* of iter.
     for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
          it != e; ++it) {
@@ -2778,25 +2920,33 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
           Iter->hasNUsesOrMore(2))
         continue;
 
-      // We can't have multiple inside users.
-      if (FoundInBlockUser)
+      // We can't have multiple inside users except for a combination of
+      // icmp/select both using the phi.
+      if (FoundInBlockUser && !NumICmpSelectPatternInst)
         return false;
       FoundInBlockUser = true;
 
       // Any reduction instr must be of one of the allowed kinds.
-      if (!isReductionInstr(U, Kind))
+      ReduxDesc = isReductionInstr(U, Kind, ReduxDesc);
+      if (!ReduxDesc.IsReduction)
         return false;
 
+      if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(U) ||
+                                       isa<SelectInst>(U)))
+          ++NumICmpSelectPatternInst;
+
       // Reductions of instructions such as Div, and Sub is only
       // possible if the LHS is the reduction variable.
-      if (!U->isCommutative() && !isa<PHINode>(U) && U->getOperand(0) != Iter)
+      if (!U->isCommutative() && !isa<PHINode>(U) && !isa<SelectInst>(U) &&
+          !isa<ICmpInst>(U) && U->getOperand(0) != Iter)
         return false;
 
-      Iter = U;
+      Iter = ReduxDesc.PatternLastInst;
     }
 
-    // If all uses were skipped this can't be a reduction variable.
-    if (Iter == OldIter)
+    // This means we have seen one but not the other instruction of the
+    // pattern or more than just a select and cmp.
+    if (Kind == RK_IntegerMinMax && NumICmpSelectPatternInst != 2)
       return false;
 
     // We found a reduction var if we have reached the original
@@ -2807,47 +2957,94 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
       AllowedExit.insert(ExitInstruction);
 
       // Save the description of this reduction variable.
-      ReductionDescriptor RD(RdxStart, ExitInstruction, Kind);
+      ReductionDescriptor RD(RdxStart, ExitInstruction, Kind,
+                             ReduxDesc.MinMaxKind);
       Reductions[Phi] = RD;
       // We've ended the cycle. This is a reduction variable if we have an
       // outside user and it has a binary op.
       return FoundBinOp && ExitInstruction;
     }
   }
+
+  return false;
 }
 
-bool
+/// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction
+/// pattern corresponding to a min(X, Y) or max(X, Y).
+LoopVectorizationLegality::ReductionInstDesc
+LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I, ReductionInstDesc &Prev) {
+
+  assert((isa<ICmpInst>(I) || isa<SelectInst>(I)) &&
+         "Expect a select instruction");
+  ICmpInst *Cmp = 0;
+  SelectInst *Select = 0;
+
+  // We must handle the select(cmp()) as a single instruction. Advance to the
+  // select.
+  if ((Cmp = dyn_cast<ICmpInst>(I))) {
+    if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->use_begin())))
+      return ReductionInstDesc(false, I);
+    return ReductionInstDesc(Select, Prev.MinMaxKind);
+  }
+
+  // Only handle single use cases for now.
+  if (!(Select = dyn_cast<SelectInst>(I)))
+    return ReductionInstDesc(false, I);
+  if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))))
+    return ReductionInstDesc(false, I);
+  if (!Cmp->hasOneUse())
+    return ReductionInstDesc(false, I);
+
+  Value *CmpLeft = Cmp->getOperand(0);
+  Value *CmpRight = Cmp->getOperand(1);
+
+  // Look for a min/max pattern.
+  if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+    return ReductionInstDesc(Select, MRK_UIntMin);
+  else if (m_UMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+    return ReductionInstDesc(Select, MRK_UIntMax);
+  else if (m_SMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+    return ReductionInstDesc(Select, MRK_SIntMax);
+  else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+    return ReductionInstDesc(Select, MRK_SIntMin);
+
+  return ReductionInstDesc(false, I);
+}
+
+LoopVectorizationLegality::ReductionInstDesc
 LoopVectorizationLegality::isReductionInstr(Instruction *I,
-                                            ReductionKind Kind) {
+                                            ReductionKind Kind,
+                                            ReductionInstDesc &Prev) {
   bool FP = I->getType()->isFloatingPointTy();
   bool FastMath = (FP && I->isCommutative() && I->isAssociative());
-
   switch (I->getOpcode()) {
   default:
-    return false;
+    return ReductionInstDesc(false, I);
   case Instruction::PHI:
       if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd))
-        return false;
-    // possibly.
-    return true;
+        return ReductionInstDesc(false, I);
+    return ReductionInstDesc(I, Prev.MinMaxKind);
   case Instruction::Sub:
   case Instruction::Add:
-    return Kind == RK_IntegerAdd;
-  case Instruction::SDiv:
-  case Instruction::UDiv:
+    return ReductionInstDesc(Kind == RK_IntegerAdd, I);
   case Instruction::Mul:
-    return Kind == RK_IntegerMult;
+    return ReductionInstDesc(Kind == RK_IntegerMult, I);
   case Instruction::And:
-    return Kind == RK_IntegerAnd;
+    return ReductionInstDesc(Kind == RK_IntegerAnd, I);
   case Instruction::Or:
-    return Kind == RK_IntegerOr;
+    return ReductionInstDesc(Kind == RK_IntegerOr, I);
   case Instruction::Xor:
-    return Kind == RK_IntegerXor;
+    return ReductionInstDesc(Kind == RK_IntegerXor, I);
   case Instruction::FMul:
-    return Kind == RK_FloatMult && FastMath;
+    return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I);
   case Instruction::FAdd:
-    return Kind == RK_FloatAdd && FastMath;
-   }
+    return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I);
+  case Instruction::ICmp:
+  case Instruction::Select:
+    if (Kind != RK_IntegerMinMax)
+      return ReductionInstDesc(false, I);
+    return isMinMaxSelectCmpPattern(I, Prev);
+  }
 }
 
 LoopVectorizationLegality::InductionKind
@@ -3331,8 +3528,19 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   case Instruction::AShr:
   case Instruction::And:
   case Instruction::Or:
-  case Instruction::Xor:
-    return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy);
+  case Instruction::Xor: {
+    // Certain instructions can be cheaper to vectorize if they have a constant
+    // second vector operand. One example of this are shifts on x86.
+    TargetTransformInfo::OperandValueKind Op1VK =
+      TargetTransformInfo::OK_AnyValue;
+    TargetTransformInfo::OperandValueKind Op2VK =
+      TargetTransformInfo::OK_AnyValue;
+
+    if (isa<ConstantInt>(I->getOperand(1)))
+      Op2VK = TargetTransformInfo::OK_UniformConstantValue;
+
+    return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK);
+  }
   case Instruction::Select: {
     SelectInst *SI = cast<SelectInst>(I);
     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
@@ -3369,9 +3577,11 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
         TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
 
     // Scalarized loads/stores.
-    int Stride = Legal->isConsecutivePtr(Ptr);
-    bool Reverse = Stride < 0;
-    if (0 == Stride) {
+    int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
+    bool Reverse = ConsecutiveStride < 0;
+    unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy);
+    unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF;
+    if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) {
       unsigned Cost = 0;
       // The cost of extracting from the value vector and pointer vector.
       Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
new file mode 100644
index 0000000..cc30cc9
--- /dev/null
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -0,0 +1,348 @@
+//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
+// stores that can be put together into vector-stores. Next, it attempts to
+// construct vectorizable tree using the use-def chains. If a profitable tree
+// was found, the SLP vectorizer performs vectorization on the tree.
+//
+// The pass is inspired by the work described in the paper:
+//  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
+//
+//===----------------------------------------------------------------------===//
+#define SV_NAME "slp-vectorizer"
+#define DEBUG_TYPE SV_NAME
+
+#include "VecUtils.h"
+#include "llvm/Transforms/Vectorize.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <map>
+
+using namespace llvm;
+
+static cl::opt<int>
+SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
+                 cl::desc("Only vectorize trees if the gain is above this "
+                          "number. (gain = -cost of vectorization)"));
+namespace {
+
+/// The SLPVectorizer Pass.
+struct SLPVectorizer : public FunctionPass {
+  typedef std::map<Value*, BoUpSLP::StoreList> StoreListMap;
+
+  /// Pass identification, replacement for typeid
+  static char ID;
+
+  explicit SLPVectorizer() : FunctionPass(ID) {
+    initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
+  }
+
+  ScalarEvolution *SE;
+  DataLayout *DL;
+  TargetTransformInfo *TTI;
+  AliasAnalysis *AA;
+  LoopInfo *LI;
+
+  virtual bool runOnFunction(Function &F) {
+    SE = &getAnalysis<ScalarEvolution>();
+    DL = getAnalysisIfAvailable<DataLayout>();
+    TTI = &getAnalysis<TargetTransformInfo>();
+    AA = &getAnalysis<AliasAnalysis>();
+    LI = &getAnalysis<LoopInfo>();
+
+    StoreRefs.clear();
+    bool Changed = false;
+
+    // Must have DataLayout. We can't require it because some tests run w/o
+    // triple.
+    if (!DL)
+      return false;
+
+    for (Function::iterator it = F.begin(), e = F.end(); it != e; ++it) {
+      BasicBlock *BB = it;
+      bool BBChanged = false;
+
+      // Use the bollom up slp vectorizer to construct chains that start with
+      // he store instructions.
+      BoUpSLP R(BB, SE, DL, TTI, AA, LI->getLoopFor(BB));
+
+      // Vectorize trees that end at reductions.
+      BBChanged |= vectorizeReductions(BB, R);
+
+      // Vectorize trees that end at stores.
+      if (unsigned count = collectStores(BB, R)) {
+        (void)count;
+        DEBUG(dbgs()<<"SLP: Found " << count << " stores to vectorize.\n");
+        BBChanged |= vectorizeStoreChains(R);
+      }
+
+      // Try to hoist some of the scalarization code to the preheader.
+      if (BBChanged) hoistGatherSequence(LI, BB, R);
+
+      Changed |= BBChanged;
+    }
+
+    if (Changed) {
+      DEBUG(dbgs()<<"SLP: vectorized \""<<F.getName()<<"\"\n");
+      DEBUG(verifyFunction(F));
+    }
+    return Changed;
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    FunctionPass::getAnalysisUsage(AU);
+    AU.addRequired<ScalarEvolution>();
+    AU.addRequired<AliasAnalysis>();
+    AU.addRequired<TargetTransformInfo>();
+    AU.addRequired<LoopInfo>();
+  }
+
+private:
+
+  /// \brief Collect memory references and sort them according to their base
+  /// object. We sort the stores to their base objects to reduce the cost of the
+  /// quadratic search on the stores. TODO: We can further reduce this cost
+  /// if we flush the chain creation every time we run into a memory barrier.
+  unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
+
+  /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
+  bool tryToVectorizePair(Value *A, Value *B,  BoUpSLP &R);
+
+  /// \brief Try to vectorize a list of operands.
+  bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R);
+
+  /// \brief Try to vectorize a chain that may start at the operands of \V;
+  bool tryToVectorize(BinaryOperator *V,  BoUpSLP &R);
+
+  /// \brief Vectorize the stores that were collected in StoreRefs.
+  bool vectorizeStoreChains(BoUpSLP &R);
+
+  /// \brief Try to hoist gather sequences outside of the loop in cases where
+  /// all of the sources are loop invariant.
+  void hoistGatherSequence(LoopInfo *LI, BasicBlock *BB, BoUpSLP &R);
+
+  /// \brief Scan the basic block and look for reductions that may start a
+  /// vectorization chain.
+  bool vectorizeReductions(BasicBlock *BB, BoUpSLP &R);
+
+private:
+  StoreListMap StoreRefs;
+};
+
+unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
+  unsigned count = 0;
+  StoreRefs.clear();
+  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+    StoreInst *SI = dyn_cast<StoreInst>(it);
+    if (!SI)
+      continue;
+
+    // Check that the pointer points to scalars.
+    Type *Ty = SI->getValueOperand()->getType();
+    if (Ty->isAggregateType() || Ty->isVectorTy())
+      return 0;
+
+    // Find the base of the GEP.
+    Value *Ptr = SI->getPointerOperand();
+    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
+      Ptr = GEP->getPointerOperand();
+
+    // Save the store locations.
+    StoreRefs[Ptr].push_back(SI);
+    count++;
+  }
+  return count;
+}
+
+bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B,  BoUpSLP &R) {
+  if (!A || !B) return false;
+  Value *VL[] = { A, B };
+  return tryToVectorizeList(VL, R);
+}
+
+bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) {
+  DEBUG(dbgs()<<"SLP: Vectorizing a list of length = " << VL.size() << ".\n");
+
+  // Check that all of the parts are scalar.
+  for (int i = 0, e = VL.size(); i < e; ++i) {
+    Type *Ty = VL[i]->getType();
+    if (Ty->isAggregateType() || Ty->isVectorTy())
+      return 0;
+  }
+
+  int Cost = R.getTreeCost(VL);
+  int ExtrCost = R.getScalarizationCost(VL);
+  DEBUG(dbgs()<<"SLP: Cost of pair:" << Cost <<
+        " Cost of extract:" << ExtrCost << ".\n");
+  if ((Cost+ExtrCost) >= -SLPCostThreshold) return false;
+  DEBUG(dbgs()<<"SLP: Vectorizing pair.\n");
+  R.vectorizeArith(VL);
+  return true;
+}
+
+bool SLPVectorizer::tryToVectorize(BinaryOperator *V,  BoUpSLP &R) {
+  if (!V) return false;
+  // Try to vectorize V.
+  if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
+    return true;
+
+  BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
+  BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
+  // Try to skip B.
+  if (B && B->hasOneUse()) {
+    BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
+    BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
+    if (tryToVectorizePair(A, B0, R)) {
+      B->moveBefore(V);
+      return true;
+    }
+    if (tryToVectorizePair(A, B1, R)) {
+      B->moveBefore(V);
+      return true;
+    }
+  }
+
+  // Try to skip A.
+  if (A && A->hasOneUse()) {
+    BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
+    BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
+    if (tryToVectorizePair(A0, B, R)) {
+      A->moveBefore(V);
+      return true;
+    }
+    if (tryToVectorizePair(A1, B, R)) {
+      A->moveBefore(V);
+      return true;
+    }
+  }
+  return 0;
+}
+
+bool SLPVectorizer::vectorizeReductions(BasicBlock *BB, BoUpSLP &R) {
+  bool Changed = false;
+  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+    if (isa<DbgInfoIntrinsic>(it)) continue;
+
+    // Try to vectorize reductions that use PHINodes.
+    if (PHINode *P = dyn_cast<PHINode>(it)) {
+      // Check that the PHI is a reduction PHI.
+      if (P->getNumIncomingValues() != 2) return Changed;
+      Value *Rdx = (P->getIncomingBlock(0) == BB ? P->getIncomingValue(0) :
+                    (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) :
+                     0));
+      // Check if this is a Binary Operator.
+      BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
+      if (!BI)
+        continue;
+
+      Value *Inst = BI->getOperand(0);
+      if (Inst == P) Inst = BI->getOperand(1);
+      Changed |= tryToVectorize(dyn_cast<BinaryOperator>(Inst), R);
+      continue;
+    }
+
+    // Try to vectorize trees that start at compare instructions.
+    if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
+      if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
+        Changed |= true;
+        continue;
+      }
+      for (int i = 0; i < 2; ++i)
+        if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i)))
+          Changed |= tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R);
+      continue;
+    }
+  }
+
+  return Changed;
+}
+
+bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
+  bool Changed = false;
+  // Attempt to sort and vectorize each of the store-groups.
+  for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
+       it != e; ++it) {
+    if (it->second.size() < 2)
+      continue;
+
+    DEBUG(dbgs()<<"SLP: Analyzing a store chain of length " <<
+          it->second.size() << ".\n");
+
+    Changed |= R.vectorizeStores(it->second, -SLPCostThreshold);
+  }
+  return Changed;
+}
+
+void SLPVectorizer::hoistGatherSequence(LoopInfo *LI, BasicBlock *BB,
+                                        BoUpSLP &R) {
+  // Check if this block is inside a loop.
+  Loop *L = LI->getLoopFor(BB);
+  if (!L)
+    return;
+
+  // Check if it has a preheader.
+  BasicBlock *PreHeader = L->getLoopPreheader();
+  if (!PreHeader)
+    return;
+
+  // Mark the insertion point for the block.
+  Instruction *Location = PreHeader->getTerminator();
+
+  BoUpSLP::ValueList &Gathers = R.getGatherSeqInstructions();
+  for (BoUpSLP::ValueList::iterator it = Gathers.begin(), e = Gathers.end();
+       it != e; ++it) {
+    InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it);
+
+    // The InsertElement sequence can be simplified into a constant.
+    if (!Insert)
+      continue;
+
+    // If the vector or the element that we insert into it are
+    // instructions that are defined in this basic block then we can't
+    // hoist this instruction.
+    Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
+    Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
+    if (CurrVec && L->contains(CurrVec)) continue;
+    if (NewElem && L->contains(NewElem)) continue;
+
+    // We can hoist this instruction. Move it to the pre-header.
+    Insert->moveBefore(Location);
+  }
+}
+
+} // end anonymous namespace
+
+char SLPVectorizer::ID = 0;
+static const char lv_name[] = "SLP Vectorizer";
+INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
+
+namespace llvm {
+  Pass *createSLPVectorizerPass() {
+    return new SLPVectorizer();
+  }
+}
+
diff --git a/lib/Transforms/Vectorize/VecUtils.cpp b/lib/Transforms/Vectorize/VecUtils.cpp
new file mode 100644
index 0000000..9b94366
--- /dev/null
+++ b/lib/Transforms/Vectorize/VecUtils.cpp
@@ -0,0 +1,730 @@
+//===- VecUtils.cpp --- Vectorization Utilities ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "SLP"
+
+#include "VecUtils.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <map>
+
+using namespace llvm;
+
+static const unsigned MinVecRegSize = 128;
+
+static const unsigned RecursionMaxDepth = 6;
+
+namespace llvm {
+
+BoUpSLP::BoUpSLP(BasicBlock *Bb, ScalarEvolution *S, DataLayout *Dl,
+                 TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp) :
+  BB(Bb), SE(S), DL(Dl), TTI(Tti), AA(Aa), L(Lp)  {
+  numberInstructions();
+}
+
+void BoUpSLP::numberInstructions() {
+  int Loc = 0;
+  InstrIdx.clear();
+  InstrVec.clear();
+  // Number the instructions in the block.
+  for (BasicBlock::iterator it=BB->begin(), e=BB->end(); it != e; ++it) {
+    InstrIdx[it] = Loc++;
+    InstrVec.push_back(it);
+    assert(InstrVec[InstrIdx[it]] == it && "Invalid allocation");
+  }
+}
+
+Value *BoUpSLP::getPointerOperand(Value *I) {
+  if (LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->getPointerOperand();
+  if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->getPointerOperand();
+  return 0;
+}
+
+unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
+  if (LoadInst *L=dyn_cast<LoadInst>(I)) return L->getPointerAddressSpace();
+  if (StoreInst *S=dyn_cast<StoreInst>(I)) return S->getPointerAddressSpace();
+  return -1;
+}
+
+bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
+  Value *PtrA = getPointerOperand(A);
+  Value *PtrB = getPointerOperand(B);
+  unsigned ASA = getAddressSpaceOperand(A);
+  unsigned ASB = getAddressSpaceOperand(B);
+
+  // Check that the address spaces match and that the pointers are valid.
+  if (!PtrA || !PtrB || (ASA != ASB)) return false;
+
+  // Check that A and B are of the same type.
+  if (PtrA->getType() != PtrB->getType()) return false;
+
+  // Calculate the distance.
+  const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
+  const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
+  const SCEV *OffsetSCEV = SE->getMinusSCEV(PtrSCEVA, PtrSCEVB);
+  const SCEVConstant *ConstOffSCEV = dyn_cast<SCEVConstant>(OffsetSCEV);
+
+  // Non constant distance.
+  if (!ConstOffSCEV) return false;
+
+  int64_t Offset = ConstOffSCEV->getValue()->getSExtValue();
+  Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
+  // The Instructions are connsecutive if the size of the first load/store is
+  // the same as the offset.
+  int64_t Sz = DL->getTypeStoreSize(Ty);
+  return ((-Offset) == Sz);
+}
+
+bool BoUpSLP::vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold) {
+  Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
+  unsigned Sz = DL->getTypeSizeInBits(StoreTy);
+  unsigned VF = MinVecRegSize / Sz;
+
+  if (!isPowerOf2_32(Sz) || VF < 2) return false;
+
+  bool Changed = false;
+  // Look for profitable vectorizable trees at all offsets, starting at zero.
+  for (unsigned i = 0, e = Chain.size(); i < e; ++i) {
+    if (i + VF > e) return Changed;
+    DEBUG(dbgs()<<"SLP: Analyzing " << VF << " stores at offset "<< i << "\n");
+    ArrayRef<Value *> Operands = Chain.slice(i, VF);
+
+    int Cost = getTreeCost(Operands);
+    DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
+    if (Cost < CostThreshold) {
+      DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+      vectorizeTree(Operands, VF);
+      i += VF - 1;
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
+bool BoUpSLP::vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold) {
+  ValueSet Heads, Tails;
+  SmallDenseMap<Value*, Value*> ConsecutiveChain;
+
+  // We may run into multiple chains that merge into a single chain. We mark the
+  // stores that we vectorized so that we don't visit the same store twice.
+  ValueSet VectorizedStores;
+  bool Changed = false;
+
+  // Do a quadratic search on all of the given stores and find
+  // all of the pairs of loads that follow each other.
+  for (unsigned i = 0, e = Stores.size(); i < e; ++i)
+    for (unsigned j = 0; j < e; ++j) {
+      if (i == j) continue;
+      if (isConsecutiveAccess(Stores[i], Stores[j])) {
+        Tails.insert(Stores[j]);
+        Heads.insert(Stores[i]);
+        ConsecutiveChain[Stores[i]] = Stores[j];
+      }
+    }
+
+  // For stores that start but don't end a link in the chain:
+  for (ValueSet::iterator it = Heads.begin(), e = Heads.end();it != e; ++it) {
+    if (Tails.count(*it)) continue;
+
+    // We found a store instr that starts a chain. Now follow the chain and try
+    // to vectorize it.
+    ValueList Operands;
+    Value *I = *it;
+    // Collect the chain into a list.
+    while (Tails.count(I) || Heads.count(I)) {
+      if (VectorizedStores.count(I)) break;
+      Operands.push_back(I);
+      // Move to the next value in the chain.
+      I = ConsecutiveChain[I];
+    }
+
+    bool Vectorized = vectorizeStoreChain(Operands, costThreshold);
+
+    // Mark the vectorized stores so that we don't vectorize them again.
+    if (Vectorized)
+      VectorizedStores.insert(Operands.begin(), Operands.end());
+    Changed |= Vectorized;
+  }
+
+  return Changed;
+}
+
+int BoUpSLP::getScalarizationCost(ArrayRef<Value *> VL) {
+  // Find the type of the operands in VL.
+  Type *ScalarTy = VL[0]->getType();
+  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+    ScalarTy = SI->getValueOperand()->getType();
+  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+  // Find the cost of inserting/extracting values from the vector.
+  return getScalarizationCost(VecTy);
+}
+
+int BoUpSLP::getScalarizationCost(Type *Ty) {
+  int Cost = 0;
+  for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
+    Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+  return Cost;
+}
+
+AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) {
+  if (StoreInst *SI = dyn_cast<StoreInst>(I)) return AA->getLocation(SI);
+  if (LoadInst *LI = dyn_cast<LoadInst>(I)) return AA->getLocation(LI);
+  return AliasAnalysis::Location();
+}
+
+Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) {
+  assert(Src->getParent() == Dst->getParent() && "Not the same BB");
+  BasicBlock::iterator I = Src, E = Dst;
+  /// Scan all of the instruction from SRC to DST and check if
+  /// the source may alias.
+  for (++I; I != E; ++I) {
+    // Ignore store instructions that are marked as 'ignore'.
+    if (MemBarrierIgnoreList.count(I)) continue;
+    if (Src->mayWriteToMemory()) /* Write */ {
+      if (!I->mayReadOrWriteMemory()) continue;
+    } else /* Read */ {
+      if (!I->mayWriteToMemory()) continue;
+    }
+    AliasAnalysis::Location A = getLocation(&*I);
+    AliasAnalysis::Location B = getLocation(Src);
+
+    if (!A.Ptr || !B.Ptr || AA->alias(A, B))
+      return I;
+  }
+  return 0;
+}
+
+void BoUpSLP::vectorizeArith(ArrayRef<Value *> Operands) {
+  Value *Vec = vectorizeTree(Operands, Operands.size());
+  BasicBlock::iterator Loc = cast<Instruction>(Vec);
+  IRBuilder<> Builder(++Loc);
+  // After vectorizing the operands we need to generate extractelement
+  // instructions and replace all of the uses of the scalar values with
+  // the values that we extracted from the vectorized tree.
+  for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
+    Value *S = Builder.CreateExtractElement(Vec, Builder.getInt32(i));
+    Operands[i]->replaceAllUsesWith(S);
+  }
+}
+
+int BoUpSLP::getTreeCost(ArrayRef<Value *> VL) {
+  // Get rid of the list of stores that were removed, and from the
+  // lists of instructions with multiple users.
+  MemBarrierIgnoreList.clear();
+  LaneMap.clear();
+  MultiUserVals.clear();
+  MustScalarize.clear();
+
+  // Scan the tree and find which value is used by which lane, and which values
+  // must be scalarized.
+  getTreeUses_rec(VL, 0);
+
+  // Check that instructions with multiple users can be vectorized. Mark unsafe
+  // instructions.
+  for (ValueSet::iterator it = MultiUserVals.begin(),
+       e = MultiUserVals.end(); it != e; ++it) {
+    // Check that all of the users of this instr are within the tree
+    // and that they are all from the same lane.
+    int Lane = -1;
+    for (Value::use_iterator I = (*it)->use_begin(), E = (*it)->use_end();
+         I != E; ++I) {
+      if (LaneMap.find(*I) == LaneMap.end()) {
+        MustScalarize.insert(*it);
+        DEBUG(dbgs()<<"SLP: Adding " << **it <<
+              " to MustScalarize because of an out of tree usage.\n");
+        break;
+      }
+      if (Lane == -1) Lane = LaneMap[*I];
+      if (Lane != LaneMap[*I]) {
+        MustScalarize.insert(*it);
+        DEBUG(dbgs()<<"Adding " << **it <<
+              " to MustScalarize because multiple lane use it: "
+              << Lane << " and " << LaneMap[*I] << ".\n");
+        break;
+      }
+    }
+  }
+
+  // Now calculate the cost of vectorizing the tree.
+  return getTreeCost_rec(VL, 0);
+}
+
+void BoUpSLP::getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth) {
+  if (Depth == RecursionMaxDepth) return;
+
+  // Don't handle vectors.
+  if (VL[0]->getType()->isVectorTy()) return;
+  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+    if (SI->getValueOperand()->getType()->isVectorTy()) return;
+
+  // Check if all of the operands are constants.
+  bool AllConst = true;
+  bool AllSameScalar = true;
+  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+    AllConst &= isa<Constant>(VL[i]);
+    AllSameScalar &= (VL[0] == VL[i]);
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    // If one of the instructions is out of this BB, we need to scalarize all.
+    if (I && I->getParent() != BB) return;
+  }
+
+  // If all of the operands are identical or constant we have a simple solution.
+  if (AllConst || AllSameScalar) return;
+
+  // Scalarize unknown structures.
+  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
+  if (!VL0) return;
+
+  unsigned Opcode = VL0->getOpcode();
+  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    // If not all of the instructions are identical then we have to scalarize.
+    if (!I || Opcode != I->getOpcode()) return;
+  }
+
+  // Mark instructions with multiple users.
+  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    // Remember to check if all of the users of this instr are vectorized
+    // within our tree.
+    if (I && I->getNumUses() > 1) MultiUserVals.insert(I);
+  }
+
+  for (int i = 0, e = VL.size(); i < e; ++i) {
+    // Check that the instruction is only used within
+    // one lane.
+    if (LaneMap.count(VL[i]) && LaneMap[VL[i]] != i) return;
+    // Make this instruction as 'seen' and remember the lane.
+    LaneMap[VL[i]] = i;
+  }
+
+  switch (Opcode) {
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::FPExt:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::SIToFP:
+    case Instruction::UIToFP:
+    case Instruction::Trunc:
+    case Instruction::FPTrunc:
+    case Instruction::BitCast:
+    case Instruction::Add:
+    case Instruction::FAdd:
+    case Instruction::Sub:
+    case Instruction::FSub:
+    case Instruction::Mul:
+    case Instruction::FMul:
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::FDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor: {
+      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+        ValueList Operands;
+        // Prepare the operand vector.
+        for (unsigned j = 0; j < VL.size(); ++j)
+          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+        getTreeUses_rec(Operands, Depth+1);
+      }
+      return;
+    }
+    case Instruction::Store: {
+      ValueList Operands;
+      for (unsigned j = 0; j < VL.size(); ++j)
+        Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
+      getTreeUses_rec(Operands, Depth+1);
+      return;
+    }
+    default:
+    return;
+  }
+}
+
+int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) {
+  Type *ScalarTy = VL[0]->getType();
+
+  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+    ScalarTy = SI->getValueOperand()->getType();
+
+  /// Don't mess with vectors.
+  if (ScalarTy->isVectorTy()) return max_cost;
+  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+
+  if (Depth == RecursionMaxDepth) return getScalarizationCost(VecTy);
+
+  // Check if all of the operands are constants.
+  bool AllConst = true;
+  bool AllSameScalar = true;
+  bool MustScalarizeFlag = false;
+  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+    AllConst &= isa<Constant>(VL[i]);
+    AllSameScalar &= (VL[0] == VL[i]);
+    // Must have a single use.
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    MustScalarizeFlag |= MustScalarize.count(VL[i]);
+    // This instruction is outside the basic block.
+    if (I && I->getParent() != BB)
+      return getScalarizationCost(VecTy);
+  }
+
+  // Is this a simple vector constant.
+  if (AllConst) return 0;
+
+  // If all of the operands are identical we can broadcast them.
+  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
+  if (AllSameScalar) {
+    // If we are in a loop, and this is not an instruction (e.g. constant or
+    // argument) or the instruction is defined outside the loop then assume
+    // that the cost is zero.
+    if (L && (!VL0 || !L->contains(VL0)))
+      return 0;
+
+    // We need to broadcast the scalar.
+    return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
+  }
+
+  // If this is not a constant, or a scalar from outside the loop then we
+  // need to scalarize it.
+  if (MustScalarizeFlag)
+    return getScalarizationCost(VecTy);
+
+  if (!VL0) return getScalarizationCost(VecTy);
+  assert(VL0->getParent() == BB && "Wrong BB");
+
+  unsigned Opcode = VL0->getOpcode();
+  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    // If not all of the instructions are identical then we have to scalarize.
+    if (!I || Opcode != I->getOpcode()) return getScalarizationCost(VecTy);
+  }
+
+  // Check if it is safe to sink the loads or the stores.
+  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
+    int MaxIdx = InstrIdx[VL0];
+    for (unsigned i = 1, e = VL.size(); i < e; ++i )
+      MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]);
+
+    Instruction *Last = InstrVec[MaxIdx];
+    for (unsigned i = 0, e = VL.size(); i < e; ++i ) {
+      if (VL[i] == Last) continue;
+      Value *Barrier = isUnsafeToSink(cast<Instruction>(VL[i]), Last);
+      if (Barrier) {
+        DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " <<
+              *Last << "\n because of " << *Barrier << "\n");
+        return max_cost;
+      }
+    }
+  }
+
+  switch (Opcode) {
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::FPExt:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::SIToFP:
+  case Instruction::UIToFP:
+  case Instruction::Trunc:
+  case Instruction::FPTrunc:
+  case Instruction::BitCast: {
+    int Cost = 0;
+    ValueList Operands;
+    Type *SrcTy = VL0->getOperand(0)->getType();
+    // Prepare the operand vector.
+    for (unsigned j = 0; j < VL.size(); ++j) {
+      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
+      // Check that the casted type is the same for all users.
+      if (cast<Instruction>(VL[j])->getOperand(0)->getType() != SrcTy)
+        return getScalarizationCost(VecTy);
+    }
+
+    Cost += getTreeCost_rec(Operands, Depth+1);
+    if (Cost >= max_cost) return max_cost;
+
+    // Calculate the cost of this instruction.
+    int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
+                                                       VL0->getType(), SrcTy);
+
+    VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
+    int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
+    Cost += (VecCost - ScalarCost);
+    return Cost;
+  }
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    int Cost = 0;
+    // Calculate the cost of all of the operands.
+    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+      ValueList Operands;
+      // Prepare the operand vector.
+      for (unsigned j = 0; j < VL.size(); ++j)
+        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+      Cost += getTreeCost_rec(Operands, Depth+1);
+      if (Cost >= max_cost) return max_cost;
+    }
+
+    // Calculate the cost of this instruction.
+    int ScalarCost = VecTy->getNumElements() *
+      TTI->getArithmeticInstrCost(Opcode, ScalarTy);
+
+    int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy);
+    Cost += (VecCost - ScalarCost);
+    return Cost;
+  }
+  case Instruction::Load: {
+    // If we are scalarize the loads, add the cost of forming the vector.
+    for (unsigned i = 0, e = VL.size()-1; i < e; ++i)
+      if (!isConsecutiveAccess(VL[i], VL[i+1]))
+        return getScalarizationCost(VecTy);
+
+    // Cost of wide load - cost of scalar loads.
+    int ScalarLdCost = VecTy->getNumElements() *
+      TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
+    int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
+    return VecLdCost - ScalarLdCost;
+  }
+  case Instruction::Store: {
+    // We know that we can merge the stores. Calculate the cost.
+    int ScalarStCost = VecTy->getNumElements() *
+      TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
+    int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1,0);
+    int StoreCost = VecStCost - ScalarStCost;
+
+    ValueList Operands;
+    for (unsigned j = 0; j < VL.size(); ++j) {
+      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
+      MemBarrierIgnoreList.insert(VL[j]);
+    }
+
+    int TotalCost = StoreCost + getTreeCost_rec(Operands, Depth + 1);
+    return TotalCost;
+  }
+  default:
+    // Unable to vectorize unknown instructions.
+    return getScalarizationCost(VecTy);
+  }
+}
+
+Instruction *BoUpSLP::GetLastInstr(ArrayRef<Value *> VL, unsigned VF) {
+  int MaxIdx = InstrIdx[BB->getFirstNonPHI()];
+  for (unsigned i = 0; i < VF; ++i )
+    MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]);
+  return InstrVec[MaxIdx + 1];
+}
+
+Value *BoUpSLP::Scalarize(ArrayRef<Value *> VL, VectorType *Ty) {
+  IRBuilder<> Builder(GetLastInstr(VL, Ty->getNumElements()));
+  Value *Vec = UndefValue::get(Ty);
+  for (unsigned i=0; i < Ty->getNumElements(); ++i) {
+    // Generate the 'InsertElement' instruction.
+    Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
+    // Remember that this instruction is used as part of a 'gather' sequence.
+    // The caller of the bottom-up slp vectorizer can try to hoist the sequence
+    // if the users are outside of the basic block.
+    GatherInstructions.push_back(Vec);
+  }
+
+  return Vec;
+}
+
+Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int VF) {
+  Value *V = vectorizeTree_rec(VL, VF);
+  // We moved some instructions around. We have to number them again
+  // before we can do any analysis.
+  numberInstructions();
+  MustScalarize.clear();
+  return V;
+}
+
+Value *BoUpSLP::vectorizeTree_rec(ArrayRef<Value *> VL, int VF) {
+  Type *ScalarTy = VL[0]->getType();
+  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+    ScalarTy = SI->getValueOperand()->getType();
+  VectorType *VecTy = VectorType::get(ScalarTy, VF);
+
+  // Check if all of the operands are constants or identical.
+  bool AllConst = true;
+  bool AllSameScalar = true;
+  for (unsigned i = 0, e = VF; i < e; ++i) {
+    AllConst &= isa<Constant>(VL[i]);
+    AllSameScalar &= (VL[0] == VL[i]);
+    // The instruction must be in the same BB, and it must be vectorizable.
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    if (MustScalarize.count(VL[i]) || (I && I->getParent() != BB))
+      return Scalarize(VL, VecTy);
+  }
+
+  // Check that this is a simple vector constant.
+  if (AllConst || AllSameScalar) return Scalarize(VL, VecTy);
+
+  // Scalarize unknown structures.
+  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
+  if (!VL0) return Scalarize(VL, VecTy);
+
+  if (VectorizedValues.count(VL0)) return VectorizedValues[VL0];
+
+  unsigned Opcode = VL0->getOpcode();
+  for (unsigned i = 0, e = VF; i < e; ++i) {
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    // If not all of the instructions are identical then we have to scalarize.
+    if (!I || Opcode != I->getOpcode()) return Scalarize(VL, VecTy);
+  }
+
+  switch (Opcode) {
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::FPExt:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::SIToFP:
+  case Instruction::UIToFP:
+  case Instruction::Trunc:
+  case Instruction::FPTrunc:
+  case Instruction::BitCast: {
+    ValueList INVL;
+    for (int i = 0; i < VF; ++i)
+      INVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
+    Value *InVec = vectorizeTree_rec(INVL, VF);
+    IRBuilder<> Builder(GetLastInstr(VL, VF));
+    CastInst *CI = dyn_cast<CastInst>(VL0);
+    Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
+    VectorizedValues[VL0] = V;
+    return V;
+  }
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    ValueList LHSVL, RHSVL;
+    for (int i = 0; i < VF; ++i) {
+      RHSVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
+      LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1));
+    }
+
+    Value *RHS = vectorizeTree_rec(RHSVL, VF);
+    Value *LHS = vectorizeTree_rec(LHSVL, VF);
+    IRBuilder<> Builder(GetLastInstr(VL, VF));
+    BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
+    Value *V = Builder.CreateBinOp(BinOp->getOpcode(), RHS,LHS);
+    VectorizedValues[VL0] = V;
+    return V;
+  }
+  case Instruction::Load: {
+    LoadInst *LI = cast<LoadInst>(VL0);
+    unsigned Alignment = LI->getAlignment();
+
+    // Check if all of the loads are consecutive.
+    for (unsigned i = 1, e = VF; i < e; ++i)
+      if (!isConsecutiveAccess(VL[i-1], VL[i]))
+        return Scalarize(VL, VecTy);
+
+    IRBuilder<> Builder(GetLastInstr(VL, VF));
+    Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
+                                          VecTy->getPointerTo());
+    LI = Builder.CreateLoad(VecPtr);
+    LI->setAlignment(Alignment);
+    VectorizedValues[VL0] = LI;
+    return LI;
+  }
+  case Instruction::Store: {
+    StoreInst *SI = cast<StoreInst>(VL0);
+    unsigned Alignment = SI->getAlignment();
+
+    ValueList ValueOp;
+    for (int i = 0; i < VF; ++i)
+      ValueOp.push_back(cast<StoreInst>(VL[i])->getValueOperand());
+
+    Value *VecValue = vectorizeTree_rec(ValueOp, VF);
+
+    IRBuilder<> Builder(GetLastInstr(VL, VF));
+    Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
+                                          VecTy->getPointerTo());
+    Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment);
+
+    for (int i = 0; i < VF; ++i)
+      cast<Instruction>(VL[i])->eraseFromParent();
+    return 0;
+  }
+  default:
+    Value *S = Scalarize(VL, VecTy);
+    VectorizedValues[VL0] = S;
+    return S;
+  }
+}
+
+} // end of namespace
diff --git a/lib/Transforms/Vectorize/VecUtils.h b/lib/Transforms/Vectorize/VecUtils.h
new file mode 100644
index 0000000..5456c6c
--- /dev/null
+++ b/lib/Transforms/Vectorize/VecUtils.h
@@ -0,0 +1,164 @@
+//===- VecUtils.h - Vectorization Utilities -------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This family of classes and functions manipulate vectors and chains of
+// vectors.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H
+#define LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include <vector>
+
+namespace llvm {
+
+class BasicBlock; class Instruction; class Type;
+class VectorType; class StoreInst; class Value;
+class ScalarEvolution; class DataLayout;
+class TargetTransformInfo; class AliasAnalysis;
+class Loop;
+
+/// Bottom Up SLP vectorization utility class.
+struct BoUpSLP  {
+  typedef SmallVector<Value*, 8> ValueList;
+  typedef SmallPtrSet<Value*, 16> ValueSet;
+  typedef SmallVector<StoreInst*, 8> StoreList;
+  static const int max_cost = 1<<20;
+
+  // \brief C'tor.
+  BoUpSLP(BasicBlock *Bb, ScalarEvolution *Se, DataLayout *Dl,
+         TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp);
+
+  /// \brief Take the pointer operand from the Load/Store instruction.
+  /// \returns NULL if this is not a valid Load/Store instruction.
+  static Value *getPointerOperand(Value *I);
+
+  /// \brief Take the address space operand from the Load/Store instruction.
+  /// \returns -1 if this is not a valid Load/Store instruction.
+  static unsigned getAddressSpaceOperand(Value *I);
+
+  /// \returns true if the memory operations A and B are consecutive.
+  bool isConsecutiveAccess(Value *A, Value *B);
+
+  /// \brief Vectorize the tree that starts with the elements in \p VL.
+  /// \returns the vectorized value.
+  Value *vectorizeTree(ArrayRef<Value *> VL, int VF);
+
+  /// \returns the vectorization cost of the subtree that starts at \p VL.
+  /// A negative number means that this is profitable.
+  int getTreeCost(ArrayRef<Value *> VL);
+
+  /// \returns the scalarization cost for this list of values. Assuming that
+  /// this subtree gets vectorized, we may need to extract the values from the
+  /// roots. This method calculates the cost of extracting the values.
+  int getScalarizationCost(ArrayRef<Value *> VL);
+
+  /// \brief Attempts to order and vectorize a sequence of stores. This
+  /// function does a quadratic scan of the given stores.
+  /// \returns true if the basic block was modified.
+  bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold);
+
+  /// \brief Vectorize a group of scalars into a vector tree.
+  void vectorizeArith(ArrayRef<Value *> Operands);
+
+  /// \returns the list of new instructions that were added in order to collect
+  /// scalars into vectors. This list can be used to further optimize the gather
+  /// sequences.
+  ValueList &getGatherSeqInstructions() {return GatherInstructions; }
+
+private:
+  /// \brief This method contains the recursive part of getTreeCost.
+  int getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth);
+
+  /// \brief This recursive method looks for vectorization hazards such as
+  /// values that are used by multiple users and checks that values are used
+  /// by only one vector lane. It updates the variables LaneMap, MultiUserVals.
+  void getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth);
+
+  /// \brief This method contains the recursive part of vectorizeTree.
+  Value *vectorizeTree_rec(ArrayRef<Value *> VL, int VF);
+
+  /// \brief Number all of the instructions in the block.
+  void numberInstructions();
+
+  ///  \brief Vectorize a sorted sequence of stores.
+  bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold);
+
+  /// \returns the scalarization cost for this type. Scalarization in this
+  /// context means the creation of vectors from a group of scalars.
+  int getScalarizationCost(Type *Ty);
+
+  /// \returns the AA location that is being access by the instruction.
+  AliasAnalysis::Location getLocation(Instruction *I);
+
+  /// \brief Checks if it is possible to sink an instruction from
+  /// \p Src to \p Dst.
+  /// \returns the pointer to the barrier instruction if we can't sink.
+  Value *isUnsafeToSink(Instruction *Src, Instruction *Dst);
+
+  /// \returns the instruction that appears last in the BB from \p VL.
+  /// Only consider the first \p VF elements.
+  Instruction *GetLastInstr(ArrayRef<Value *> VL, unsigned VF);
+
+  /// \returns a vector from a collection of scalars in \p VL.
+  Value *Scalarize(ArrayRef<Value *> VL, VectorType *Ty);
+
+private:
+  /// Maps instructions to numbers and back.
+  SmallDenseMap<Value*, int> InstrIdx;
+  /// Maps integers to Instructions.
+  std::vector<Instruction*> InstrVec;
+
+  // -- containers that are used during getTreeCost -- //
+
+  /// Contains values that must be scalarized because they are used
+  /// by multiple lanes, or by users outside the tree.
+  /// NOTICE: The vectorization methods also use this set.
+  ValueSet MustScalarize;
+
+  /// Contains a list of values that are used outside the current tree. This
+  /// set must be reset between runs.
+  ValueSet MultiUserVals;
+  /// Maps values in the tree to the vector lanes that uses them. This map must
+  /// be reset between runs of getCost.
+  std::map<Value*, int> LaneMap;
+  /// A list of instructions to ignore while sinking
+  /// memory instructions. This map must be reset between runs of getCost.
+  SmallPtrSet<Value *, 8> MemBarrierIgnoreList;
+
+  // -- Containers that are used during vectorizeTree -- //
+
+  /// Maps between the first scalar to the vector. This map must be reset
+  ///between runs.
+  DenseMap<Value*, Value*> VectorizedValues;
+
+  // -- Containers that are used after vectorization by the caller -- //
+
+  /// A list of instructions that are used when gathering scalars into vectors.
+  /// In many cases these instructions can be hoisted outside of the BB.
+  /// Iterating over this list is faster than calling LICM.
+  ValueList GatherInstructions;
+
+  // Analysis and block reference.
+  BasicBlock *BB;
+  ScalarEvolution *SE;
+  DataLayout *DL;
+  TargetTransformInfo *TTI;
+  AliasAnalysis *AA;
+  Loop *L;
+};
+
+} // end of namespace
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H
diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp
index 19eefd2..a927fe1 100644
--- a/lib/Transforms/Vectorize/Vectorize.cpp
+++ b/lib/Transforms/Vectorize/Vectorize.cpp
@@ -1,4 +1,4 @@
-   //===-- Vectorize.cpp -----------------------------------------------------===//
+//===-- Vectorize.cpp -----------------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -28,6 +28,7 @@ using namespace llvm;
 void llvm::initializeVectorization(PassRegistry &Registry) {
   initializeBBVectorizePass(Registry);
   initializeLoopVectorizePass(Registry);
+  initializeSLPVectorizerPass(Registry);
 }
 
 void LLVMInitializeVectorization(LLVMPassRegistryRef R) {
@@ -41,3 +42,7 @@ void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) {
 void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createLoopVectorizePass());
 }
+
+void LLVMAddSLPVectorizePass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createSLPVectorizerPass());
+}