Diffstat (limited to 'lib/Transforms')
37 files changed, 1498 insertions, 540 deletions
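Most of the hunks below make one mechanical change: the allocation and dead-code helpers in llvm/Analysis/MemoryBuiltins.h (isAllocationFn, isAllocLikeFn, isMallocLikeFn, isFreeCall, extractMallocCall, getMallocAllocatedType, getMallocArraySize, getObjectSize) and in llvm/Transforms/Utils/Local.h (isInstructionTriviallyDead, RecursivelyDeleteTriviallyDeadInstructions, DeleteDeadPHIs, SimplifyInstructionsInBlock) now take a const TargetLibraryInfo *, so that calls such as malloc and free are treated as builtins only when the target library actually provides them (e.g. not under -fno-builtin). A minimal sketch of a pass consuming the new signatures; "ExampleDCE" is illustrative and not part of this commit:

    #include "llvm/Function.h"
    #include "llvm/Pass.h"
    #include "llvm/Target/TargetLibraryInfo.h"
    #include "llvm/Transforms/Utils/Local.h"
    using namespace llvm;

    namespace {
    struct ExampleDCE : public FunctionPass {
      static char ID;
      ExampleDCE() : FunctionPass(ID) {}

      virtual bool runOnFunction(Function &F) {
        // TLI may be absent; the helpers accept a null pointer and then
        // decline to treat any call as a known library function.
        const TargetLibraryInfo *TLI =
            getAnalysisIfAvailable<TargetLibraryInfo>();
        bool Changed = false;
        for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
          for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ) {
            Instruction *Inst = I++;
            // With a non-null TLI, an unused call to a library allocator
            // such as malloc can also be recognized as trivially dead.
            if (isInstructionTriviallyDead(Inst, TLI)) {
              Inst->eraseFromParent();
              Changed = true;
            }
          }
        return Changed;
      }
    };
    }
    char ExampleDCE::ID = 0;

Passes that cannot run without the analysis (BoundsChecking below) instead declare AU.addRequired<TargetLibraryInfo>() and use getAnalysis<TargetLibraryInfo>().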
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index 6d950d2..b888e95 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -346,7 +346,7 @@ static bool isLeakCheckerRoot(GlobalVariable *GV) { /// Given a value that is stored to a global but never read, determine whether /// it's safe to remove the store and the chain of computation that feeds the /// store. -static bool IsSafeComputationToRemove(Value *V) { +static bool IsSafeComputationToRemove(Value *V, const TargetLibraryInfo *TLI) { do { if (isa<Constant>(V)) return true; @@ -355,7 +355,7 @@ static bool IsSafeComputationToRemove(Value *V) { if (isa<LoadInst>(V) || isa<InvokeInst>(V) || isa<Argument>(V) || isa<GlobalValue>(V)) return false; - if (isAllocationFn(V)) + if (isAllocationFn(V, TLI)) return true; Instruction *I = cast<Instruction>(V); @@ -376,7 +376,8 @@ static bool IsSafeComputationToRemove(Value *V) { /// of the global and clean up any that obviously don't assign the global a /// value that isn't dynamically allocated. /// -static bool CleanupPointerRootUsers(GlobalVariable *GV) { +static bool CleanupPointerRootUsers(GlobalVariable *GV, + const TargetLibraryInfo *TLI) { // A brief explanation of leak checkers. The goal is to find bugs where // pointers are forgotten, causing an accumulating growth in memory // usage over time. The common strategy for leak checkers is to whitelist the @@ -432,18 +433,18 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV) { C->destroyConstant(); // This could have invalidated UI, start over from scratch. Dead.clear(); - CleanupPointerRootUsers(GV); + CleanupPointerRootUsers(GV, TLI); return true; } } } for (int i = 0, e = Dead.size(); i != e; ++i) { - if (IsSafeComputationToRemove(Dead[i].first)) { + if (IsSafeComputationToRemove(Dead[i].first, TLI)) { Dead[i].second->eraseFromParent(); Instruction *I = Dead[i].first; do { - if (isAllocationFn(I)) + if (isAllocationFn(I, TLI)) break; Instruction *J = dyn_cast<Instruction>(I->getOperand(0)); if (!J) @@ -975,7 +976,7 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV, // nor is the global. if (AllNonStoreUsesGone) { if (isLeakCheckerRoot(GV)) { - Changed |= CleanupPointerRootUsers(GV); + Changed |= CleanupPointerRootUsers(GV, TLI); } else { Changed = true; CleanupConstantGlobalUsers(GV, 0, TD, TLI); @@ -1465,9 +1466,10 @@ static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load, /// PerformHeapAllocSRoA - CI is an allocation of an array of structures. Break /// it up into multiple allocations of arrays of the fields. static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, - Value *NElems, TargetData *TD) { + Value *NElems, TargetData *TD, + const TargetLibraryInfo *TLI) { DEBUG(dbgs() << "SROA HEAP ALLOC: " << *GV << " MALLOC = " << *CI << '\n'); - Type *MAT = getMallocAllocatedType(CI); + Type *MAT = getMallocAllocatedType(CI, TLI); StructType *STy = cast<StructType>(MAT); // There is guaranteed to be at least one use of the malloc (storing @@ -1688,7 +1690,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, // This eliminates dynamic allocation, avoids an indirection accessing the // data, and exposes the resultant global to further GlobalOpt. // We cannot optimize the malloc if we cannot determine malloc array size. 
- Value *NElems = getMallocArraySize(CI, TD, true); + Value *NElems = getMallocArraySize(CI, TD, TLI, true); if (!NElems) return false; @@ -1725,7 +1727,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, // If this is a fixed size array, transform the Malloc to be an alloc of // structs. malloc [100 x struct],1 -> malloc struct, 100 - if (ArrayType *AT = dyn_cast<ArrayType>(getMallocAllocatedType(CI))) { + if (ArrayType *AT = dyn_cast<ArrayType>(getMallocAllocatedType(CI, TLI))) { Type *IntPtrTy = TD->getIntPtrType(CI->getContext()); unsigned TypeSize = TD->getStructLayout(AllocSTy)->getSizeInBytes(); Value *AllocSize = ConstantInt::get(IntPtrTy, TypeSize); @@ -1742,7 +1744,8 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CI = cast<CallInst>(Malloc); } - GVI = PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, TD, true), TD); + GVI = PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, TD, TLI, true), + TD, TLI); return true; } @@ -1771,8 +1774,8 @@ static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, // Optimize away any trapping uses of the loaded value. if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC, TD, TLI)) return true; - } else if (CallInst *CI = extractMallocCall(StoredOnceVal)) { - Type *MallocType = getMallocAllocatedType(CI); + } else if (CallInst *CI = extractMallocCall(StoredOnceVal, TLI)) { + Type *MallocType = getMallocAllocatedType(CI, TLI); if (MallocType && TryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType, Ordering, GVI, TD, TLI)) @@ -1964,7 +1967,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, bool Changed; if (isLeakCheckerRoot(GV)) { // Delete any constant stores to the global. - Changed = CleanupPointerRootUsers(GV); + Changed = CleanupPointerRootUsers(GV, TLI); } else { // Delete any stores we can find to the global. We may not be able to // make it completely dead though. diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp index 712888a..69a22fb 100644 --- a/lib/Transforms/IPO/Inliner.cpp +++ b/lib/Transforms/IPO/Inliner.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/IPO/InlinerPass.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" @@ -339,6 +340,7 @@ static bool InlineHistoryIncludes(Function *F, int InlineHistoryID, bool Inliner::runOnSCC(CallGraphSCC &SCC) { CallGraph &CG = getAnalysis<CallGraph>(); const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + const TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); SmallPtrSet<Function*, 8> SCCFunctions; DEBUG(dbgs() << "Inliner visiting SCC:"); @@ -417,7 +419,7 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) { // just delete the call instead of trying to inline it, regardless of // size. This happens because IPSCCP propagates the result out of the // call and then we're left with the dead call. - if (isInstructionTriviallyDead(CS.getInstruction())) { + if (isInstructionTriviallyDead(CS.getInstruction(), TLI)) { DEBUG(dbgs() << " -> Deleting dead call: " << *CS.getInstruction() << "\n"); // Update the call graph by deleting the edge from Callee to Caller. 
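The InstCombineMulDivRem.cpp hunk below adds the fold (x lshr C1) udiv C2 --> x udiv (C2 << C1). It rests on the identity floor(floor(x / 2^C1) / C2) == floor(x / (C2 * 2^C1)) for unsigned x, with the caveat that the folded constant C2 << C1 must still fit in the type. A spot-check of one instance (C1 = 2, C2 = 3), written as a standalone test rather than taken from the patch:

    #include <cassert>

    int main() {
      // (x lshr 2) udiv 3 must equal x udiv 12 for every unsigned x.
      for (unsigned x = 0; x < 1000000; ++x)
        assert((x >> 2) / 3 == x / 12);
      return 0;
    }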
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index cbe1ca4..b12fc01 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -168,7 +168,7 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) { /// the heavy lifting. /// Instruction *InstCombiner::visitCallInst(CallInst &CI) { - if (isFreeCall(&CI)) + if (isFreeCall(&CI, TLI)) return visitFree(CI); // If the caller function is nounwind, mark the call as nounwind, even if the @@ -243,7 +243,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { default: break; case Intrinsic::objectsize: { uint64_t Size; - if (getObjectSize(II->getArgOperand(0), Size, TD)) + if (getObjectSize(II->getArgOperand(0), Size, TD, TLI)) return ReplaceInstUsesWith(CI, ConstantInt::get(CI.getType(), Size)); return 0; } @@ -877,7 +877,7 @@ static IntrinsicInst *FindInitTrampoline(Value *Callee) { // visitCallSite - Improvements for call and invoke instructions. // Instruction *InstCombiner::visitCallSite(CallSite CS) { - if (isAllocLikeFn(CS.getInstruction())) + if (isAllocLikeFn(CS.getInstruction(), TLI)) return visitAllocSite(*CS.getInstruction()); bool Changed = false; diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index 35a0bbb..2a7182f 100644 --- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -462,6 +462,16 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) { } } + // (x lshr C1) udiv C2 --> x udiv (C2 << C1) + if (ConstantInt *C2 = dyn_cast<ConstantInt>(Op1)) { + Value *X; + ConstantInt *C1; + if (match(Op0, m_LShr(m_Value(X), m_ConstantInt(C1)))) { + APInt NC = C2->getValue().shl(C1->getLimitedValue(C1->getBitWidth()-1)); + return BinaryOperator::CreateUDiv(X, Builder->getInt(NC)); + } + } + // X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2) { const APInt *CI; Value *N; if (match(Op1, m_Shl(m_Power2(CI), m_Value(N))) || diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 68ecd51..ff758c4 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1068,7 +1068,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // If the bitcast is of an allocation, and the allocation will be // converted to match the type of the cast, don't touch this. if (isa<AllocaInst>(BCI->getOperand(0)) || - isAllocationFn(BCI->getOperand(0))) { + isAllocationFn(BCI->getOperand(0), TLI)) { // See if the bitcast simplifies, if so, don't nuke this GEP yet. if (Instruction *I = visitBitCast(*BCI)) { if (I != BCI) { @@ -1107,7 +1107,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { static bool -isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakVH> &Users) { +isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakVH> &Users, + const TargetLibraryInfo *TLI) { SmallVector<Instruction*, 4> Worklist; Worklist.push_back(AI); @@ -1163,7 +1164,7 @@ isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakVH> &Users) { } } - if (isFreeCall(I)) { + if (isFreeCall(I, TLI)) { Users.push_back(I); continue; } @@ -1188,7 +1189,7 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) { // to null and free calls, delete the calls and replace the comparisons with // true or false as appropriate. 
SmallVector<WeakVH, 64> Users; - if (isAllocSiteRemovable(&MI, Users)) { + if (isAllocSiteRemovable(&MI, Users, TLI)) { for (unsigned i = 0, e = Users.size(); i != e; ++i) { Instruction *I = cast_or_null<Instruction>(&*Users[i]); if (!I) continue; @@ -1872,7 +1873,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, Instruction *Inst = BBI++; // DCE instruction if trivially dead. - if (isInstructionTriviallyDead(Inst)) { + if (isInstructionTriviallyDead(Inst, TLI)) { ++NumDeadInst; DEBUG(errs() << "IC: DCE: " << *Inst << '\n'); Inst->eraseFromParent(); @@ -2002,7 +2003,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { if (I == 0) continue; // skip null values. // Check to see if we can DCE the instruction. - if (isInstructionTriviallyDead(I)) { + if (isInstructionTriviallyDead(I, TLI)) { DEBUG(errs() << "IC: DCE: " << *I << '\n'); EraseInstFromFunction(*I); ++NumDeadInst; @@ -2102,7 +2103,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { // If the instruction was modified, it's possible that it is now dead. // if so, remove it. - if (isInstructionTriviallyDead(I)) { + if (isInstructionTriviallyDead(I, TLI)) { EraseInstFromFunction(*I); } else { Worklist.Add(I); diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 06f4d2f..0775cf4 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -15,7 +15,7 @@ #define DEBUG_TYPE "asan" -#include "FunctionBlackList.h" +#include "BlackList.h" #include "llvm/Function.h" #include "llvm/IRBuilder.h" #include "llvm/InlineAsm.h" @@ -217,7 +217,7 @@ struct AddressSanitizer : public ModulePass { Function *AsanCtorFunction; Function *AsanInitFunction; Instruction *CtorInsertBefore; - OwningPtr<FunctionBlackList> BL; + OwningPtr<BlackList> BL; // This array is indexed by AccessIsWrite and log2(AccessSize). Function *AsanErrorCallback[2][kNumberOfAccessSizes]; InlineAsm *EmptyAsm; @@ -544,6 +544,7 @@ bool AddressSanitizer::ShouldInstrumentGlobal(GlobalVariable *G) { Type *Ty = cast<PointerType>(G->getType())->getElementType(); DEBUG(dbgs() << "GLOBAL: " << *G); + if (BL->isIn(*G)) return false; if (!Ty->isSized()) return false; if (!G->hasInitializer()) return false; // Touch only those globals that will not be defined in other modules. @@ -643,6 +644,8 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) { Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize); // Determine whether this global should be poisoned in initialization. bool GlobalHasDynamicInitializer = HasDynamicInitializer(G); + // Don't check initialization order if this global is blacklisted. + GlobalHasDynamicInitializer &= !BL->isInInit(*G); StructType *NewTy = StructType::get(Ty, RightRedZoneTy, NULL); Constant *NewInitializer = ConstantStruct::get( @@ -736,7 +739,7 @@ bool AddressSanitizer::runOnModule(Module &M) { TD = getAnalysisIfAvailable<TargetData>(); if (!TD) return false; - BL.reset(new FunctionBlackList(ClBlackListFile)); + BL.reset(new BlackList(ClBlackListFile)); C = &(M.getContext()); LongSize = TD->getPointerSizeInBits(); @@ -774,7 +777,7 @@ bool AddressSanitizer::runOnModule(Module &M) { /*hasSideEffects=*/true); llvm::Triple targetTriple(M.getTargetTriple()); - bool isAndroid = targetTriple.getEnvironment() == llvm::Triple::ANDROIDEABI; + bool isAndroid = targetTriple.getEnvironment() == llvm::Triple::Android; MappingOffset = isAndroid ? 
kDefaultShadowOffsetAndroid : (LongSize == 32 ? kDefaultShadowOffset32 : kDefaultShadowOffset64); diff --git a/lib/Transforms/Instrumentation/BlackList.cpp b/lib/Transforms/Instrumentation/BlackList.cpp new file mode 100644 index 0000000..2cb1199 --- /dev/null +++ b/lib/Transforms/Instrumentation/BlackList.cpp @@ -0,0 +1,102 @@ +//===-- BlackList.cpp - blacklist for sanitizers --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This is a utility class for instrumentation passes (like AddressSanitizer +// or ThreadSanitizer) to avoid instrumenting some functions or global +// variables based on a user-supplied blacklist. +// +//===----------------------------------------------------------------------===// + +#include <utility> +#include <string> + +#include "BlackList.h" +#include "llvm/ADT/OwningPtr.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Function.h" +#include "llvm/GlobalVariable.h" +#include "llvm/Module.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Regex.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/system_error.h" + +namespace llvm { + +BlackList::BlackList(const StringRef Path) { + // Validate and open blacklist file. + if (!Path.size()) return; + OwningPtr<MemoryBuffer> File; + if (error_code EC = MemoryBuffer::getFile(Path, File)) { + report_fatal_error("Can't open blacklist file: " + Path + ": " + + EC.message()); + } + + // Iterate through each line in the blacklist file. + SmallVector<StringRef, 16> Lines; + SplitString(File.take()->getBuffer(), Lines, "\n\r"); + StringMap<std::string> Regexps; + for (SmallVector<StringRef, 16>::iterator I = Lines.begin(), E = Lines.end(); + I != E; ++I) { + // Get our prefix and unparsed regexp. + std::pair<StringRef, StringRef> SplitLine = I->split(":"); + StringRef Prefix = SplitLine.first; + std::string Regexp = SplitLine.second; + + // Replace * with .* + for (size_t pos = 0; (pos = Regexp.find("*", pos)) != std::string::npos; + pos += strlen(".*")) { + Regexp.replace(pos, strlen("*"), ".*"); + } + + // Check that the regexp is valid. + Regex CheckRE(Regexp); + std::string Error; + if (!CheckRE.isValid(Error)) { + report_fatal_error("malformed blacklist regex: " + SplitLine.second + + ": " + Error); + } + + // Add this regexp into the proper group by its prefix. + if (Regexps[Prefix].size()) + Regexps[Prefix] += "|"; + Regexps[Prefix] += Regexp; + } + + // Iterate through each of the prefixes, and create Regexs for them. + for (StringMap<std::string>::iterator I = Regexps.begin(), E = Regexps.end(); + I != E; ++I) { + Entries[I->getKey()] = new Regex(I->getValue()); + } +} + +bool BlackList::isIn(const Function &F) { + return isIn(*F.getParent()) || inSection("fun", F.getName()); +} + +bool BlackList::isIn(const GlobalVariable &G) { + return isIn(*G.getParent()) || inSection("global", G.getName()); +} + +bool BlackList::isIn(const Module &M) { + return inSection("src", M.getModuleIdentifier()); +} + +bool BlackList::isInInit(const GlobalVariable &G) { + return isIn(*G.getParent()) || inSection("global-init", G.getName()); +} + +bool BlackList::inSection(const StringRef Section, + const StringRef Query) { + Regex *FunctionRegex = Entries[Section]; + return FunctionRegex ? 
FunctionRegex->match(Query) : false; +} + +} // namespace llvm diff --git a/lib/Transforms/Instrumentation/BlackList.h b/lib/Transforms/Instrumentation/BlackList.h new file mode 100644 index 0000000..73977fc --- /dev/null +++ b/lib/Transforms/Instrumentation/BlackList.h @@ -0,0 +1,55 @@ +//===-- BlackList.h - blacklist for sanitizers ------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +//===----------------------------------------------------------------------===// +// +// This is a utility class for instrumentation passes (like AddressSanitizer +// or ThreadSanitizer) to avoid instrumenting some functions or global +// variables based on a user-supplied blacklist. +// +// The blacklist disables instrumentation of various functions and global +// variables. Each line contains a prefix, followed by a wild card expression. +// --- +// fun:*_ZN4base6subtle* +// global:*global_with_bad_access_or_initialization* +// global-init:*global_with_initialization_issues* +// src:file_with_tricky_code.cc +// --- +// Note that the wild card is in fact an llvm::Regex, but * is automatically +// replaced with .* +// This is similar to the "ignore" feature of ThreadSanitizer. +// http://code.google.com/p/data-race-test/wiki/ThreadSanitizerIgnores +// +//===----------------------------------------------------------------------===// +// + +#include "llvm/ADT/StringMap.h" + +namespace llvm { +class Function; +class GlobalVariable; +class Module; +class Regex; +class StringRef; + +class BlackList { + public: + BlackList(const StringRef Path); + // Returns whether either this function or its source file is blacklisted. + bool isIn(const Function &F); + // Returns whether either this global or its source file is blacklisted. + bool isIn(const GlobalVariable &G); + // Returns whether this module is blacklisted by filename. + bool isIn(const Module &M); + // Returns whether a global should be excluded from initialization checking. 
+ bool isInInit(const GlobalVariable &G); + private: + StringMap<Regex*> Entries; + + bool inSection(const StringRef Section, const StringRef Query); +}; + +} // namespace llvm diff --git a/lib/Transforms/Instrumentation/BoundsChecking.cpp b/lib/Transforms/Instrumentation/BoundsChecking.cpp index 09e0f14..6429081 100644 --- a/lib/Transforms/Instrumentation/BoundsChecking.cpp +++ b/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -24,6 +24,7 @@ #include "llvm/Support/TargetFolder.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Instrumentation.h" using namespace llvm; @@ -48,10 +49,12 @@ namespace { virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<TargetData>(); + AU.addRequired<TargetLibraryInfo>(); } private: const TargetData *TD; + const TargetLibraryInfo *TLI; ObjectSizeOffsetEvaluator *ObjSizeEval; BuilderTy *Builder; Instruction *Inst; @@ -166,11 +169,12 @@ bool BoundsChecking::instrument(Value *Ptr, Value *InstVal) { bool BoundsChecking::runOnFunction(Function &F) { TD = &getAnalysis<TargetData>(); + TLI = &getAnalysis<TargetLibraryInfo>(); TrapBB = 0; BuilderTy TheBuilder(F.getContext(), TargetFolder(TD)); Builder = &TheBuilder; - ObjectSizeOffsetEvaluator TheObjSizeEval(TD, F.getContext()); + ObjectSizeOffsetEvaluator TheObjSizeEval(TD, TLI, F.getContext()); ObjSizeEval = &TheObjSizeEval; // check HANDLE_MEMORY_INST in include/llvm/Instruction.def for memory diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt index 00de882..058f68c 100644 --- a/lib/Transforms/Instrumentation/CMakeLists.txt +++ b/lib/Transforms/Instrumentation/CMakeLists.txt @@ -1,8 +1,8 @@ add_llvm_library(LLVMInstrumentation AddressSanitizer.cpp + BlackList.cpp BoundsChecking.cpp EdgeProfiling.cpp - FunctionBlackList.cpp GCOVProfiling.cpp Instrumentation.cpp OptimalEdgeProfiling.cpp diff --git a/lib/Transforms/Instrumentation/FunctionBlackList.cpp b/lib/Transforms/Instrumentation/FunctionBlackList.cpp deleted file mode 100644 index 188ea4d..0000000 --- a/lib/Transforms/Instrumentation/FunctionBlackList.cpp +++ /dev/null @@ -1,79 +0,0 @@ -//===-- FunctionBlackList.cpp - blacklist of functions --------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This is a utility class for instrumentation passes (like AddressSanitizer -// or ThreadSanitizer) to avoid instrumenting some functions based on -// user-supplied blacklist. 
-// -//===----------------------------------------------------------------------===// - -#include "FunctionBlackList.h" -#include "llvm/ADT/OwningPtr.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/Function.h" -#include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/Regex.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Support/system_error.h" - -namespace llvm { - -FunctionBlackList::FunctionBlackList(const std::string &Path) { - Functions = NULL; - const char *kFunPrefix = "fun:"; - if (!Path.size()) return; - std::string Fun; - - OwningPtr<MemoryBuffer> File; - if (error_code EC = MemoryBuffer::getFile(Path.c_str(), File)) { - report_fatal_error("Can't open blacklist file " + Path + ": " + - EC.message()); - } - MemoryBuffer *Buff = File.take(); - const char *Data = Buff->getBufferStart(); - size_t DataLen = Buff->getBufferSize(); - SmallVector<StringRef, 16> Lines; - SplitString(StringRef(Data, DataLen), Lines, "\n\r"); - for (size_t i = 0, numLines = Lines.size(); i < numLines; i++) { - if (Lines[i].startswith(kFunPrefix)) { - std::string ThisFunc = Lines[i].substr(strlen(kFunPrefix)); - std::string ThisFuncRE; - // add ThisFunc replacing * with .* - for (size_t j = 0, n = ThisFunc.size(); j < n; j++) { - if (ThisFunc[j] == '*') - ThisFuncRE += '.'; - ThisFuncRE += ThisFunc[j]; - } - // Check that the regexp is valid. - Regex CheckRE(ThisFuncRE); - std::string Error; - if (!CheckRE.isValid(Error)) - report_fatal_error("malformed blacklist regex: " + ThisFunc + - ": " + Error); - // Append to the final regexp. - if (Fun.size()) - Fun += "|"; - Fun += ThisFuncRE; - } - } - if (Fun.size()) { - Functions = new Regex(Fun); - } -} - -bool FunctionBlackList::isIn(const Function &F) { - if (Functions) { - bool Res = Functions->match(F.getName()); - return Res; - } - return false; -} - -} // namespace llvm diff --git a/lib/Transforms/Instrumentation/FunctionBlackList.h b/lib/Transforms/Instrumentation/FunctionBlackList.h deleted file mode 100644 index c1239b9..0000000 --- a/lib/Transforms/Instrumentation/FunctionBlackList.h +++ /dev/null @@ -1,37 +0,0 @@ -//===-- FunctionBlackList.cpp - blacklist of functions ----------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -//===----------------------------------------------------------------------===// -// -// This is a utility class for instrumentation passes (like AddressSanitizer -// or ThreadSanitizer) to avoid instrumenting some functions based on -// user-supplied blacklist. -// -//===----------------------------------------------------------------------===// -// - -#include <string> - -namespace llvm { -class Function; -class Regex; - -// Blacklisted functions are not instrumented. -// The blacklist file contains one or more lines like this: -// --- -// fun:FunctionWildCard -// --- -// This is similar to the "ignore" feature of ThreadSanitizer. 
-// http://code.google.com/p/data-race-test/wiki/ThreadSanitizerIgnores -class FunctionBlackList { - public: - FunctionBlackList(const std::string &Path); - bool isIn(const Function &F); - private: - Regex *Functions; -}; - -} // namespace llvm diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 264a6a6..9fcde31 100644 --- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -88,11 +88,10 @@ namespace { // Add the function to write out all our counters to the global destructor // list. - void insertCounterWriteout(SmallVector<std::pair<GlobalVariable *, - MDNode *>, 8> &); + void insertCounterWriteout(ArrayRef<std::pair<GlobalVariable*, MDNode*> >); void insertIndirectCounterIncrement(); - std::string mangleName(DICompileUnit CU, std::string NewStem); + std::string mangleName(DICompileUnit CU, const char *NewStem); bool EmitNotes; bool EmitData; @@ -329,7 +328,7 @@ namespace { }; } -std::string GCOVProfiler::mangleName(DICompileUnit CU, std::string NewStem) { +std::string GCOVProfiler::mangleName(DICompileUnit CU, const char *NewStem) { if (NamedMDNode *GCov = M->getNamedMetadata("llvm.gcov")) { for (int i = 0, e = GCov->getNumOperands(); i != e; ++i) { MDNode *N = GCov->getOperand(i); @@ -630,7 +629,7 @@ GlobalVariable *GCOVProfiler::getEdgeStateValue() { } void GCOVProfiler::insertCounterWriteout( - SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> &CountersBySP) { + ArrayRef<std::pair<GlobalVariable *, MDNode *> > CountersBySP) { FunctionType *WriteoutFTy = FunctionType::get(Type::getVoidTy(*Ctx), false); Function *WriteoutF = Function::Create(WriteoutFTy, @@ -652,7 +651,7 @@ void GCOVProfiler::insertCounterWriteout( std::string FilenameGcda = mangleName(compile_unit, "gcda"); Builder.CreateCall(StartFile, Builder.CreateGlobalStringPtr(FilenameGcda)); - for (SmallVector<std::pair<GlobalVariable *, MDNode *>, 8>::iterator + for (ArrayRef<std::pair<GlobalVariable *, MDNode *> >::iterator I = CountersBySP.begin(), E = CountersBySP.end(); I != E; ++I) { DISubprogram SP(I->second); diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index dc0fa71..17b7775 100644 --- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -21,7 +21,7 @@ #define DEBUG_TYPE "tsan" -#include "FunctionBlackList.h" +#include "BlackList.h" #include "llvm/Function.h" #include "llvm/IRBuilder.h" #include "llvm/Intrinsics.h" @@ -50,7 +50,7 @@ static cl::opt<std::string> ClBlackListFile("tsan-blacklist", STATISTIC(NumInstrumentedReads, "Number of instrumented reads"); STATISTIC(NumInstrumentedWrites, "Number of instrumented writes"); -STATISTIC(NumOmittedReadsBeforeWrite, +STATISTIC(NumOmittedReadsBeforeWrite, "Number of reads ignored due to following writes"); STATISTIC(NumAccessesWithBadSize, "Number of accesses with bad size"); STATISTIC(NumInstrumentedVtableWrites, "Number of vtable ptr writes"); @@ -77,7 +77,7 @@ struct ThreadSanitizer : public FunctionPass { int getMemoryAccessFuncIndex(Value *Addr); TargetData *TD; - OwningPtr<FunctionBlackList> BL; + OwningPtr<BlackList> BL; IntegerType *OrdTy; // Callbacks to run-time library are computed in doInitialization. 
Function *TsanFuncEntry; @@ -121,7 +121,7 @@ bool ThreadSanitizer::doInitialization(Module &M) { TD = getAnalysisIfAvailable<TargetData>(); if (!TD) return false; - BL.reset(new FunctionBlackList(ClBlackListFile)); + BL.reset(new BlackList(ClBlackListFile)); // Always insert a call to __tsan_init into the module's CTORs. IRBuilder<> IRB(M.getContext()); @@ -186,7 +186,7 @@ bool ThreadSanitizer::addrPointsToConstantData(Value *Addr) { NumOmittedReadsFromConstantGlobals++; return true; } - } else if(LoadInst *L = dyn_cast<LoadInst>(Addr)) { + } else if (LoadInst *L = dyn_cast<LoadInst>(Addr)) { if (isVtableAccess(L)) { // Reads from a vtable pointer can not race with any writes. NumOmittedReadsFromVtable++; @@ -344,7 +344,7 @@ static ConstantInt *createOrdering(IRBuilder<> *IRB, AtomicOrdering ord) { case NotAtomic: assert(false); case Unordered: // Fall-through. case Monotonic: v = 1 << 0; break; - // case Consume: v = 1 << 1; break; // Not specified yet. + // case Consume: v = 1 << 1; break; // Not specified yet. case Acquire: v = 1 << 2; break; case Release: v = 1 << 3; break; case AcquireRelease: v = 1 << 4; break; diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp index a8deda8..5912107 100644 --- a/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -43,6 +43,7 @@ #include "llvm/Transforms/Utils/AddrModeMatcher.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" +#include "llvm/Transforms/Utils/BypassSlowDivision.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; using namespace llvm::PatternMatch; @@ -148,7 +149,19 @@ bool CodeGenPrepare::runOnFunction(Function &F) { PFI = getAnalysisIfAvailable<ProfileInfo>(); OptSize = F.hasFnAttr(Attribute::OptimizeForSize); - // First pass, eliminate blocks that contain only PHI nodes and an + /// This optimization identifies DIV instructions that can be + /// profitably bypassed and carried out with a shorter, faster divide. + if (TLI && TLI->isSlowDivBypassed()) { + const DenseMap<Type *, Type *> &BypassTypeMap = TLI->getBypassSlowDivTypes(); + + for (Function::iterator I = F.begin(); I != F.end(); I++) { + EverMadeChange |= bypassSlowDivision(F, + I, + BypassTypeMap); + } + } + + // Eliminate blocks that contain only PHI nodes and an // unconditional branch. EverMadeChange |= EliminateMostlyEmptyBlocks(F); @@ -988,7 +1001,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, WeakVH IterHandle(CurInstIterator); BasicBlock *BB = CurInstIterator->getParent(); - RecursivelyDeleteTriviallyDeadInstructions(Repl); + RecursivelyDeleteTriviallyDeadInstructions(Repl, TLInfo); if (IterHandle != CurInstIterator) { // If the iterator instruction was recursively deleted, start over at the @@ -1174,17 +1187,32 @@ static bool isFormingBranchFromSelectProfitable(SelectInst *SI) { } +/// If we have a SelectInst that will likely profit from branch prediction, +/// turn it into a branch. bool CodeGenPrepare::OptimizeSelectInst(SelectInst *SI) { - // If we have a SelectInst that will likely profit from branch prediction, - // turn it into a branch. - if (DisableSelectToBranch || OptSize || !TLI || - !TLI->isPredictableSelectExpensive()) - return false; + bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1); - if (!SI->getCondition()->getType()->isIntegerTy(1) || - !isFormingBranchFromSelectProfitable(SI)) + // Can we convert the 'select' to CF ? 
+ if (DisableSelectToBranch || OptSize || !TLI || VectorCond) return false; + TargetLowering::SelectSupportKind SelectKind; + if (VectorCond) + SelectKind = TargetLowering::VectorMaskSelect; + else if (SI->getType()->isVectorTy()) + SelectKind = TargetLowering::ScalarCondVectorVal; + else + SelectKind = TargetLowering::ScalarValSelect; + + // Do we have efficient codegen support for this kind of 'selects' ? + if (TLI->isSelectSupported(SelectKind)) { + // We have efficient codegen support for the select instruction. + // Check if it is profitable to keep this 'select'. + if (!TLI->isPredictableSelectExpensive() || + !isFormingBranchFromSelectProfitable(SI)) + return false; + } + ModifiedDT = true; // First, we split the block containing the select into 2 blocks. diff --git a/lib/Transforms/Scalar/DCE.cpp b/lib/Transforms/Scalar/DCE.cpp index 8dbcc23..086f0a1 100644 --- a/lib/Transforms/Scalar/DCE.cpp +++ b/lib/Transforms/Scalar/DCE.cpp @@ -22,6 +22,7 @@ #include "llvm/Instruction.h" #include "llvm/Pass.h" #include "llvm/Support/InstIterator.h" +#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/ADT/Statistic.h" using namespace llvm; @@ -38,10 +39,11 @@ namespace { initializeDeadInstEliminationPass(*PassRegistry::getPassRegistry()); } virtual bool runOnBasicBlock(BasicBlock &BB) { + TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); bool Changed = false; for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) { Instruction *Inst = DI++; - if (isInstructionTriviallyDead(Inst)) { + if (isInstructionTriviallyDead(Inst, TLI)) { Inst->eraseFromParent(); Changed = true; ++DIEEliminated; @@ -87,6 +89,8 @@ char DCE::ID = 0; INITIALIZE_PASS(DCE, "dce", "Dead Code Elimination", false, false) bool DCE::runOnFunction(Function &F) { + TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); + // Start out with all of the instructions in the worklist... std::vector<Instruction*> WorkList; for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) @@ -101,7 +105,7 @@ bool DCE::runOnFunction(Function &F) { Instruction *I = WorkList.back(); WorkList.pop_back(); - if (isInstructionTriviallyDead(I)) { // If the instruction is dead. + if (isInstructionTriviallyDead(I, TLI)) { // If the instruction is dead. // Loop over all of the values that the instruction uses, if there are // instructions being used, add them to the worklist, because they might // go dead after this one is removed. 
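For reference, the BlackList class added above (replacing FunctionBlackList) reads one prefix:wildcard pair per line and rewrites each * to the regex .*. Reusing the sample entries from the BlackList.h header comment, an input file looks like:

    fun:*_ZN4base6subtle*
    global:*global_with_bad_access_or_initialization*
    global-init:*global_with_initialization_issues*
    src:file_with_tricky_code.cc

The constructor joins all patterns sharing a prefix into one alternation (the fun line above becomes the single regex .*_ZN4base6subtle.*), and isIn(Function&) first matches the "src" group against the module identifier, then the "fun" group against the function name.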
diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index 8b1283f..1ff4329 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -106,6 +106,7 @@ FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); } /// static void DeleteDeadInstruction(Instruction *I, MemoryDependenceAnalysis &MD, + const TargetLibraryInfo *TLI, SmallSetVector<Value*, 16> *ValueSet = 0) { SmallVector<Instruction*, 32> NowDeadInsts; @@ -130,7 +131,7 @@ static void DeleteDeadInstruction(Instruction *I, if (!Op->use_empty()) continue; if (Instruction *OpI = dyn_cast<Instruction>(Op)) - if (isInstructionTriviallyDead(OpI)) + if (isInstructionTriviallyDead(OpI, TLI)) NowDeadInsts.push_back(OpI); } @@ -276,7 +277,7 @@ static Value *getStoredPointerOperand(Instruction *I) { static uint64_t getPointerSize(const Value *V, AliasAnalysis &AA) { uint64_t Size; - if (getObjectSize(V, Size, AA.getTargetData())) + if (getObjectSize(V, Size, AA.getTargetData(), AA.getTargetLibraryInfo())) return Size; return AliasAnalysis::UnknownSize; } @@ -454,7 +455,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { Instruction *Inst = BBI++; // Handle 'free' calls specially. - if (CallInst *F = isFreeCall(Inst)) { + if (CallInst *F = isFreeCall(Inst, AA->getTargetLibraryInfo())) { MadeChange |= HandleFree(F); continue; } @@ -483,7 +484,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { // in case we need it. WeakVH NextInst(BBI); - DeleteDeadInstruction(SI, *MD); + DeleteDeadInstruction(SI, *MD, AA->getTargetLibraryInfo()); if (NextInst == 0) // Next instruction deleted. BBI = BB.begin(); @@ -530,7 +531,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { << *DepWrite << "\n KILLER: " << *Inst << '\n'); // Delete the store and now-dead instructions that feed it. - DeleteDeadInstruction(DepWrite, *MD); + DeleteDeadInstruction(DepWrite, *MD, AA->getTargetLibraryInfo()); ++NumFastStores; MadeChange = true; @@ -640,7 +641,7 @@ bool DSE::HandleFree(CallInst *F) { Instruction *Next = llvm::next(BasicBlock::iterator(Dependency)); // DCE instructions only used to calculate that store - DeleteDeadInstruction(Dependency, *MD); + DeleteDeadInstruction(Dependency, *MD, AA->getTargetLibraryInfo()); ++NumFastStores; MadeChange = true; @@ -680,7 +681,8 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // Okay, so these are dead heap objects, but if the pointer never escapes // then it's leaked by this function anyways. - else if (isAllocLikeFn(I) && !PointerMayBeCaptured(I, true, true)) + else if (isAllocLikeFn(I, AA->getTargetLibraryInfo()) && + !PointerMayBeCaptured(I, true, true)) DeadStackObjects.insert(I); } @@ -724,7 +726,8 @@ bool DSE::handleEndBlock(BasicBlock &BB) { dbgs() << '\n'); // DCE instructions only used to calculate that store. - DeleteDeadInstruction(Dead, *MD, &DeadStackObjects); + DeleteDeadInstruction(Dead, *MD, AA->getTargetLibraryInfo(), + &DeadStackObjects); ++NumFastStores; MadeChange = true; continue; @@ -732,9 +735,10 @@ bool DSE::handleEndBlock(BasicBlock &BB) { } // Remove any dead non-memory-mutating instructions. 
- if (isInstructionTriviallyDead(BBI)) { + if (isInstructionTriviallyDead(BBI, AA->getTargetLibraryInfo())) { Instruction *Inst = BBI++; - DeleteDeadInstruction(Inst, *MD, &DeadStackObjects); + DeleteDeadInstruction(Inst, *MD, AA->getTargetLibraryInfo(), + &DeadStackObjects); ++NumFastOther; MadeChange = true; continue; @@ -750,7 +754,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { if (CallSite CS = cast<Value>(BBI)) { // Remove allocation function calls from the list of dead stack objects; // there can't be any references before the definition. - if (isAllocLikeFn(BBI)) + if (isAllocLikeFn(BBI, AA->getTargetLibraryInfo())) DeadStackObjects.remove(BBI); // If this call does not access memory, it can't be loading any of our @@ -771,15 +775,15 @@ bool DSE::handleEndBlock(BasicBlock &BB) { LiveAllocas.push_back(*I); } - for (SmallVector<Value*, 8>::iterator I = LiveAllocas.begin(), - E = LiveAllocas.end(); I != E; ++I) - DeadStackObjects.remove(*I); - // If all of the allocas were clobbered by the call then we're not going // to find anything else to process. - if (DeadStackObjects.empty()) + if (DeadStackObjects.size() == LiveAllocas.size()) break; + for (SmallVector<Value*, 8>::iterator I = LiveAllocas.begin(), + E = LiveAllocas.end(); I != E; ++I) + DeadStackObjects.remove(*I); + continue; } diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index 9759549..2627113 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -374,7 +374,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { Instruction *Inst = I++; // Dead instructions should just be removed. - if (isInstructionTriviallyDead(Inst)) { + if (isInstructionTriviallyDead(Inst, TLI)) { DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n'); Inst->eraseFromParent(); Changed = true; diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 4822fd0..16ae6ad 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -271,16 +271,16 @@ void ValueTable::add(Value *V, uint32_t num) { valueNumbering.insert(std::make_pair(V, num)); } -uint32_t ValueTable::lookup_or_add_call(CallInst* C) { +uint32_t ValueTable::lookup_or_add_call(CallInst *C) { if (AA->doesNotAccessMemory(C)) { Expression exp = create_expression(C); - uint32_t& e = expressionNumbering[exp]; + uint32_t &e = expressionNumbering[exp]; if (!e) e = nextValueNumber++; valueNumbering[C] = e; return e; } else if (AA->onlyReadsMemory(C)) { Expression exp = create_expression(C); - uint32_t& e = expressionNumbering[exp]; + uint32_t &e = expressionNumbering[exp]; if (!e) { e = nextValueNumber++; valueNumbering[C] = e; @@ -413,7 +413,7 @@ uint32_t ValueTable::lookup_or_add(Value *V) { case Instruction::LShr: case Instruction::AShr: case Instruction::And: - case Instruction::Or : + case Instruction::Or: case Instruction::Xor: case Instruction::ICmp: case Instruction::FCmp: @@ -632,6 +632,7 @@ INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(GVN, "gvn", "Global Value Numbering", false, false) +#ifndef NDEBUG void GVN::dump(DenseMap<uint32_t, Value*>& d) { errs() << "{\n"; for (DenseMap<uint32_t, Value*>::iterator I = d.begin(), @@ -641,6 +642,7 @@ void GVN::dump(DenseMap<uint32_t, Value*>& d) { } errs() << "}\n"; } +#endif /// IsValueFullyAvailableInBlock - Return true if we can prove that the value /// we're analyzing is fully available in the specified block. 
As we go, keep @@ -1436,7 +1438,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { Instruction *DepInst = DepInfo.getInst(); // Loading the allocation -> undef. - if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst) || + if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) || // Loading immediately after lifetime begin -> undef. isLifetimeStart(DepInst)) { ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB, @@ -1951,7 +1953,7 @@ bool GVN::processLoad(LoadInst *L) { // If this load really doesn't depend on anything, then we must be loading an // undef value. This can happen when loading for a fresh allocation with no // intervening stores, for example. - if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst)) { + if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI)) { L->replaceAllUsesWith(UndefValue::get(L->getType())); markInstructionForDeletion(L); ++NumGVNLoad; @@ -2231,12 +2233,20 @@ bool GVN::processInstruction(Instruction *I) { Value *SwitchCond = SI->getCondition(); BasicBlock *Parent = SI->getParent(); bool Changed = false; + + // Remember how many outgoing edges there are to every successor. + SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges; + for (unsigned i = 0, n = SI->getNumSuccessors(); i != n; ++i) + ++SwitchEdges[SI->getSuccessor(i)]; + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) { BasicBlock *Dst = i.getCaseSuccessor(); - BasicBlockEdge E(Parent, Dst); - if (E.isSingleEdge()) + // If there is only a single edge, propagate the case value into it. + if (SwitchEdges.lookup(Dst) == 1) { + BasicBlockEdge E(Parent, Dst); Changed |= propagateEquality(SwitchCond, i.getCaseValue(), E); + } } return Changed; } diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 37f8bdf..c933a17 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -44,6 +44,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" #include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -68,6 +69,7 @@ namespace { ScalarEvolution *SE; DominatorTree *DT; TargetData *TD; + TargetLibraryInfo *TLI; SmallVector<WeakVH, 16> DeadInsts; bool Changed; @@ -414,11 +416,11 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { // new comparison. NewCompare->takeName(Compare); Compare->replaceAllUsesWith(NewCompare); - RecursivelyDeleteTriviallyDeadInstructions(Compare); + RecursivelyDeleteTriviallyDeadInstructions(Compare, TLI); // Delete the old floating point increment. Incr->replaceAllUsesWith(UndefValue::get(Incr->getType())); - RecursivelyDeleteTriviallyDeadInstructions(Incr); + RecursivelyDeleteTriviallyDeadInstructions(Incr, TLI); // If the FP induction variable still has uses, this is because something else // in the loop uses its value. 
In order to canonicalize the induction @@ -431,7 +433,7 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv", PN->getParent()->getFirstInsertionPt()); PN->replaceAllUsesWith(Conv); - RecursivelyDeleteTriviallyDeadInstructions(PN); + RecursivelyDeleteTriviallyDeadInstructions(PN, TLI); } Changed = true; } @@ -550,14 +552,14 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { PN->setIncomingValue(i, ExitVal); // If this instruction is dead now, delete it. - RecursivelyDeleteTriviallyDeadInstructions(Inst); + RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI); if (NumPreds == 1) { // Completely replace a single-pred PHI. This is safe, because the // NewVal won't be variant in the loop, so we don't need an LCSSA phi // node anymore. PN->replaceAllUsesWith(ExitVal); - RecursivelyDeleteTriviallyDeadInstructions(PN); + RecursivelyDeleteTriviallyDeadInstructions(PN, TLI); } } if (NumPreds != 1) { @@ -1697,6 +1699,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { SE = &getAnalysis<ScalarEvolution>(); DT = &getAnalysis<DominatorTree>(); TD = getAnalysisIfAvailable<TargetData>(); + TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); DeadInsts.clear(); Changed = false; @@ -1763,7 +1766,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { while (!DeadInsts.empty()) if (Instruction *Inst = dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val())) - RecursivelyDeleteTriviallyDeadInstructions(Inst); + RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI); // The Rewriter may not be used from this point on. @@ -1772,7 +1775,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { SinkUnusedInvariants(L); // Clean up dead instructions. - Changed |= DeleteDeadPHIs(L->getHeader()); + Changed |= DeleteDeadPHIs(L->getHeader(), TLI); // Check a post-condition. assert(L->isLCSSAForm(*DT) && "Indvars did not leave the loop in lcssa form!"); diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index dd42c59..20844c6 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -1455,7 +1455,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, // At this point, the IR is fully up to date and consistent. Do a quick scan // over the new instructions and zap any that are constants or dead. This // frequently happens because of phi translation. - SimplifyInstructionsInBlock(NewBB, TD); + SimplifyInstructionsInBlock(NewBB, TD, TLI); // Threaded an edge! ++NumThreads; diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 0192e92..99bedce 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -108,6 +108,9 @@ namespace { BasicBlock *Preheader; // The preheader block of the current loop... Loop *CurLoop; // The current loop we are working on... AliasSetTracker *CurAST; // AliasSet information for the current loop... + bool MayThrow; // The current loop contains an instruction which + // may throw, thus preventing code motion of + // instructions with side effects. DenseMap<Loop*, AliasSetTracker*> LoopToAliasSetMap; /// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info. @@ -240,6 +243,15 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { CurAST->add(*BB); // Incorporate the specified basic block } + MayThrow = false; + // TODO: We've already searched for instructions which may throw in subloops. 
+ // We may want to reuse this information. + for (Loop::block_iterator BB = L->block_begin(), BBE = L->block_end(); + (BB != BBE) && !MayThrow ; ++BB) + for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); + (I != E) && !MayThrow; ++I) + MayThrow |= I->mayThrow(); + // We want to visit all of the instructions in this loop... that are not parts // of our subloops (they have already had their invariants hoisted out of // their loop, into this loop, so there is no need to process the BODIES of @@ -307,7 +319,7 @@ void LICM::SinkRegion(DomTreeNode *N) { // If the instruction is dead, we would try to sink it because it isn't used // in the loop, instead, just delete it. - if (isInstructionTriviallyDead(&I)) { + if (isInstructionTriviallyDead(&I, TLI)) { DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n'); ++II; CurAST->deleteValue(&I); @@ -418,17 +430,22 @@ bool LICM::canSinkOrHoistInst(Instruction &I) { if (!FoundMod) return true; } - // FIXME: This should use mod/ref information to see if we can hoist or sink - // the call. + // FIXME: This should use mod/ref information to see if we can hoist or + // sink the call. return false; } - // Otherwise these instructions are hoistable/sinkable - return isa<BinaryOperator>(I) || isa<CastInst>(I) || - isa<SelectInst>(I) || isa<GetElementPtrInst>(I) || isa<CmpInst>(I) || - isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || - isa<ShuffleVectorInst>(I); + // Only these instructions are hoistable/sinkable. + bool HoistableKind = (isa<BinaryOperator>(I) || isa<CastInst>(I) || + isa<SelectInst>(I) || isa<GetElementPtrInst>(I) || + isa<CmpInst>(I) || isa<InsertElementInst>(I) || + isa<ExtractElementInst>(I) || + isa<ShuffleVectorInst>(I)); + if (!HoistableKind) + return false; + + return isSafeToExecuteUnconditionally(I); } /// isNotUsedInLoop - Return true if the only users of this instruction are @@ -604,6 +621,12 @@ bool LICM::isSafeToExecuteUnconditionally(Instruction &Inst) { } bool LICM::isGuaranteedToExecute(Instruction &Inst) { + + // Somewhere in this loop there is an instruction which may throw and make us + // exit the loop. + if (MayThrow) + return false; + // Otherwise we have to check to make sure that the instruction dominates all // of the exit blocks. If it doesn't, then there is a path out of the loop // which does not execute this instruction, so we can't hoist it. diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index ac1082c..a72e288 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -132,7 +132,8 @@ Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognize(); } /// and zero out all the operands of this instruction. If any of them become /// dead, delete them and the computation tree that feeds them. 
/// -static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE) { +static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE, + const TargetLibraryInfo *TLI) { SmallVector<Instruction*, 32> NowDeadInsts; NowDeadInsts.push_back(I); @@ -153,7 +154,7 @@ static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE) { if (!Op->use_empty()) continue; if (Instruction *OpI = dyn_cast<Instruction>(Op)) - if (isInstructionTriviallyDead(OpI)) + if (isInstructionTriviallyDead(OpI, TLI)) NowDeadInsts.push_back(OpI); } @@ -164,10 +165,11 @@ static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE) { /// deleteIfDeadInstruction - If the specified value is a dead instruction, /// delete it and any recursively used instructions. -static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE) { +static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE, + const TargetLibraryInfo *TLI) { if (Instruction *I = dyn_cast<Instruction>(V)) - if (isInstructionTriviallyDead(I)) - deleteDeadInstruction(I, SE); + if (isInstructionTriviallyDead(I, TLI)) + deleteDeadInstruction(I, SE, TLI); } bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { @@ -490,7 +492,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, StoreSize, getAnalysis<AliasAnalysis>(), TheStore)){ Expander.clear(); // If we generated new code for the base pointer, clean up. - deleteIfDeadInstruction(BasePtr, *SE); + deleteIfDeadInstruction(BasePtr, *SE, TLI); return false; } @@ -538,7 +540,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // Okay, the memset has been formed. Zap the original store and anything that // feeds into it. - deleteDeadInstruction(TheStore, *SE); + deleteDeadInstruction(TheStore, *SE, TLI); ++NumMemSet; return true; } @@ -579,7 +581,7 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, getAnalysis<AliasAnalysis>(), SI)) { Expander.clear(); // If we generated new code for the base pointer, clean up. - deleteIfDeadInstruction(StoreBasePtr, *SE); + deleteIfDeadInstruction(StoreBasePtr, *SE, TLI); return false; } @@ -594,8 +596,8 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, StoreSize, getAnalysis<AliasAnalysis>(), SI)) { Expander.clear(); // If we generated new code for the base pointer, clean up. - deleteIfDeadInstruction(LoadBasePtr, *SE); - deleteIfDeadInstruction(StoreBasePtr, *SE); + deleteIfDeadInstruction(LoadBasePtr, *SE, TLI); + deleteIfDeadInstruction(StoreBasePtr, *SE, TLI); return false; } @@ -628,7 +630,7 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, // Okay, the memset has been formed. Zap the original store and anything that // feeds into it. 
- deleteDeadInstruction(SI, *SE); + deleteDeadInstruction(SI, *SE, TLI); ++NumMemCpy; return true; } diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp index 982400c..f5daa7b 100644 --- a/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -120,7 +120,7 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { ++NumSimplified; } } - LocalChanged |= RecursivelyDeleteTriviallyDeadInstructions(I); + LocalChanged |= RecursivelyDeleteTriviallyDeadInstructions(I, TLI); if (IsSubloopHeader && !isa<PHINode>(I)) break; diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index 7eeb152..abe07aa 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -24,6 +24,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" #include "llvm/ADT/Statistic.h" using namespace llvm; @@ -256,6 +257,7 @@ bool LoopRotate::rotateLoop(Loop *L) { return false; BasicBlock *OrigHeader = L->getHeader(); + BasicBlock *OrigLatch = L->getLoopLatch(); BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator()); if (BI == 0 || BI->isUnconditional()) @@ -267,13 +269,9 @@ bool LoopRotate::rotateLoop(Loop *L) { if (!L->isLoopExiting(OrigHeader)) return false; - // Updating PHInodes in loops with multiple exits adds complexity. - // Keep it simple, and restrict loop rotation to loops with one exit only. - // In future, lift this restriction and support for multiple exits if - // required. - SmallVector<BasicBlock*, 8> ExitBlocks; - L->getExitBlocks(ExitBlocks); - if (ExitBlocks.size() > 1) + // If the loop latch already contains a branch that leaves the loop then the + // loop is already rotated. + if (OrigLatch == 0 || L->isLoopExiting(OrigLatch)) return false; // Check size of original header and reject loop if it is very big. @@ -286,11 +284,10 @@ bool LoopRotate::rotateLoop(Loop *L) { // Now, this loop is suitable for rotation. BasicBlock *OrigPreheader = L->getLoopPreheader(); - BasicBlock *OrigLatch = L->getLoopLatch(); // If the loop could not be converted to canonical form, it must have an // indirectbr in it, just give up. - if (OrigPreheader == 0 || OrigLatch == 0) + if (OrigPreheader == 0) return false; // Anything ScalarEvolution may know about this loop or the PHI nodes @@ -298,6 +295,8 @@ bool LoopRotate::rotateLoop(Loop *L) { if (ScalarEvolution *SE = getAnalysisIfAvailable<ScalarEvolution>()) SE->forgetLoop(L); + DEBUG(dbgs() << "LoopRotation: rotating "; L->dump()); + // Find new Loop header. NewHeader is a Header's one and only successor // that is inside loop. Header's other successor is outside the // loop. Otherwise loop is not suitable for rotation. @@ -408,10 +407,19 @@ bool LoopRotate::rotateLoop(Loop *L) { // Update DominatorTree to reflect the CFG change we just made. Then split // edges as necessary to preserve LoopSimplify form. if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) { - // Since OrigPreheader now has the conditional branch to Exit block, it is - // the dominator of Exit. - DT->changeImmediateDominator(Exit, OrigPreheader); - DT->changeImmediateDominator(NewHeader, OrigPreheader); + // Everything that was dominated by the old loop header is now dominated + // by the original loop preheader. 
Conceptually the header was merged + // into the preheader, even though we reuse the actual block as a new + // loop latch. + DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader); + SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(), + OrigHeaderNode->end()); + DomTreeNode *OrigPreheaderNode = DT->getNode(OrigPreheader); + for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I) + DT->changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode); + + assert(DT->getNode(Exit)->getIDom() == OrigPreheaderNode); + assert(DT->getNode(NewHeader)->getIDom() == OrigPreheaderNode); // Update OrigHeader to be dominated by the new header block. DT->changeImmediateDominator(OrigHeader, OrigLatch); @@ -440,6 +448,35 @@ bool LoopRotate::rotateLoop(Loop *L) { // Update OrigHeader to be dominated by the new header block. DT->changeImmediateDominator(NewHeader, OrigPreheader); DT->changeImmediateDominator(OrigHeader, OrigLatch); + + // Brute force incremental dominator tree update. Call + // findNearestCommonDominator on all CFG predecessors of each child of the + // original header. + DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader); + SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(), + OrigHeaderNode->end()); + bool Changed; + do { + Changed = false; + for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I) { + DomTreeNode *Node = HeaderChildren[I]; + BasicBlock *BB = Node->getBlock(); + + pred_iterator PI = pred_begin(BB); + BasicBlock *NearestDom = *PI; + for (pred_iterator PE = pred_end(BB); PI != PE; ++PI) + NearestDom = DT->findNearestCommonDominator(NearestDom, *PI); + + // Remember if this changes the DomTree. + if (Node->getIDom()->getBlock() != NearestDom) { + DT->changeImmediateDominator(BB, NearestDom); + Changed = true; + } + } + + // If the dominator changed, this may have an effect on other + // predecessors, continue until we reach a fixpoint. + } while (Changed); } } @@ -452,6 +489,8 @@ bool LoopRotate::rotateLoop(Loop *L) { // emitted code isn't too gross in this common case. MergeBlockIntoPredecessor(OrigHeader, this); + DEBUG(dbgs() << "LoopRotation: into "; L->dump()); + ++NumRotated; return true; } diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 0ae7a51..d7495da 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -121,9 +121,11 @@ void RegSortData::print(raw_ostream &OS) const { OS << "[NumUses=" << UsedByIndices.count() << ']'; } +#ifndef NDEBUG void RegSortData::dump() const { print(errs()); errs() << '\n'; } +#endif namespace { @@ -414,9 +416,11 @@ void Formula::print(raw_ostream &OS) const { } } +#ifndef NDEBUG void Formula::dump() const { print(errs()); errs() << '\n'; } +#endif /// isAddRecSExtable - Return true if the given addrec can be sign-extended /// without changing its value. 
@@ -974,9 +978,11 @@ void Cost::print(raw_ostream &OS) const { OS << ", plus " << SetupCost << " setup cost"; } +#ifndef NDEBUG void Cost::dump() const { print(errs()); errs() << '\n'; } +#endif namespace { @@ -1060,9 +1066,11 @@ void LSRFixup::print(raw_ostream &OS) const { OS << ", Offset=" << Offset; } +#ifndef NDEBUG void LSRFixup::dump() const { print(errs()); errs() << '\n'; } +#endif namespace { @@ -1252,9 +1260,11 @@ void LSRUse::print(raw_ostream &OS) const { OS << ", widest fixup type: " << *WidestFixupType; } +#ifndef NDEBUG void LSRUse::dump() const { print(errs()); errs() << '\n'; } +#endif /// isLegalUse - Test whether the use described by AM is "legal", meaning it can /// be completely folded into the user instruction at isel time. This includes @@ -3436,9 +3446,11 @@ void WorkItem::print(raw_ostream &OS) const { << " , add offset " << Imm; } +#ifndef NDEBUG void WorkItem::dump() const { print(errs()); errs() << '\n'; } +#endif /// GenerateCrossUseConstantOffsets - Look for registers which are a constant /// distance apart and try to form reuse opportunities between them. @@ -4731,9 +4743,11 @@ void LSRInstance::print(raw_ostream &OS) const { print_uses(OS); } +#ifndef NDEBUG void LSRInstance::dump() const { print(errs()); errs() << '\n'; } +#endif namespace { diff --git a/lib/Transforms/Scalar/ObjCARC.cpp b/lib/Transforms/Scalar/ObjCARC.cpp index 3222f20..dce8e8b 100644 --- a/lib/Transforms/Scalar/ObjCARC.cpp +++ b/lib/Transforms/Scalar/ObjCARC.cpp @@ -1236,16 +1236,19 @@ bool ProvenanceAnalysis::relatedCheck(const Value *A, const Value *B) { // An ObjC-Identified object can't alias a load if it is never locally stored. if (AIsIdentified) { + // Check for an obvious escape. + if (isa<LoadInst>(B)) + return isStoredObjCPointer(A); if (BIsIdentified) { - // If both pointers have provenance, they can be directly compared. - if (A != B) - return false; - } else { - if (isa<LoadInst>(B)) - return isStoredObjCPointer(A); + // Check for an obvious escape. + if (isa<LoadInst>(A)) + return isStoredObjCPointer(B); + // Both pointers are identified and escapes aren't an evident problem. + return false; } - } else { - if (BIsIdentified && isa<LoadInst>(A)) + } else if (BIsIdentified) { + // Check for an obvious escape. + if (isa<LoadInst>(A)) + return isStoredObjCPointer(B); } @@ -1381,9 +1384,6 @@ namespace { /// PtrState - This class summarizes several per-pointer runtime properties /// which are propagated through the flow graph. class PtrState { - /// NestCount - The known minimum level of retain+release nesting. - unsigned NestCount; - /// KnownPositiveRefCount - True if the reference count is known to /// be incremented. bool KnownPositiveRefCount; @@ -1401,7 +1401,7 @@ namespace { /// TODO: Encapsulate this better.
RRInfo RRI; - PtrState() : NestCount(0), KnownPositiveRefCount(false), Partial(false), + PtrState() : KnownPositiveRefCount(false), Partial(false), Seq(S_None) {} void SetKnownPositiveRefCount() { @@ -1416,18 +1416,6 @@ namespace { return KnownPositiveRefCount; } - void IncrementNestCount() { - if (NestCount != UINT_MAX) ++NestCount; - } - - void DecrementNestCount() { - if (NestCount != 0) --NestCount; - } - - bool IsKnownNested() const { - return NestCount > 0; - } - void SetSeq(Sequence NewSeq) { Seq = NewSeq; } @@ -1454,7 +1442,6 @@ void PtrState::Merge(const PtrState &Other, bool TopDown) { Seq = MergeSeqs(Seq, Other.Seq, TopDown); KnownPositiveRefCount = KnownPositiveRefCount && Other.KnownPositiveRefCount; - NestCount = std::min(NestCount, Other.NestCount); // We can't merge a plain objc_retain with an objc_retainBlock. if (RRI.IsRetainBlock != Other.RRI.IsRetainBlock) @@ -1868,6 +1855,26 @@ Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) { return AutoreleaseCallee; } +/// IsPotentialUse - Test whether the given value is possibly a +/// reference-counted pointer, including tests which utilize AliasAnalysis. +static bool IsPotentialUse(const Value *Op, AliasAnalysis &AA) { + // First make the rudimentary check. + if (!IsPotentialUse(Op)) + return false; + + // Objects in constant memory are not reference-counted. + if (AA.pointsToConstantMemory(Op)) + return false; + + // Pointers in constant memory are not pointing to reference-counted objects. + if (const LoadInst *LI = dyn_cast<LoadInst>(Op)) + if (AA.pointsToConstantMemory(LI->getPointerOperand())) + return false; + + // Otherwise assume the worst. + return true; +} + /// CanAlterRefCount - Test whether the given instruction can result in a /// reference count modification (positive or negative) for the pointer's /// object. @@ -1894,7 +1901,7 @@ CanAlterRefCount(const Instruction *Inst, const Value *Ptr, for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E; ++I) { const Value *Op = *I; - if (IsPotentialUse(Op) && PA.related(Ptr, Op)) + if (IsPotentialUse(Op, *PA.getAA()) && PA.related(Ptr, Op)) return true; } return false; @@ -1919,14 +1926,14 @@ CanUse(const Instruction *Inst, const Value *Ptr, ProvenanceAnalysis &PA, // Comparing a pointer with null, or any other constant, isn't really a use, // because we don't care what the pointer points to, or about the values // of any other dynamic reference-counted pointers. - if (!IsPotentialUse(ICI->getOperand(1))) + if (!IsPotentialUse(ICI->getOperand(1), *PA.getAA())) return false; } else if (ImmutableCallSite CS = static_cast<const Value *>(Inst)) { // For calls, just check the arguments (and not the callee operand). for (ImmutableCallSite::arg_iterator OI = CS.arg_begin(), OE = CS.arg_end(); OI != OE; ++OI) { const Value *Op = *OI; - if (IsPotentialUse(Op) && PA.related(Ptr, Op)) + if (IsPotentialUse(Op, *PA.getAA()) && PA.related(Ptr, Op)) return true; } return false; @@ -1936,14 +1943,14 @@ CanUse(const Instruction *Inst, const Value *Ptr, ProvenanceAnalysis &PA, const Value *Op = GetUnderlyingObjCPtr(SI->getPointerOperand()); // If we can't tell what the underlying object was, assume there is a // dependence. - return IsPotentialUse(Op) && PA.related(Op, Ptr); + return IsPotentialUse(Op, *PA.getAA()) && PA.related(Op, Ptr); } // Check each operand for a match.
for (User::const_op_iterator OI = Inst->op_begin(), OE = Inst->op_end(); OI != OE; ++OI) { const Value *Op = *OI; - if (IsPotentialUse(Op) && PA.related(Ptr, Op)) + if (IsPotentialUse(Op, *PA.getAA()) && PA.related(Ptr, Op)) return true; } return false; @@ -2612,11 +2619,11 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, MDNode *ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind); S.ResetSequenceProgress(ReleaseMetadata ? S_MovableRelease : S_Release); S.RRI.ReleaseMetadata = ReleaseMetadata; - S.RRI.KnownSafe = S.IsKnownNested() || S.IsKnownIncremented(); + S.RRI.KnownSafe = S.IsKnownIncremented(); S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall(); S.RRI.Calls.insert(Inst); - S.IncrementNestCount(); + S.SetKnownPositiveRefCount(); break; } case IC_RetainBlock: @@ -2631,7 +2638,6 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, PtrState &S = MyStates.getPtrBottomUpState(Arg); S.SetKnownPositiveRefCount(); - S.DecrementNestCount(); switch (S.GetSeq()) { case S_Stop: @@ -2747,8 +2753,9 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, // Merge the states from each successor to compute the initial state // for the current block. - for (BBState::edge_iterator SI(MyStates.succ_begin()), - SE(MyStates.succ_end()); SI != SE; ++SI) { + BBState::edge_iterator SI(MyStates.succ_begin()), + SE(MyStates.succ_end()); + if (SI != SE) { const BasicBlock *Succ = *SI; DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ); assert(I != BBStates.end()); @@ -2760,7 +2767,6 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, assert(I != BBStates.end()); MyStates.MergeSucc(I->second); } - break; } // Visit all the instructions, bottom-up. @@ -2823,12 +2829,11 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst, S.ResetSequenceProgress(S_Retain); S.RRI.IsRetainBlock = Class == IC_RetainBlock; - // Don't check S.IsKnownIncremented() here because it's not sufficient. - S.RRI.KnownSafe = S.IsKnownNested(); + S.RRI.KnownSafe = S.IsKnownIncremented(); S.RRI.Calls.insert(Inst); } - S.IncrementNestCount(); + S.SetKnownPositiveRefCount(); // A retain can be a potential use; proceed to the generic checking // code below. @@ -2838,7 +2843,7 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst, Arg = GetObjCArg(Inst); PtrState &S = MyStates.getPtrTopDownState(Arg); - S.DecrementNestCount(); + S.ClearRefCount(); switch (S.GetSeq()) { case S_Retain: @@ -2935,8 +2940,9 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, // Merge the states from each predecessor to compute the initial state // for the current block. - for (BBState::edge_iterator PI(MyStates.pred_begin()), - PE(MyStates.pred_end()); PI != PE; ++PI) { + BBState::edge_iterator PI(MyStates.pred_begin()), + PE(MyStates.pred_end()); + if (PI != PE) { const BasicBlock *Pred = *PI; DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred); assert(I != BBStates.end()); @@ -2948,7 +2954,6 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, assert(I != BBStates.end()); MyStates.MergePred(I->second); } - break; } // Visit all the instructions, top-down.
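The VisitBottomUp/VisitTopDown hunks above replace a loop that always broke out after its first iteration with an explicit "copy the first edge's state, then merge the rest" shape. The same shape, reduced to a self-contained sketch over plain integers (all names below are illustrative, with std::min standing in for the PtrState merge):

    #include <algorithm>
    #include <vector>

    // Initialize the block state from the first successor, then merge each
    // remaining successor into it, mirroring how MyStates is seeded from the
    // first edge and then MergeSucc'ed with the rest.
    static int mergeSuccessorStates(const std::vector<int> &SuccStates,
                                    int DefaultState) {
      std::vector<int>::const_iterator I = SuccStates.begin(),
                                       E = SuccStates.end();
      if (I == E)
        return DefaultState;         // No successors: keep the default state.
      int State = *I;                // Copy the first successor's state.
      for (++I; I != E; ++I)
        State = std::min(State, *I); // Merge each remaining successor.
      return State;
    }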
diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp index d13e4ab..6d27db1 100644 --- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -59,9 +59,9 @@ FunctionPass *llvm::createCFGSimplificationPass() { return new CFGSimplifyPass(); } -/// ChangeToUnreachable - Insert an unreachable instruction before the specified +/// changeToUnreachable - Insert an unreachable instruction before the specified /// instruction, making it and the rest of the code in the block dead. -static void ChangeToUnreachable(Instruction *I, bool UseLLVMTrap) { +static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) { BasicBlock *BB = I->getParent(); // Loop over all of the successors, removing BB's entry from any PHI // nodes. @@ -87,8 +87,8 @@ static void ChangeToUnreachable(Instruction *I, bool UseLLVMTrap) { } } -/// ChangeToCall - Convert the specified invoke into a normal call. -static void ChangeToCall(InvokeInst *II) { +/// changeToCall - Convert the specified invoke into a normal call. +static void changeToCall(InvokeInst *II) { SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3); CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, "", II); NewCall->takeName(II); @@ -105,7 +105,7 @@ static void ChangeToCall(InvokeInst *II) { II->eraseFromParent(); } -static bool MarkAliveBlocks(BasicBlock *BB, +static bool markAliveBlocks(BasicBlock *BB, SmallPtrSet<BasicBlock*, 128> &Reachable) { SmallVector<BasicBlock*, 128> Worklist; @@ -129,7 +129,7 @@ static bool MarkAliveBlocks(BasicBlock *BB, ++BBI; if (!isa<UnreachableInst>(BBI)) { // Don't insert a call to llvm.trap right before the unreachable. - ChangeToUnreachable(BBI, false); + changeToUnreachable(BBI, false); Changed = true; } break; @@ -148,7 +148,7 @@ static bool MarkAliveBlocks(BasicBlock *BB, if (isa<UndefValue>(Ptr) || (isa<ConstantPointerNull>(Ptr) && SI->getPointerAddressSpace() == 0)) { - ChangeToUnreachable(SI, true); + changeToUnreachable(SI, true); Changed = true; break; } @@ -159,7 +159,7 @@ static bool MarkAliveBlocks(BasicBlock *BB, if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) { Value *Callee = II->getCalledValue(); if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) { - ChangeToUnreachable(II, true); + changeToUnreachable(II, true); Changed = true; } else if (II->doesNotThrow()) { if (II->use_empty() && II->onlyReadsMemory()) { @@ -168,7 +168,7 @@ static bool MarkAliveBlocks(BasicBlock *BB, II->getUnwindDest()->removePredecessor(II->getParent()); II->eraseFromParent(); } else - ChangeToCall(II); + changeToCall(II); Changed = true; } } @@ -180,12 +180,12 @@ static bool MarkAliveBlocks(BasicBlock *BB, return Changed; } -/// RemoveUnreachableBlocksFromFn - Remove blocks that are not reachable, even +/// removeUnreachableBlocksFromFn - Remove blocks that are not reachable, even /// if they are in a dead cycle. Return true if a change was made, false /// otherwise. -static bool RemoveUnreachableBlocksFromFn(Function &F) { +static bool removeUnreachableBlocksFromFn(Function &F) { SmallPtrSet<BasicBlock*, 128> Reachable; - bool Changed = MarkAliveBlocks(F.begin(), Reachable); + bool Changed = markAliveBlocks(F.begin(), Reachable); // If there are unreachable blocks in the CFG... 
if (Reachable.size() == F.size()) @@ -215,9 +215,9 @@ static bool RemoveUnreachableBlocksFromFn(Function &F) { return true; } -/// MergeEmptyReturnBlocks - If we have more than one empty (other than phi +/// mergeEmptyReturnBlocks - If we have more than one empty (other than phi /// node) return blocks, merge them together to promote recursive block merging. -static bool MergeEmptyReturnBlocks(Function &F) { +static bool mergeEmptyReturnBlocks(Function &F) { bool Changed = false; BasicBlock *RetBlock = 0; @@ -291,9 +291,9 @@ static bool MergeEmptyReturnBlocks(Function &F) { return Changed; } -/// IterativeSimplifyCFG - Call SimplifyCFG on all the blocks in the function, +/// iterativelySimplifyCFG - Call SimplifyCFG on all the blocks in the function, /// iterating until no more changes are made. -static bool IterativeSimplifyCFG(Function &F, const TargetData *TD) { +static bool iterativelySimplifyCFG(Function &F, const TargetData *TD) { bool Changed = false; bool LocalChange = true; while (LocalChange) { @@ -317,24 +317,24 @@ static bool IterativeSimplifyCFG(Function &F, const TargetData *TD) { // bool CFGSimplifyPass::runOnFunction(Function &F) { const TargetData *TD = getAnalysisIfAvailable<TargetData>(); - bool EverChanged = RemoveUnreachableBlocksFromFn(F); - EverChanged |= MergeEmptyReturnBlocks(F); - EverChanged |= IterativeSimplifyCFG(F, TD); + bool EverChanged = removeUnreachableBlocksFromFn(F); + EverChanged |= mergeEmptyReturnBlocks(F); + EverChanged |= iterativelySimplifyCFG(F, TD); // If neither pass changed anything, we're done. if (!EverChanged) return false; - // IterativeSimplifyCFG can (rarely) make some loops dead. If this happens, - // RemoveUnreachableBlocksFromFn is needed to nuke them, which means we should + // iterativelySimplifyCFG can (rarely) make some loops dead. If this happens, + // removeUnreachableBlocksFromFn is needed to nuke them, which means we should // iterate between the two optimizations. We structure the code like this to - // avoid reruning IterativeSimplifyCFG if the second pass of - // RemoveUnreachableBlocksFromFn doesn't do anything. - if (!RemoveUnreachableBlocksFromFn(F)) + // avoid reruning iterativelySimplifyCFG if the second pass of + // removeUnreachableBlocksFromFn doesn't do anything. 
+ if (!removeUnreachableBlocksFromFn(F)) return true; do { - EverChanged = IterativeSimplifyCFG(F, TD); - EverChanged |= RemoveUnreachableBlocksFromFn(F); + EverChanged = iterativelySimplifyCFG(F, TD); + EverChanged |= removeUnreachableBlocksFromFn(F); } while (EverChanged); return true; diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp index 3904419..65311fe 100644 --- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp +++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp @@ -28,6 +28,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringMap.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetData.h" @@ -38,6 +39,10 @@ using namespace llvm; STATISTIC(NumSimplified, "Number of library calls simplified"); STATISTIC(NumAnnotated, "Number of attributes added to library functions"); +static cl::opt<bool> UnsafeFPShrink("enable-double-float-shrink", cl::Hidden, + cl::init(false), + cl::desc("Enable unsafe double to float " + "shrinking for math lib calls")); //===----------------------------------------------------------------------===// // Optimizer Base Class //===----------------------------------------------------------------------===// @@ -893,16 +898,56 @@ struct MemSetOpt : public LibCallOptimization { //===----------------------------------------------------------------------===// //===---------------------------------------===// -// 'cos*' Optimizations +// Double -> Float Shrinking Optimizations for Unary Functions like 'floor' + +struct UnaryDoubleFPOpt : public LibCallOptimization { + bool CheckRetType; + UnaryDoubleFPOpt(bool CheckReturnType): CheckRetType(CheckReturnType) {} + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 1 || !FT->getReturnType()->isDoubleTy() || + !FT->getParamType(0)->isDoubleTy()) + return 0; + + if (CheckRetType) { + // Check if all the uses of a function like 'sin' are converted to float. + for (Value::use_iterator UseI = CI->use_begin(); UseI != CI->use_end(); + ++UseI) { + FPTruncInst *Cast = dyn_cast<FPTruncInst>(*UseI); + if (Cast == 0 || !Cast->getType()->isFloatTy()) + return 0; + } + } + + // If this is something like 'floor((double)floatval)', convert to floorf. + FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getArgOperand(0)); + if (Cast == 0 || !Cast->getOperand(0)->getType()->isFloatTy()) + return 0; + + // floor((double)floatval) -> (double)floorf(floatval) + Value *V = Cast->getOperand(0); + V = EmitUnaryFloatFnCall(V, Callee->getName(), B, Callee->getAttributes()); + return B.CreateFPExt(V, B.getDoubleTy()); + } +}; +//===---------------------------------------===// +// 'cos*' Optimizations struct CosOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *Ret = NULL; + if (UnsafeFPShrink && Callee->getName() == "cos" && + TLI->has(LibFunc::cosf)) { + UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true); + Ret = UnsafeUnaryDoubleFP.CallOptimizer(Callee, CI, B); + } + FunctionType *FT = Callee->getFunctionType(); // Just make sure this has 1 argument of FP type, which matches the // result type.
if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isFloatingPointTy()) - return 0; + return Ret; // cos(-x) -> cos(x) Value *Op1 = CI->getArgOperand(0); @@ -910,7 +955,7 @@ struct CosOpt : public LibCallOptimization { BinaryOperator *BinExpr = cast<BinaryOperator>(Op1); return B.CreateCall(Callee, BinExpr->getOperand(1), "cos"); } - return 0; + return Ret; } }; @@ -919,13 +964,20 @@ struct CosOpt : public LibCallOptimization { struct PowOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *Ret = NULL; + if (UnsafeFPShrink && Callee->getName() == "pow" && + TLI->has(LibFunc::powf)) { + UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true); + Ret = UnsafeUnaryDoubleFP.CallOptimizer(Callee, CI, B); + } + FunctionType *FT = Callee->getFunctionType(); // Just make sure this has 2 arguments of the same FP type, which match the // result type. if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) || FT->getParamType(0) != FT->getParamType(1) || !FT->getParamType(0)->isFloatingPointTy()) - return 0; + return Ret; Value *Op1 = CI->getArgOperand(0), *Op2 = CI->getArgOperand(1); if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) { @@ -936,7 +988,7 @@ struct PowOpt : public LibCallOptimization { } ConstantFP *Op2C = dyn_cast<ConstantFP>(Op2); - if (Op2C == 0) return 0; + if (Op2C == 0) return Ret; if (Op2C->getValueAPF().isZero()) // pow(x, 0.0) -> 1.0 return ConstantFP::get(CI->getType(), 1.0); @@ -974,12 +1026,19 @@ struct PowOpt : public LibCallOptimization { struct Exp2Opt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + Value *Ret = NULL; + if (UnsafeFPShrink && Callee->getName() == "exp2" && + TLI->has(LibFunc::exp2)) { + UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true); + Ret = UnsafeUnaryDoubleFP.CallOptimizer(Callee, CI, B); + } + FunctionType *FT = Callee->getFunctionType(); // Just make sure this has 1 argument of FP type, which matches the // result type. if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isFloatingPointTy()) - return 0; + return Ret; Value *Op = CI->getArgOperand(0); // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x)) if sizeof(x) <= 32 @@ -1016,29 +1075,7 @@ struct Exp2Opt : public LibCallOptimization { return CI; } - return 0; - } -}; - -//===---------------------------------------===// -// Double -> Float Shrinking Optimizations for Unary Functions like 'floor' - -struct UnaryDoubleFPOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 1 || !FT->getReturnType()->isDoubleTy() || - !FT->getParamType(0)->isDoubleTy()) - return 0; - - // If this is something like 'floor((double)floatval)', convert to floorf. 
- FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getArgOperand(0)); - if (Cast == 0 || !Cast->getOperand(0)->getType()->isFloatTy()) - return 0; - - // floor((double)floatval) -> (double)floorf(floatval) - Value *V = Cast->getOperand(0); - V = EmitUnaryFloatFnCall(V, Callee->getName(), B, Callee->getAttributes()); - return B.CreateFPExt(V, B.getDoubleTy()); + return Ret; } }; @@ -1534,7 +1571,8 @@ namespace { StrToOpt StrTo; StrSpnOpt StrSpn; StrCSpnOpt StrCSpn; StrStrOpt StrStr; MemCmpOpt MemCmp; MemCpyOpt MemCpy; MemMoveOpt MemMove; MemSetOpt MemSet; // Math Library Optimizations - CosOpt Cos; PowOpt Pow; Exp2Opt Exp2; UnaryDoubleFPOpt UnaryDoubleFP; + CosOpt Cos; PowOpt Pow; Exp2Opt Exp2; + UnaryDoubleFPOpt UnaryDoubleFP, UnsafeUnaryDoubleFP; // Integer Optimizations FFSOpt FFS; AbsOpt Abs; IsDigitOpt IsDigit; IsAsciiOpt IsAscii; ToAsciiOpt ToAscii; @@ -1547,10 +1585,13 @@ namespace { public: static char ID; // Pass identification SimplifyLibCalls() : FunctionPass(ID), StrCpy(false), StrCpyChk(true), - StpCpy(false), StpCpyChk(true) { + StpCpy(false), StpCpyChk(true), + UnaryDoubleFP(false), UnsafeUnaryDoubleFP(true) { initializeSimplifyLibCallsPass(*PassRegistry::getPassRegistry()); } void AddOpt(LibFunc::Func F, LibCallOptimization* Opt); + void AddOpt(LibFunc::Func F1, LibFunc::Func F2, LibCallOptimization* Opt); + void InitOptimizations(); bool runOnFunction(Function &F); @@ -1586,6 +1627,12 @@ void SimplifyLibCalls::AddOpt(LibFunc::Func F, LibCallOptimization* Opt) { Optimizations[TLI->getName(F)] = Opt; } +void SimplifyLibCalls::AddOpt(LibFunc::Func F1, LibFunc::Func F2, + LibCallOptimization* Opt) { + if (TLI->has(F1) && TLI->has(F2)) + Optimizations[TLI->getName(F1)] = Opt; +} + /// Optimizations - Populate the Optimizations map with all the optimizations /// we know. 
void SimplifyLibCalls::InitOptimizations() { @@ -1641,20 +1688,37 @@ void SimplifyLibCalls::InitOptimizations() { Optimizations["llvm.exp2.f64"] = &Exp2; Optimizations["llvm.exp2.f32"] = &Exp2; - if (TLI->has(LibFunc::fabs) && TLI->has(LibFunc::fabsf)) - Optimizations["fabs"] = &UnaryDoubleFP; - if (TLI->has(LibFunc::floor) && TLI->has(LibFunc::floorf)) - Optimizations["floor"] = &UnaryDoubleFP; - if (TLI->has(LibFunc::ceil) && TLI->has(LibFunc::ceilf)) - Optimizations["ceil"] = &UnaryDoubleFP; - if (TLI->has(LibFunc::round) && TLI->has(LibFunc::roundf)) - Optimizations["round"] = &UnaryDoubleFP; - if (TLI->has(LibFunc::rint) && TLI->has(LibFunc::rintf)) - Optimizations["rint"] = &UnaryDoubleFP; - if (TLI->has(LibFunc::nearbyint) && TLI->has(LibFunc::nearbyintf)) - Optimizations["nearbyint"] = &UnaryDoubleFP; - if (TLI->has(LibFunc::trunc) && TLI->has(LibFunc::truncf)) - Optimizations["trunc"] = &UnaryDoubleFP; + AddOpt(LibFunc::ceil, LibFunc::ceilf, &UnaryDoubleFP); + AddOpt(LibFunc::fabs, LibFunc::fabsf, &UnaryDoubleFP); + AddOpt(LibFunc::floor, LibFunc::floorf, &UnaryDoubleFP); + AddOpt(LibFunc::rint, LibFunc::rintf, &UnaryDoubleFP); + AddOpt(LibFunc::round, LibFunc::roundf, &UnaryDoubleFP); + AddOpt(LibFunc::nearbyint, LibFunc::nearbyintf, &UnaryDoubleFP); + AddOpt(LibFunc::trunc, LibFunc::truncf, &UnaryDoubleFP); + + if (UnsafeFPShrink) { + AddOpt(LibFunc::acos, LibFunc::acosf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::acosh, LibFunc::acoshf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::asin, LibFunc::asinf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::asinh, LibFunc::asinhf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::atan, LibFunc::atanf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::atanh, LibFunc::atanhf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::cbrt, LibFunc::cbrtf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::cosh, LibFunc::coshf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::exp, LibFunc::expf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::exp10, LibFunc::exp10f, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::expm1, LibFunc::expm1f, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::log, LibFunc::logf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::log10, LibFunc::log10f, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::log1p, LibFunc::log1pf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::log2, LibFunc::log2f, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::logb, LibFunc::logbf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::sin, LibFunc::sinf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::sinh, LibFunc::sinhf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::sqrt, LibFunc::sqrtf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::tan, LibFunc::tanf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::tanh, LibFunc::tanhf, &UnsafeUnaryDoubleFP); + } // Integer Optimizations Optimizations["ffs"] = &FFS; diff --git a/lib/Transforms/Utils/AddrModeMatcher.cpp b/lib/Transforms/Utils/AddrModeMatcher.cpp index d831452..1e6586b 100644 --- a/lib/Transforms/Utils/AddrModeMatcher.cpp +++ b/lib/Transforms/Utils/AddrModeMatcher.cpp @@ -55,10 +55,12 @@ void ExtAddrMode::print(raw_ostream &OS) const { OS << ']'; } +#ifndef NDEBUG void ExtAddrMode::dump() const { print(dbgs()); dbgs() << '\n'; } +#endif /// MatchScaledValue - Try adding ScaleReg*Scale to the current addressing mode. diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp index 2679b93..75a7817 100644 --- a/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -94,7 +94,7 @@ void llvm::FoldSingleEntryPHINodes(BasicBlock *BB, Pass *P) { /// is dead.
Also recursively delete any operands that become dead as /// a result. This includes tracing the def-use list from the PHI to see if /// it is ultimately unused or if it reaches an unused cycle. -bool llvm::DeleteDeadPHIs(BasicBlock *BB) { +bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) { // Recursively deleting a PHI may cause multiple PHIs to be deleted // or RAUW'd undef, so use an array of WeakVH for the PHIs to delete. SmallVector<WeakVH, 8> PHIs; @@ -105,7 +105,7 @@ bool llvm::DeleteDeadPHIs(BasicBlock *BB) { bool Changed = false; for (unsigned i = 0, e = PHIs.size(); i != e; ++i) if (PHINode *PN = dyn_cast_or_null<PHINode>(PHIs[i].operator Value*())) - Changed |= RecursivelyDeleteDeadPHINode(PN); + Changed |= RecursivelyDeleteDeadPHINode(PN, TLI); return Changed; } diff --git a/lib/Transforms/Utils/BypassSlowDivision.cpp b/lib/Transforms/Utils/BypassSlowDivision.cpp new file mode 100644 index 0000000..30d60be --- /dev/null +++ b/lib/Transforms/Utils/BypassSlowDivision.cpp @@ -0,0 +1,253 @@ +//===-- BypassSlowDivision.cpp - Bypass slow division ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains an optimization for div and rem on architectures that +// execute short instructions significantly faster than longer instructions. +// For example, on Intel Atom 32-bit divides are slow enough that during +// runtime it is profitable to check the value of the operands, and if they are +// positive and less than 256 use an unsigned 8-bit divide. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "bypass-slow-division" +#include "llvm/Instructions.h" +#include "llvm/Function.h" +#include "llvm/IRBuilder.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/Transforms/Utils/BypassSlowDivision.h" + +using namespace llvm; + +namespace { + struct DivOpInfo { + bool SignedOp; + Value *Dividend; + Value *Divisor; + + DivOpInfo(bool InSignedOp, Value *InDividend, Value *InDivisor) + : SignedOp(InSignedOp), Dividend(InDividend), Divisor(InDivisor) {} + }; + + struct DivPhiNodes { + PHINode *Quotient; + PHINode *Remainder; + + DivPhiNodes(PHINode *InQuotient, PHINode *InRemainder) + : Quotient(InQuotient), Remainder(InRemainder) {} + }; +} + +namespace llvm { + template<> + struct DenseMapInfo<DivOpInfo> { + static bool isEqual(const DivOpInfo &Val1, const DivOpInfo &Val2) { + return Val1.SignedOp == Val2.SignedOp && + Val1.Dividend == Val2.Dividend && + Val1.Divisor == Val2.Divisor; + } + + static DivOpInfo getEmptyKey() { + return DivOpInfo(false, 0, 0); + } + + static DivOpInfo getTombstoneKey() { + return DivOpInfo(true, 0, 0); + } + + static unsigned getHashValue(const DivOpInfo &Val) { + return (unsigned)(reinterpret_cast<uintptr_t>(Val.Dividend) ^ + reinterpret_cast<uintptr_t>(Val.Divisor)) ^ + (unsigned)Val.SignedOp; + } + }; + + typedef DenseMap<DivOpInfo, DivPhiNodes> DivCacheTy; +} + +// insertFastDiv - Substitutes the div/rem instruction with code that checks the +// value of the operands and uses a shorter-faster div/rem instruction when +// possible and the longer-slower div/rem instruction otherwise. 
+static bool insertFastDiv(Function &F, + Function::iterator &I, + BasicBlock::iterator &J, + IntegerType *BypassType, + bool UseDivOp, + bool UseSignedOp, + DivCacheTy &PerBBDivCache) { + // Get instruction operands + Instruction *Instr = J; + Value *Dividend = Instr->getOperand(0); + Value *Divisor = Instr->getOperand(1); + + if (isa<ConstantInt>(Divisor) || + (isa<ConstantInt>(Dividend) && isa<ConstantInt>(Divisor))) { + // Operations with immediate values should have + // been solved and replaced during compile time. + return false; + } + + // Basic Block is split before divide + BasicBlock *MainBB = I; + BasicBlock *SuccessorBB = I->splitBasicBlock(J); + ++I; //advance iterator I to successorBB + + // Add new basic block for slow divide operation + BasicBlock *SlowBB = BasicBlock::Create(F.getContext(), "", + MainBB->getParent(), SuccessorBB); + SlowBB->moveBefore(SuccessorBB); + IRBuilder<> SlowBuilder(SlowBB, SlowBB->begin()); + Value *SlowQuotientV; + Value *SlowRemainderV; + if (UseSignedOp) { + SlowQuotientV = SlowBuilder.CreateSDiv(Dividend, Divisor); + SlowRemainderV = SlowBuilder.CreateSRem(Dividend, Divisor); + } else { + SlowQuotientV = SlowBuilder.CreateUDiv(Dividend, Divisor); + SlowRemainderV = SlowBuilder.CreateURem(Dividend, Divisor); + } + SlowBuilder.CreateBr(SuccessorBB); + + // Add new basic block for fast divide operation + BasicBlock *FastBB = BasicBlock::Create(F.getContext(), "", + MainBB->getParent(), SuccessorBB); + FastBB->moveBefore(SlowBB); + IRBuilder<> FastBuilder(FastBB, FastBB->begin()); + Value *ShortDivisorV = FastBuilder.CreateCast(Instruction::Trunc, Divisor, + BypassType); + Value *ShortDividendV = FastBuilder.CreateCast(Instruction::Trunc, Dividend, + BypassType); + + // udiv/urem because optimization only handles positive numbers + Value *ShortQuotientV = FastBuilder.CreateExactUDiv(ShortDividendV, + ShortDivisorV); + Value *ShortRemainderV = FastBuilder.CreateURem(ShortDividendV, + ShortDivisorV); + Value *FastQuotientV = FastBuilder.CreateCast(Instruction::ZExt, + ShortQuotientV, + Dividend->getType()); + Value *FastRemainderV = FastBuilder.CreateCast(Instruction::ZExt, + ShortRemainderV, + Dividend->getType()); + FastBuilder.CreateBr(SuccessorBB); + + // Phi nodes for result of div and rem + IRBuilder<> SuccessorBuilder(SuccessorBB, SuccessorBB->begin()); + PHINode *QuoPhi = SuccessorBuilder.CreatePHI(Instr->getType(), 2); + QuoPhi->addIncoming(SlowQuotientV, SlowBB); + QuoPhi->addIncoming(FastQuotientV, FastBB); + PHINode *RemPhi = SuccessorBuilder.CreatePHI(Instr->getType(), 2); + RemPhi->addIncoming(SlowRemainderV, SlowBB); + RemPhi->addIncoming(FastRemainderV, FastBB); + + // Replace Instr with appropriate phi node + if (UseDivOp) + Instr->replaceAllUsesWith(QuoPhi); + else + Instr->replaceAllUsesWith(RemPhi); + Instr->eraseFromParent(); + + // Combine operands into a single value with OR for value testing below + MainBB->getInstList().back().eraseFromParent(); + IRBuilder<> MainBuilder(MainBB, MainBB->end()); + Value *OrV = MainBuilder.CreateOr(Dividend, Divisor); + + // BitMask is inverted to check if the operands are + // larger than the bypass type + uint64_t BitMask = ~BypassType->getBitMask(); + Value *AndV = MainBuilder.CreateAnd(OrV, BitMask); + + // Compare operand values and branch + Value *ZeroV = MainBuilder.getInt32(0); + Value *CmpV = MainBuilder.CreateICmpEQ(AndV, ZeroV); + MainBuilder.CreateCondBr(CmpV, FastBB, SlowBB); + + // point iterator J at first instruction of successorBB + J = I->begin(); + + // Cache phi nodes to 
be used later in place of other instances + // of div or rem with the same sign, dividend, and divisor + DivOpInfo Key(UseSignedOp, Dividend, Divisor); + DivPhiNodes Value(QuoPhi, RemPhi); + PerBBDivCache.insert(std::pair<DivOpInfo, DivPhiNodes>(Key, Value)); + return true; +} + +// reuseOrInsertFastDiv - Reuses previously computed quotient or remainder if +// operands and operation are identical. Otherwise call insertFastDiv to perform +// the optimization and cache the resulting quotient and remainder. +static bool reuseOrInsertFastDiv(Function &F, + Function::iterator &I, + BasicBlock::iterator &J, + IntegerType *BypassType, + bool UseDivOp, + bool UseSignedOp, + DivCacheTy &PerBBDivCache) { + // Get instruction operands + Instruction *Instr = J; + DivOpInfo Key(UseSignedOp, Instr->getOperand(0), Instr->getOperand(1)); + DivCacheTy::iterator CacheI = PerBBDivCache.find(Key); + + if (CacheI == PerBBDivCache.end()) { + // If previous instance does not exist, insert fast div + return insertFastDiv(F, I, J, BypassType, UseDivOp, UseSignedOp, + PerBBDivCache); + } + + // Replace operation value with previously generated phi node + DivPhiNodes &Value = CacheI->second; + if (UseDivOp) { + // Replace all uses of div instruction with quotient phi node + J->replaceAllUsesWith(Value.Quotient); + } else { + // Replace all uses of rem instruction with remainder phi node + J->replaceAllUsesWith(Value.Remainder); + } + + // Advance to next operation + ++J; + + // Remove redundant operation + Instr->eraseFromParent(); + return true; +} + +// bypassSlowDivision - This optimization identifies DIV instructions that can +// be profitably bypassed and carried out with a shorter, faster divide. +bool llvm::bypassSlowDivision(Function &F, + Function::iterator &I, + const DenseMap<Type *, Type *> &BypassTypeMap) { + DivCacheTy DivCache; + + bool MadeChange = false; + for (BasicBlock::iterator J = I->begin(); J != I->end(); J++) { + + // Get instruction details + unsigned Opcode = J->getOpcode(); + bool UseDivOp = Opcode == Instruction::SDiv || Opcode == Instruction::UDiv; + bool UseRemOp = Opcode == Instruction::SRem || Opcode == Instruction::URem; + bool UseSignedOp = Opcode == Instruction::SDiv || + Opcode == Instruction::SRem; + + // Only optimize div or rem ops + if (!UseDivOp && !UseRemOp) + continue; + + // Continue if div/rem type is not bypassed + DenseMap<Type *, Type *>::const_iterator BT = + BypassTypeMap.find(J->getType()); + if (BT == BypassTypeMap.end()) + continue; + + IntegerType *BypassType = cast<IntegerType>(BT->second); + MadeChange |= reuseOrInsertFastDiv(F, I, J, BypassType, UseDivOp, + UseSignedOp, DivCache); + } + + return MadeChange; +} diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt index 4ff31ca..215a16f 100644 --- a/lib/Transforms/Utils/CMakeLists.txt +++ b/lib/Transforms/Utils/CMakeLists.txt @@ -3,6 +3,7 @@ add_llvm_library(LLVMTransformUtils BasicBlockUtils.cpp BreakCriticalEdges.cpp BuildLibCalls.cpp + BypassSlowDivision.cpp CloneFunction.cpp CloneModule.cpp CmpInstAnalysis.cpp diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index bed7d72..0601433 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -52,7 +52,8 @@ using namespace llvm; /// Also calls RecursivelyDeleteTriviallyDeadInstructions() on any branch/switch /// conditions and indirectbr addresses this might make dead if /// DeleteDeadConditions is true.
-bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions) { +bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, + const TargetLibraryInfo *TLI) { TerminatorInst *T = BB->getTerminator(); IRBuilder<> Builder(T); @@ -96,7 +97,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions) { Value *Cond = BI->getCondition(); BI->eraseFromParent(); if (DeleteDeadConditions) - RecursivelyDeleteTriviallyDeadInstructions(Cond); + RecursivelyDeleteTriviallyDeadInstructions(Cond, TLI); return true; } return false; @@ -161,7 +162,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions) { Value *Cond = SI->getCondition(); SI->eraseFromParent(); if (DeleteDeadConditions) - RecursivelyDeleteTriviallyDeadInstructions(Cond); + RecursivelyDeleteTriviallyDeadInstructions(Cond, TLI); return true; } @@ -205,7 +206,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions) { Value *Address = IBI->getAddress(); IBI->eraseFromParent(); if (DeleteDeadConditions) - RecursivelyDeleteTriviallyDeadInstructions(Address); + RecursivelyDeleteTriviallyDeadInstructions(Address, TLI); // If we didn't find our destination in the IBI successor list, then we // have undefined behavior. Replace the unconditional branch with an @@ -230,7 +231,8 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions) { /// isInstructionTriviallyDead - Return true if the result produced by the /// instruction is not used, and the instruction has no side effects. /// -bool llvm::isInstructionTriviallyDead(Instruction *I) { +bool llvm::isInstructionTriviallyDead(Instruction *I, + const TargetLibraryInfo *TLI) { if (!I->use_empty() || isa<TerminatorInst>(I)) return false; // We don't want the landingpad instruction removed by anything this general. @@ -265,9 +267,9 @@ bool llvm::isInstructionTriviallyDead(Instruction *I) { return isa<UndefValue>(II->getArgOperand(1)); } - if (isAllocLikeFn(I)) return true; + if (isAllocLikeFn(I, TLI)) return true; - if (CallInst *CI = isFreeCall(I)) + if (CallInst *CI = isFreeCall(I, TLI)) if (Constant *C = dyn_cast<Constant>(CI->getArgOperand(0))) return C->isNullValue() || isa<UndefValue>(C); @@ -278,9 +280,11 @@ bool llvm::isInstructionTriviallyDead(Instruction *I) { /// trivially dead instruction, delete it. If that makes any of its operands /// trivially dead, delete them too, recursively. Return true if any /// instructions were deleted. -bool llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V) { +bool +llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V, + const TargetLibraryInfo *TLI) { Instruction *I = dyn_cast<Instruction>(V); - if (!I || !I->use_empty() || !isInstructionTriviallyDead(I)) + if (!I || !I->use_empty() || !isInstructionTriviallyDead(I, TLI)) return false; SmallVector<Instruction*, 16> DeadInsts; @@ -301,7 +305,7 @@ bool llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V) { // operand, and if it is 'trivially' dead, delete it in a future loop // iteration. if (Instruction *OpI = dyn_cast<Instruction>(OpV)) - if (isInstructionTriviallyDead(OpI)) + if (isInstructionTriviallyDead(OpI, TLI)) DeadInsts.push_back(OpI); } @@ -334,19 +338,20 @@ static bool areAllUsesEqual(Instruction *I) { /// either forms a cycle or is terminated by a trivially dead instruction, /// delete it. If that makes any of its operands trivially dead, delete them /// too, recursively. Return true if a change was made. 
-bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN) { +bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN, + const TargetLibraryInfo *TLI) { SmallPtrSet<Instruction*, 4> Visited; for (Instruction *I = PN; areAllUsesEqual(I) && !I->mayHaveSideEffects(); I = cast<Instruction>(*I->use_begin())) { if (I->use_empty()) - return RecursivelyDeleteTriviallyDeadInstructions(I); + return RecursivelyDeleteTriviallyDeadInstructions(I, TLI); // If we find an instruction more than once, we're on a cycle that // won't prove fruitful. if (!Visited.insert(I)) { // Break the cycle and delete the instruction and its operands. I->replaceAllUsesWith(UndefValue::get(I->getType())); - (void)RecursivelyDeleteTriviallyDeadInstructions(I); + (void)RecursivelyDeleteTriviallyDeadInstructions(I, TLI); return true; } } @@ -358,7 +363,8 @@ bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN) { /// /// This returns true if it changed the code, note that it can delete /// instructions in other blocks as well in this block. -bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const TargetData *TD) { +bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const TargetData *TD, + const TargetLibraryInfo *TLI) { bool MadeChange = false; #ifndef NDEBUG @@ -381,7 +387,7 @@ bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const TargetData *TD) { continue; } - MadeChange |= RecursivelyDeleteTriviallyDeadInstructions(Inst); + MadeChange |= RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI); if (BIHandle != BI) BI = BB->begin(); } diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index 518df7c..32d7fa1 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -22,6 +22,7 @@ #include "llvm/LLVMContext.h" #include "llvm/MDBuilder.h" #include "llvm/Metadata.h" +#include "llvm/Module.h" #include "llvm/Operator.h" #include "llvm/Type.h" #include "llvm/ADT/DenseMap.h" @@ -54,6 +55,7 @@ DupRet("simplifycfg-dup-ret", cl::Hidden, cl::init(false), cl::desc("Duplicate return instructions into unconditional branches")); STATISTIC(NumSpeculations, "Number of speculative executed instructions"); +STATISTIC(NumLookupTables, "Number of switch instructions turned into lookup tables"); namespace { /// ValueEqualityComparisonCase - Represents a case of a switch. @@ -101,14 +103,14 @@ public: /// static bool SafeToMergeTerminators(TerminatorInst *SI1, TerminatorInst *SI2) { if (SI1 == SI2) return false; // Can't merge with self! - + // It is not safe to merge these two switch instructions if they have a common // successor, and if that successor has a PHI node, and if *that* PHI node has // conflicting incoming values from the two switch blocks. 
BasicBlock *SI1BB = SI1->getParent(); BasicBlock *SI2BB = SI2->getParent(); SmallPtrSet<BasicBlock*, 16> SI1Succs(succ_begin(SI1BB), succ_end(SI1BB)); - + for (succ_iterator I = succ_begin(SI2BB), E = succ_end(SI2BB); I != E; ++I) if (SI1Succs.count(*I)) for (BasicBlock::iterator BBI = (*I)->begin(); @@ -118,7 +120,7 @@ static bool SafeToMergeTerminators(TerminatorInst *SI1, TerminatorInst *SI2) { PN->getIncomingValueForBlock(SI2BB)) return false; } - + return true; } @@ -135,7 +137,7 @@ static bool isProfitableToFoldUnconditional(BranchInst *SI1, assert(SI1->isUnconditional() && SI2->isConditional()); // We fold the unconditional branch if we can easily update all PHI nodes in - // common successors: + // common successors: // 1> We have a constant incoming value for the conditional branch; // 2> We have "Cond" as the incoming value for the unconditional branch; // 3> SI2->getCondition() and Cond have same operands. @@ -170,7 +172,7 @@ static bool isProfitableToFoldUnconditional(BranchInst *SI1, static void AddPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred, BasicBlock *ExistPred) { if (!isa<PHINode>(Succ->begin())) return; // Quick exit if nothing to do - + PHINode *PN; for (BasicBlock::iterator I = Succ->begin(); (PN = dyn_cast<PHINode>(I)); ++I) @@ -222,7 +224,7 @@ static Value *GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue, // doesn't dominate BB. if (Pred2->getSinglePredecessor() == 0) return 0; - + // If we found a conditional branch predecessor, make sure that it branches // to BB and Pred2Br. If it doesn't, this isn't an "if statement". if (Pred1Br->getSuccessor(0) == BB && @@ -252,7 +254,7 @@ static Value *GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue, // Otherwise, if this is a conditional branch, then we can use it! BranchInst *BI = dyn_cast<BranchInst>(CommonPred->getTerminator()); if (BI == 0) return 0; - + assert(BI->isConditional() && "Two successors but not conditional?"); if (BI->getSuccessor(0) == Pred1) { IfTrue = Pred1; @@ -345,7 +347,7 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, // If we aren't allowing aggressive promotion anymore, then don't consider // instructions in the 'if region'. if (AggressiveInsts == 0) return false; - + // If we have seen this instruction before, don't count it again. if (AggressiveInsts->count(I)) return true; @@ -411,7 +413,7 @@ GatherConstantCompares(Value *V, std::vector<ConstantInt*> &Vals, Value *&Extra, const TargetData *TD, bool isEQ, unsigned &UsedICmps) { Instruction *I = dyn_cast<Instruction>(V); if (I == 0) return 0; - + // If this is an icmp against a constant, handle this as one of the cases. if (ICmpInst *ICI = dyn_cast<ICmpInst>(I)) { if (ConstantInt *C = GetConstantInt(I->getOperand(1), TD)) { @@ -420,21 +422,21 @@ GatherConstantCompares(Value *V, std::vector<ConstantInt*> &Vals, Value *&Extra, Vals.push_back(C); return I->getOperand(0); } - + // If we have "x ult 3" comparison, for example, then we can add 0,1,2 to // the set. ConstantRange Span = ConstantRange::makeICmpRegion(ICI->getPredicate(), C->getValue()); - + // If this is an and/!= check then we want to optimize "x ugt 2" into // x != 0 && x != 1. if (!isEQ) Span = Span.inverse(); - + // If there are a ton of values, we don't want to make a ginormous switch. 
if (Span.getSetSize().ugt(8) || Span.isEmptySet()) return 0; - + for (APInt Tmp = Span.getLower(); Tmp != Span.getUpper(); ++Tmp) Vals.push_back(ConstantInt::get(V->getContext(), Tmp)); UsedICmps++; @@ -442,11 +444,11 @@ GatherConstantCompares(Value *V, std::vector<ConstantInt*> &Vals, Value *&Extra, } return 0; } - + // Otherwise, we can only handle an | or &, depending on isEQ. if (I->getOpcode() != (isEQ ? Instruction::Or : Instruction::And)) return 0; - + unsigned NumValsBeforeLHS = Vals.size(); unsigned UsedICmpsBeforeLHS = UsedICmps; if (Value *LHS = GatherConstantCompares(I->getOperand(0), Vals, Extra, TD, @@ -467,12 +469,12 @@ GatherConstantCompares(Value *V, std::vector<ConstantInt*> &Vals, Value *&Extra, Extra = I->getOperand(1); return LHS; } - + Vals.resize(NumValsBeforeLHS); UsedICmps = UsedICmpsBeforeLHS; return 0; } - + // If the LHS can't be folded in, but Extra is available and RHS can, try to // use LHS as Extra. if (Extra == 0 || Extra == I->getOperand(0)) { @@ -484,7 +486,7 @@ GatherConstantCompares(Value *V, std::vector<ConstantInt*> &Vals, Value *&Extra, assert(Vals.size() == NumValsBeforeLHS); Extra = OldExtra; } - + return 0; } @@ -615,6 +617,9 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, assert(ThisVal && "This isn't a value comparison!!"); if (ThisVal != PredVal) return false; // Different predicates. + // TODO: Preserve branch weight metadata, similarly to how + // FoldValueComparisonIntoPredecessors preserves it. + // Find out information about when control will move from Pred to TI's block. std::vector<ValueEqualityComparisonCase> PredCases; BasicBlock *PredDef = GetValueEqualityComparisonCases(Pred->getTerminator(), @@ -634,7 +639,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, // can simplify TI. if (!ValuesOverlap(PredCases, ThisCases)) return false; - + if (isa<BranchInst>(TI)) { // Okay, one of the successors of this condbr is dead. Convert it to a // uncond br. @@ -652,7 +657,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, EraseTerminatorInstAndDCECond(TI); return true; } - + SwitchInst *SI = cast<SwitchInst>(TI); // Okay, TI has cases that are statically dead, prune them away. SmallPtrSet<Constant*, 16> DeadCases; @@ -673,7 +678,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, DEBUG(dbgs() << "Leaving: " << *TI << "\n"); return true; } - + // Otherwise, TI's block must correspond to some matched value. Find out // which value (or set of values) this is. ConstantInt *TIV = 0; @@ -729,8 +734,8 @@ namespace { } static int ConstantIntSortPredicate(const void *P1, const void *P2) { - const ConstantInt *LHS = *(const ConstantInt**)P1; - const ConstantInt *RHS = *(const ConstantInt**)P2; + const ConstantInt *LHS = *(const ConstantInt*const*)P1; + const ConstantInt *RHS = *(const ConstantInt*const*)P2; if (LHS->getValue().ult(RHS->getValue())) return 1; if (LHS->getValue() == RHS->getValue()) @@ -738,6 +743,67 @@ static int ConstantIntSortPredicate(const void *P1, const void *P2) { return -1; } +static inline bool HasBranchWeights(const Instruction* I) { + MDNode* ProfMD = I->getMetadata(LLVMContext::MD_prof); + if (ProfMD && ProfMD->getOperand(0)) + if (MDString* MDS = dyn_cast<MDString>(ProfMD->getOperand(0))) + return MDS->getString().equals("branch_weights"); + + return false; +} + +/// Tries to get a branch weight for the given instruction, returns NULL if it +/// can't. Pos starts at 0. 
+static ConstantInt* GetWeight(Instruction* I, int Pos) { + MDNode* ProfMD = I->getMetadata(LLVMContext::MD_prof); + if (ProfMD && ProfMD->getOperand(0)) { + if (MDString* MDS = dyn_cast<MDString>(ProfMD->getOperand(0))) { + if (MDS->getString().equals("branch_weights")) { + assert(ProfMD->getNumOperands() >= 3); + return dyn_cast<ConstantInt>(ProfMD->getOperand(1 + Pos)); + } + } + } + + return 0; +} + +/// Scale the given weights based on the successor TI's metadata. Scaling is +/// done by multiplying every weight by the sum of the successor's weights. +static void ScaleWeights(Instruction* STI, MutableArrayRef<uint64_t> Weights) { + // Sum the successor's weights + assert(HasBranchWeights(STI)); + unsigned Scale = 0; + MDNode* ProfMD = STI->getMetadata(LLVMContext::MD_prof); + for (unsigned i = 1; i < ProfMD->getNumOperands(); ++i) { + ConstantInt* CI = dyn_cast<ConstantInt>(ProfMD->getOperand(i)); + assert(CI); + Scale += CI->getValue().getZExtValue(); + } + + // Skip default, as it's replaced during the folding + for (unsigned i = 1; i < Weights.size(); ++i) { + Weights[i] *= Scale; + } +} + +/// Sees if any of the weights are too big for a uint32_t, and halves all the +/// weights if any are. +static void FitWeights(MutableArrayRef<uint64_t> Weights) { + bool Halve = false; + for (unsigned i = 0; i < Weights.size(); ++i) + if (Weights[i] > UINT_MAX) { + Halve = true; + break; + } + + if (! Halve) + return; + + for (unsigned i = 0; i < Weights.size(); ++i) + Weights[i] /= 2; +} + /// FoldValueComparisonIntoPredecessors - The specified terminator is a value /// equality comparison instruction (either a switch or a branch on "X == c"). /// See if any of the predecessors of the terminator block are value comparisons @@ -770,6 +836,55 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, // build. SmallVector<BasicBlock*, 8> NewSuccessors; + // Update the branch weight metadata along the way + SmallVector<uint64_t, 8> Weights; + uint64_t PredDefaultWeight = 0; + bool PredHasWeights = HasBranchWeights(PTI); + bool SuccHasWeights = HasBranchWeights(TI); + + if (PredHasWeights) { + MDNode* MD = PTI->getMetadata(LLVMContext::MD_prof); + assert(MD); + for (unsigned i = 1, e = MD->getNumOperands(); i < e; ++i) { + ConstantInt* CI = dyn_cast<ConstantInt>(MD->getOperand(i)); + assert(CI); + Weights.push_back(CI->getValue().getZExtValue()); + } + + // If the predecessor is a conditional eq, then swap the default weight + // to be the first entry. 
+ if (BranchInst* BI = dyn_cast<BranchInst>(PTI)) { + assert(Weights.size() == 2); + ICmpInst *ICI = cast<ICmpInst>(BI->getCondition()); + + if (ICI->getPredicate() == ICmpInst::ICMP_EQ) { + std::swap(Weights.front(), Weights.back()); + } + } + + PredDefaultWeight = Weights.front(); + } else if (SuccHasWeights) { + // If there are no predecessor weights but there are successor weights, + // populate Weights with 1, which will later be scaled to the sum of + // successor's weights + Weights.assign(1 + PredCases.size(), 1); + PredDefaultWeight = 1; + } + + uint64_t SuccDefaultWeight = 0; + if (SuccHasWeights) { + int Index = 0; + if (BranchInst* BI = dyn_cast<BranchInst>(TI)) { + ICmpInst* ICI = dyn_cast<ICmpInst>(BI->getCondition()); + assert(ICI); + + if (ICI->getPredicate() == ICmpInst::ICMP_EQ) + Index = 1; + } + + SuccDefaultWeight = GetWeight(TI, Index)->getValue().getZExtValue(); + } + if (PredDefault == BB) { // If this is the default destination from PTI, only the edges in TI // that don't occur in PTI, or that branch to BB will be activated. @@ -780,6 +895,12 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, else { // The default destination is BB, we don't need explicit targets. std::swap(PredCases[i], PredCases.back()); + + if (PredHasWeights) { + std::swap(Weights[i+1], Weights.back()); + Weights.pop_back(); + } + PredCases.pop_back(); --i; --e; } @@ -790,14 +911,34 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, PredDefault = BBDefault; NewSuccessors.push_back(BBDefault); } + + if (SuccHasWeights) { + ScaleWeights(TI, Weights); + Weights.front() *= SuccDefaultWeight; + } else if (PredHasWeights) { + Weights.front() /= (1 + BBCases.size()); + } + for (unsigned i = 0, e = BBCases.size(); i != e; ++i) if (!PTIHandled.count(BBCases[i].Value) && BBCases[i].Dest != BBDefault) { PredCases.push_back(BBCases[i]); NewSuccessors.push_back(BBCases[i].Dest); + if (SuccHasWeights) { + Weights.push_back(PredDefaultWeight * + GetWeight(TI, i)->getValue().getZExtValue()); + } else if (PredHasWeights) { + // Split the old default's weight amongst the children + Weights.push_back(PredDefaultWeight / (1 + BBCases.size())); + } } } else { + // FIXME: preserve branch weight metadata, similarly to the 'then' + // above. For now, drop it. + PredHasWeights = false; + SuccHasWeights = false; + // If this is not the default destination from PSI, only the edges // in SI that occur in PSI with a destination of BB will be // activated. @@ -822,7 +963,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, // If there are any constants vectored to BB that TI doesn't handle, // they must go to the default destination of TI. - for (std::set<ConstantInt*, ConstantIntOrdering>::iterator I = + for (std::set<ConstantInt*, ConstantIntOrdering>::iterator I = PTIHandled.begin(), E = PTIHandled.end(); I != E; ++I) { PredCases.push_back(ValueEqualityComparisonCase(*I, BBDefault)); @@ -851,6 +992,17 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, for (unsigned i = 0, e = PredCases.size(); i != e; ++i) NewSI->addCase(PredCases[i].Value, PredCases[i].Dest); + if (PredHasWeights || SuccHasWeights) { + // Halve the weights if any of them cannot fit in an uint32_t + FitWeights(Weights); + + SmallVector<uint32_t, 8> MDWeights(Weights.begin(), Weights.end()); + + NewSI->setMetadata(LLVMContext::MD_prof, + MDBuilder(BB->getContext()). 
+ createBranchWeights(MDWeights)); + } + EraseTerminatorInstAndDCECond(PTI); // Okay, last check. If BB is still a successor of PSI, then we must @@ -984,11 +1136,11 @@ HoistTerminator: Value *BB1V = PN->getIncomingValueForBlock(BB1); Value *BB2V = PN->getIncomingValueForBlock(BB2); if (BB1V == BB2V) continue; - + // These values do not agree. Insert a select instruction before NT // that determines the right value. SelectInst *&SI = InsertedSelects[std::make_pair(BB1V, BB2V)]; - if (SI == 0) + if (SI == 0) SI = cast<SelectInst> (Builder.CreateSelect(BI->getCondition(), BB1V, BB2V, BB1V->getName()+"."+BB2V->getName())); @@ -1056,7 +1208,7 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *BB1) { // Do not hoist the instruction if any of its operands are defined but not // used in this BB. The transformation will prevent the operand from // being sunk into the use block. - for (User::op_iterator i = HInst->op_begin(), e = HInst->op_end(); + for (User::op_iterator i = HInst->op_begin(), e = HInst->op_end(); i != e; ++i) { Instruction *OpI = dyn_cast<Instruction>(*i); if (OpI && OpI->getParent() == BIParent && @@ -1112,7 +1264,7 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *BB1) { // as well. if (PHIs.empty()) return false; - + // If we get here, we can hoist the instruction and if-convert. DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *BB1 << "\n";); @@ -1162,13 +1314,13 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *BB1) { static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) { BranchInst *BI = cast<BranchInst>(BB->getTerminator()); unsigned Size = 0; - + for (BasicBlock::iterator BBI = BB->begin(); &*BBI != BI; ++BBI) { if (isa<DbgInfoIntrinsic>(BBI)) continue; if (Size > 10) return false; // Don't clone large BB's. ++Size; - + // We can only support instructions that do not define values that are // live outside of the current basic block. for (Value::use_iterator UI = BBI->use_begin(), E = BBI->use_end(); @@ -1176,7 +1328,7 @@ static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) { Instruction *U = cast<Instruction>(*UI); if (U->getParent() != BB || isa<PHINode>(U)) return false; } - + // Looks ok, continue checking. } @@ -1194,31 +1346,31 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const TargetData *TD) { // outside of the block. if (!PN || PN->getParent() != BB || !PN->hasOneUse()) return false; - + // Degenerate case of a single entry PHI. if (PN->getNumIncomingValues() == 1) { FoldSingleEntryPHINodes(PN->getParent()); - return true; + return true; } // Now we know that this block has multiple preds and two succs. if (!BlockIsSimpleEnoughToThreadThrough(BB)) return false; - + // Okay, this is a simple enough basic block. See if any phi values are // constants. for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { ConstantInt *CB = dyn_cast<ConstantInt>(PN->getIncomingValue(i)); if (CB == 0 || !CB->getType()->isIntegerTy(1)) continue; - + // Okay, we now know that all edges from PredBB should be revectored to // branch to RealDest. BasicBlock *PredBB = PN->getIncomingBlock(i); BasicBlock *RealDest = BI->getSuccessor(!CB->getZExtValue()); - + if (RealDest == BB) continue; // Skip self loops. // Skip if the predecessor's terminator is an indirect branch. if (isa<IndirectBrInst>(PredBB->getTerminator())) continue; - + // The dest block might have PHI nodes, other predecessors and other // difficult cases. 
Instead of being smart about this, just insert a new // block that jumps to the destination block, effectively splitting @@ -1227,7 +1379,7 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const TargetData *TD) { RealDest->getName()+".critedge", RealDest->getParent(), RealDest); BranchInst::Create(RealDest, EdgeBB); - + // Update PHI nodes. AddPredecessorToBlock(RealDest, EdgeBB, BB); @@ -1244,7 +1396,7 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const TargetData *TD) { // Clone the instruction. Instruction *N = BBI->clone(); if (BBI->hasName()) N->setName(BBI->getName()+".c"); - + // Update operands due to translation. for (User::op_iterator i = N->op_begin(), e = N->op_end(); i != e; ++i) { @@ -1252,7 +1404,7 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const TargetData *TD) { if (PI != TranslateMap.end()) *i = PI->second; } - + // Check for trivial simplification. if (Value *V = SimplifyInstruction(N, TD)) { TranslateMap[BBI] = V; @@ -1297,7 +1449,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) { // Don't bother if the branch will be constant folded trivially. isa<ConstantInt>(IfCond)) return false; - + // Okay, we found that we can merge this two-entry phi node into a select. // Doing so would require us to fold *all* two entry phi nodes in this block. // At some point this becomes non-profitable (particularly if the target @@ -1307,14 +1459,14 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) { for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ++NumPhis, ++I) if (NumPhis > 2) return false; - + // Loop over the PHI's seeing if we can promote them all to select // instructions. While we are at it, keep track of the instructions // that need to be moved to the dominating block. SmallPtrSet<Instruction*, 4> AggressiveInsts; unsigned MaxCostVal0 = PHINodeFoldingThreshold, MaxCostVal1 = PHINodeFoldingThreshold; - + for (BasicBlock::iterator II = BB->begin(); isa<PHINode>(II);) { PHINode *PN = cast<PHINode>(II++); if (Value *V = SimplifyInstruction(PN, TD)) { @@ -1322,19 +1474,19 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) { PN->eraseFromParent(); continue; } - + if (!DominatesMergePoint(PN->getIncomingValue(0), BB, &AggressiveInsts, MaxCostVal0) || !DominatesMergePoint(PN->getIncomingValue(1), BB, &AggressiveInsts, MaxCostVal1)) return false; } - + // If we folded the first phi, PN dangles at this point. Refresh it. If // we ran out of PHIs then we simplified them all. PN = dyn_cast<PHINode>(BB->begin()); if (PN == 0) return true; - + // Don't fold i1 branches on PHIs which contain binary operators. These can // often be turned into switches and other things. if (PN->getType()->isIntegerTy(1) && @@ -1342,7 +1494,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) { isa<BinaryOperator>(PN->getIncomingValue(1)) || isa<BinaryOperator>(IfCond))) return false; - + // If all PHI nodes are promotable, check to make sure that all // instructions in the predecessor blocks can be promoted as well. If // not, we won't be able to get rid of the control flow, so it's not // worth promoting to select instructions. BasicBlock *DomBlock = 0; @@ -1362,7 +1514,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) { return false; } } - + if (cast<BranchInst>(IfBlock2->getTerminator())->isConditional()) { IfBlock2 = 0; } else { @@ -1375,15 +1527,15 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) { return false; } } - + DEBUG(dbgs() << "FOUND IF CONDITION!
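FoldCondBranchOnPHI, whose body continues above, fires when the branch condition is a phi with constant i1 entries: on any edge that contributes a constant, the branch outcome is already decided, so that predecessor can be routed straight to the taken successor through a fresh edge block. A source-level analogue, with a hypothetical compute() standing in for the non-constant phi input:

static bool compute() { return false; }  // placeholder for the unknown input

int before(bool p) {
  bool c;
  if (p) c = true;        // constant incoming value on this edge
  else   c = compute();
  if (c) return 1;        // branch on the phi
  return 0;
}

int after(bool p) {
  if (p) return 1;        // the constant-true edge is threaded through
  return compute() ? 1 : 0;
}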
" << *IfCond << " T: " << IfTrue->getName() << " F: " << IfFalse->getName() << "\n"); - + // If we can still promote the PHI nodes after this gauntlet of tests, // do all of the PHI's now. Instruction *InsertPt = DomBlock->getTerminator(); IRBuilder<true, NoFolder> Builder(InsertPt); - + // Move all 'aggressive' instructions, which are defined in the // conditional parts of the if's up to the dominating block. if (IfBlock1) @@ -1394,19 +1546,19 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) { DomBlock->getInstList().splice(InsertPt, IfBlock2->getInstList(), IfBlock2->begin(), IfBlock2->getTerminator()); - + while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) { // Change the PHI node into a select instruction. Value *TrueVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfFalse); Value *FalseVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfTrue); - - SelectInst *NV = + + SelectInst *NV = cast<SelectInst>(Builder.CreateSelect(IfCond, TrueVal, FalseVal, "")); PN->replaceAllUsesWith(NV); NV->takeName(PN); PN->eraseFromParent(); } - + // At this point, IfBlock1 and IfBlock2 are both empty, so our if statement // has been flattened. Change DomBlock to jump directly to our new block to // avoid other simplifycfg's kicking in on the diamond. @@ -1420,14 +1572,14 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) { /// SimplifyCondBranchToTwoReturns - If we found a conditional branch that goes /// to two returning blocks, try to merge them together into one return, /// introducing a select if the return values disagree. -static bool SimplifyCondBranchToTwoReturns(BranchInst *BI, +static bool SimplifyCondBranchToTwoReturns(BranchInst *BI, IRBuilder<> &Builder) { assert(BI->isConditional() && "Must be a conditional branch"); BasicBlock *TrueSucc = BI->getSuccessor(0); BasicBlock *FalseSucc = BI->getSuccessor(1); ReturnInst *TrueRet = cast<ReturnInst>(TrueSucc->getTerminator()); ReturnInst *FalseRet = cast<ReturnInst>(FalseSucc->getTerminator()); - + // Check to ensure both blocks are empty (just a return) or optionally empty // with PHI nodes. If there are other instructions, merging would cause extra // computation on one path or the other. @@ -1447,12 +1599,12 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI, EraseTerminatorInstAndDCECond(BI); return true; } - + // Otherwise, figure out what the true and false return values are // so we can insert a new select instruction. Value *TrueValue = TrueRet->getReturnValue(); Value *FalseValue = FalseRet->getReturnValue(); - + // Unwrap any PHI nodes in the return blocks. if (PHINode *TVPN = dyn_cast_or_null<PHINode>(TrueValue)) if (TVPN->getParent() == TrueSucc) @@ -1460,7 +1612,7 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI, if (PHINode *FVPN = dyn_cast_or_null<PHINode>(FalseValue)) if (FVPN->getParent() == FalseSucc) FalseValue = FVPN->getIncomingValueForBlock(BI->getParent()); - + // In order for this transformation to be safe, we must be able to // unconditionally execute both operands to the return. This is // normally the case, but we could have a potentially-trapping @@ -1472,12 +1624,12 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI, if (ConstantExpr *FCV = dyn_cast_or_null<ConstantExpr>(FalseValue)) if (FCV->canTrap()) return false; - + // Okay, we collected all the mapped values and checked them for sanity, and // decided to really do this transformation. First, update the CFG.
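What FoldTwoEntryPHINode is driving at, written by hand at the source level: the two side blocks of an if/else diamond are spliced into the dominating block and every merge phi becomes a select. (The pass only does this when the moved instructions are cheap enough to pass the DominatesMergePoint cost checks above.)

int before(bool c, int a, int b) {
  int x, y;
  if (c) { x = a + 1; y = a; }        // "then" side
  else   { x = b;     y = b + 2; }    // "else" side
  return x + y;                       // x and y are two-entry phis here
}

int after(bool c, int a, int b) {
  int xt = a + 1, yt = a;             // both sides now run unconditionally
  int xf = b,     yf = b + 2;
  int x = c ? xt : xf;                // one select per former phi
  int y = c ? yt : yf;
  return x + y;
}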
TrueSucc->removePredecessor(BI->getParent()); FalseSucc->removePredecessor(BI->getParent()); - + // Insert select instructions where needed. Value *BrCond = BI->getCondition(); if (TrueValue) { @@ -1491,15 +1643,15 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI, } } - Value *RI = !TrueValue ? + Value *RI = !TrueValue ? Builder.CreateRetVoid() : Builder.CreateRet(TrueValue); (void) RI; - + DEBUG(dbgs() << "\nCHANGING BRANCH TO TWO RETURNS INTO SELECT:" << "\n " << *BI << "NewRet = " << *RI << "TRUEBLOCK: " << *TrueSucc << "FALSEBLOCK: "<< *FalseSucc); - + EraseTerminatorInstAndDCECond(BI); return true; @@ -1600,7 +1752,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { if (Cond == 0) return false; } - + if (Cond == 0 || (!isa<CmpInst>(Cond) && !isa<BinaryOperator>(Cond)) || Cond->getParent() != BB || !Cond->hasOneUse()) return false; @@ -1623,7 +1775,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { isSafeToSpeculativelyExecute(FrontIt)) { BonusInst = &*FrontIt; ++FrontIt; - + // Ignore dbg intrinsics. while (isa<DbgInfoIntrinsic>(FrontIt)) ++FrontIt; } @@ -1631,13 +1783,13 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { // Only a single bonus inst is allowed. if (&*FrontIt != Cond) return false; - + // Make sure the instruction after the condition is the cond branch. BasicBlock::iterator CondIt = Cond; ++CondIt; // Ignore dbg intrinsics. while (isa<DbgInfoIntrinsic>(CondIt)) ++CondIt; - + if (&*CondIt != BI) return false; @@ -1649,7 +1801,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Cond->getOperand(1))) if (CE->canTrap()) return false; - + // Finally, don't infinitely unroll conditional loops. BasicBlock *TrueDest = BI->getSuccessor(0); BasicBlock *FalseDest = (BI->isConditional()) ? BI->getSuccessor(1) : 0; @@ -1659,22 +1811,22 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { BasicBlock *PredBlock = *PI; BranchInst *PBI = dyn_cast<BranchInst>(PredBlock->getTerminator()); - + // Check that we have two conditional branches. If there is a PHI node in // the common successor, verify that the same value flows in from both // blocks. SmallVector<PHINode*, 4> PHIs; if (PBI == 0 || PBI->isUnconditional() || - (BI->isConditional() && + (BI->isConditional() && !SafeToMergeTerminators(BI, PBI)) || (!BI->isConditional() && !isProfitableToFoldUnconditional(BI, PBI, Cond, PHIs))) continue; - + // Determine if the two branches share a common destination. Instruction::BinaryOps Opc; bool InvertPredCond = false; - + if (BI->isConditional()) { if (PBI->getSuccessor(0) == TrueDest) Opc = Instruction::Or; @@ -1693,7 +1845,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { // Ensure that any values used in the bonus instruction are also used // by the terminator of the predecessor. This means that those values - // must already have been resolved, so we won't be inhibiting the + // must already have been resolved, so we won't be inhibiting the // out-of-order core by speculating them earlier. if (BonusInst) { // Collect the values used by the bonus inst @@ -1707,47 +1859,47 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { SmallVector<std::pair<Value*, unsigned>, 4> Worklist; Worklist.push_back(std::make_pair(PBI->getOperand(0), 0)); - + // Walk up to four levels back up the use-def chain of the predecessor's // terminator to see if all those values were used.
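SimplifyCondBranchToTwoReturns, completed just above, has an equally small source-level picture: a conditional branch whose two targets do nothing but return collapses to a single return of a select (the canTrap checks guard the now-unconditional evaluation of both values):

int before(bool c, int a, int b) {
  if (c) return a;
  return b;
}

int after(bool c, int a, int b) {
  return c ? a : b;   // one ret of a select; both return blocks go away
}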
The choice of four // levels is arbitrary, to provide a compile-time-cost bound. while (!Worklist.empty()) { std::pair<Value*, unsigned> Pair = Worklist.back(); Worklist.pop_back(); - + if (Pair.second >= 4) continue; UsedValues.erase(Pair.first); if (UsedValues.empty()) break; - + if (Instruction *I = dyn_cast<Instruction>(Pair.first)) { for (Instruction::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) Worklist.push_back(std::make_pair(OI->get(), Pair.second+1)); - } + } } - + if (!UsedValues.empty()) return false; } DEBUG(dbgs() << "FOLDING BRANCH TO COMMON DEST:\n" << *PBI << *BB); - IRBuilder<> Builder(PBI); + IRBuilder<> Builder(PBI); // If we need to invert the condition in the pred block to match, do so now. if (InvertPredCond) { Value *NewCond = PBI->getCondition(); - + if (NewCond->hasOneUse() && isa<CmpInst>(NewCond)) { CmpInst *CI = cast<CmpInst>(NewCond); CI->setPredicate(CI->getInversePredicate()); } else { - NewCond = Builder.CreateNot(NewCond, + NewCond = Builder.CreateNot(NewCond, PBI->getCondition()->getName()+".not"); } - + PBI->setCondition(NewCond); PBI->swapSuccessors(); } - + // If we have a bonus inst, clone it into the predecessor block. Instruction *NewBonus = 0; if (BonusInst) { @@ -1756,7 +1908,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { NewBonus->takeName(BonusInst); BonusInst->setName(BonusInst->getName()+".old"); } - + // Clone Cond into the predecessor basic block, and or/and the // two conditions together. Instruction *New = Cond->clone(); @@ -1764,9 +1916,9 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { PredBlock->getInstList().insert(PBI, New); New->takeName(Cond); Cond->setName(New->getName()+".old"); - + if (BI->isConditional()) { - Instruction *NewCond = + Instruction *NewCond = cast<Instruction>(Builder.CreateBinOp(Opc, PBI->getCondition(), New, "or.cond")); PBI->setCondition(NewCond); @@ -1806,7 +1958,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { // Create (PBI_Cond and BI_Value) or (!PBI_Cond and PBI_C) // PBI_C is true: (PBI_Cond and BI_Value) or (!PBI_Cond) // is false: PBI_Cond and BI_Value - MergedCond = + MergedCond = cast<Instruction>(Builder.CreateBinOp(Instruction::And, PBI->getCondition(), New, "and.cond")); @@ -1814,7 +1966,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { Instruction *NotCond = cast<Instruction>(Builder.CreateNot(PBI->getCondition(), "not.cond")); - MergedCond = + MergedCond = cast<Instruction>(Builder.CreateBinOp(Instruction::Or, NotCond, MergedCond, "or.cond")); @@ -1921,7 +2073,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) if (isa<DbgInfoIntrinsic>(*I)) I->clone()->insertBefore(PBI); - + return true; } return false; @@ -1936,7 +2088,7 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { BasicBlock *BB = BI->getParent(); // If this block ends with a branch instruction, and if there is a - // predecessor that ends on a branch of the same condition, make + // predecessor that ends on a branch of the same condition, make // this conditional branch redundant. if (PBI->getCondition() == BI->getCondition() && PBI->getSuccessor(0) != PBI->getSuccessor(1)) { @@ -1945,11 +2097,11 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { if (BB->getSinglePredecessor()) { // Turn this into a branch on constant. 
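Stepping back, the overall effect of FoldBranchToCommonDest, hand-written at the source level: two conditional branches sharing a destination become one branch on the combined condition. Note the bitwise '|' rather than '||' below: the second condition (and any single "bonus" instruction feeding it) is cloned into the predecessor and evaluated unconditionally, which is why the pass insists it be cheap and non-trapping. The diff then resumes with SimplifyCondBranchToCondBranch's branch-on-constant case.

int before(bool a, int x) {
  if (a) return 1;          // predecessor branch
  bool b = x > 0;           // the lone "bonus" instruction
  if (b) return 1;          // branches to the same destination
  return 0;
}

int after(bool a, int x) {
  bool b = x > 0;           // bonus inst, cloned ahead of the test
  if (a | b) return 1;      // one branch on the or'd condition
  return 0;
}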
bool CondIsTrue = PBI->getSuccessor(0) == BB; - BI->setCondition(ConstantInt::get(Type::getInt1Ty(BB->getContext()), + BI->setCondition(ConstantInt::get(Type::getInt1Ty(BB->getContext()), CondIsTrue)); return true; // Nuke the branch on constant. } - + // Otherwise, if there are multiple predecessors, insert a PHI that merges // in the constant and simplify the block result. Subsequent passes of // simplifycfg will thread the block. @@ -1969,18 +2121,18 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { PBI->getCondition() == BI->getCondition() && PBI->getSuccessor(0) != PBI->getSuccessor(1)) { bool CondIsTrue = PBI->getSuccessor(0) == BB; - NewPN->addIncoming(ConstantInt::get(Type::getInt1Ty(BB->getContext()), + NewPN->addIncoming(ConstantInt::get(Type::getInt1Ty(BB->getContext()), CondIsTrue), P); } else { NewPN->addIncoming(BI->getCondition(), P); } } - + BI->setCondition(NewPN); return true; } } - + // If this is a conditional branch in an empty block, and if any // predecessor is a conditional branch to one of our destinations, // fold the conditions into logical ops and one cond br. @@ -1991,11 +2143,11 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { if (&*BBI != BI) return false; - + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(BI->getCondition())) if (CE->canTrap()) return false; - + int PBIOp, BIOp; if (PBI->getSuccessor(0) == BI->getSuccessor(0)) PBIOp = BIOp = 0; @@ -2007,31 +2159,31 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { PBIOp = BIOp = 1; else return false; - + // Check to make sure that the other destination of this branch // isn't BB itself. If so, this is an infinite loop that will // keep getting unwound. if (PBI->getSuccessor(PBIOp) == BB) return false; - - // Do not perform this transformation if it would require + + // Do not perform this transformation if it would require // insertion of a large number of select instructions. For targets // without predication/cmovs, this is a big pessimization. BasicBlock *CommonDest = PBI->getSuccessor(PBIOp); - + unsigned NumPhis = 0; for (BasicBlock::iterator II = CommonDest->begin(); isa<PHINode>(II); ++II, ++NumPhis) if (NumPhis > 2) // Disable this xform. return false; - + // Finally, if everything is ok, fold the branches to logical ops. BasicBlock *OtherDest = BI->getSuccessor(BIOp ^ 1); - + DEBUG(dbgs() << "FOLDING BRs:" << *PBI->getParent() << "AND: " << *BI->getParent()); - - + + // If OtherDest *is* BB, then BB is a basic block with a single conditional // branch in it, where one edge (OtherDest) goes back to itself but the other // exits. We don't *know* that the program avoids the infinite loop @@ -2046,13 +2198,13 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { "infloop", BB->getParent()); BranchInst::Create(InfLoopBlock, InfLoopBlock); OtherDest = InfLoopBlock; - } - + } + DEBUG(dbgs() << *PBI->getParent()->getParent()); // BI may have other predecessors. Because of this, we leave // it alone, but modify PBI. - + // Make sure we get to CommonDest on True&True directions. Value *PBICond = PBI->getCondition(); IRBuilder<true, NoFolder> Builder(PBI); @@ -2065,16 +2217,16 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { // Merge the conditions. Value *Cond = Builder.CreateOr(PBICond, BICond, "brmerge"); - + // Modify PBI to branch on the new condition to the new dests.
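The single-predecessor case handled at the top of this hunk comes down to a successor re-testing the very condition its predecessor already branched on; in source terms (hand-written illustration):

int before(bool c) {
  if (c) {
    if (c)        // always true on this path
      return 1;
    return 2;     // dead: the inner branch folds to a constant
  }
  return 0;
}

int after(bool c) {
  if (c) return 1;
  return 0;
}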
PBI->setCondition(Cond); PBI->setSuccessor(0, CommonDest); PBI->setSuccessor(1, OtherDest); - + // OtherDest may have phi nodes. If so, add entries from PBI's // block that are identical to the entries for BI's block. AddPredecessorToBlock(OtherDest, PBI->getParent(), BB); - + // We know that the CommonDest already had an edge from PBI to // it. If it has PHIs though, the PHIs may have different // entries for BB and PBI's BB. If so, insert a select to make @@ -2092,10 +2244,10 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { PN->setIncomingValue(PBBIdx, NV); } } - + DEBUG(dbgs() << "INTO: " << *PBI->getParent()); DEBUG(dbgs() << *PBI->getParent()->getParent()); - + // This basic block is probably dead. We know it has at least // one fewer predecessor. return true; @@ -2214,7 +2366,7 @@ static bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI) { /// br label %end /// end: /// ... = phi i1 [ true, %entry ], [ %tmp, %DEFAULT ], [ true, %entry ] -/// +/// /// We prefer to split the edge to 'end' so that there is a true/false entry to /// the PHI, merging the third icmp into the switch. static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI, @@ -2228,17 +2380,17 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI, Value *V = ICI->getOperand(0); ConstantInt *Cst = cast<ConstantInt>(ICI->getOperand(1)); - + // The pattern we're looking for is where our only predecessor is a switch on // 'V' and this block is the default case for the switch. In this case we can // fold the compared value into the switch to simplify things. BasicBlock *Pred = BB->getSinglePredecessor(); if (Pred == 0 || !isa<SwitchInst>(Pred->getTerminator())) return false; - + SwitchInst *SI = cast<SwitchInst>(Pred->getTerminator()); if (SI->getCondition() != V) return false; - + // If BB is reachable on a non-default case, then we simply know the value of // V in this block. Substitute it and constant fold the icmp instruction // away. @@ -2246,7 +2398,7 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI, ConstantInt *VVal = SI->findCaseDest(BB); assert(VVal && "Should have a unique destination value"); ICI->setOperand(0, VVal); - + if (Value *V = SimplifyInstruction(ICI, TD)) { ICI->replaceAllUsesWith(V); ICI->eraseFromParent(); @@ -2254,7 +2406,7 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI, // BB is now empty, so it is likely to simplify away. return SimplifyCFG(BB) | true; } - + // Ok, the block is reachable from the default dest. If the constant we're // comparing exists in one of the other edges, then we can constant fold ICI // and zap it. @@ -2264,13 +2416,13 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI, V = ConstantInt::getFalse(BB->getContext()); else V = ConstantInt::getTrue(BB->getContext()); - + ICI->replaceAllUsesWith(V); ICI->eraseFromParent(); // BB is now empty, so it is likely to simplify away. return SimplifyCFG(BB) | true; } - + // The use of the icmp has to be in the 'end' block, by the only PHI node in // the block. BasicBlock *SuccBlock = BB->getTerminator()->getSuccessor(0); @@ -2297,7 +2449,7 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI, BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "switch.edge", BB->getParent(), BB); SI->addCase(Cst, NewBB); - + // NewBB branches to the phi block, add the uncond branch and the phi entry.
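TryToSimplifyUncondBranchWithICmpInIt, set up above, targets a switch whose default block merely re-compares the switched value against a fresh constant. A hand-written sketch of the payoff: the compare becomes a real case with its own edge into the merge phi.

int before(int v) {
  switch (v) {
  case 1: return 10;
  case 2: return 20;
  default:
    return v == 7;    // icmp eq against a constant in the default block
  }
}

int after(int v) {
  switch (v) {
  case 1: return 10;
  case 2: return 20;
  case 7: return 1;   // new case edge feeds 'true' into the phi
  default: return 0;  // default now feeds the folded 'false'
  }
}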
Builder.SetInsertPoint(NewBB); Builder.SetCurrentDebugLocation(SI->getDebugLoc()); @@ -2313,8 +2465,8 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD, IRBuilder<> &Builder) { Instruction *Cond = dyn_cast<Instruction>(BI->getCondition()); if (Cond == 0) return false; - - + + // Change br (X == 0 | X == 1), T, F into a switch instruction. // If this is a bunch of seteq's or'd together, or if it's a bunch of // 'setne's and'ed together, collect them. @@ -2323,7 +2475,7 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD, bool TrueWhenEqual = true; Value *ExtraCase = 0; unsigned UsedICmps = 0; - + if (Cond->getOpcode() == Instruction::Or) { CompVal = GatherConstantCompares(Cond, Values, ExtraCase, TD, true, UsedICmps); @@ -2332,7 +2484,7 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD, UsedICmps); TrueWhenEqual = false; } - + // If we didn't find a value compared against multiple constants, fail. if (CompVal == 0) return false; @@ -2344,21 +2496,24 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD, // instruction can't handle, remove them now. array_pod_sort(Values.begin(), Values.end(), ConstantIntSortPredicate); Values.erase(std::unique(Values.begin(), Values.end()), Values.end()); - + // If Extra was used, we require at least two switch values to do the // transformation. A switch with one value is just a cond branch. if (ExtraCase && Values.size() < 2) return false; - + + // TODO: Preserve branch weight metadata, similarly to how + // FoldValueComparisonIntoPredecessors preserves it. + // Figure out which block is which destination. BasicBlock *DefaultBB = BI->getSuccessor(1); BasicBlock *EdgeBB = BI->getSuccessor(0); if (!TrueWhenEqual) std::swap(DefaultBB, EdgeBB); - + BasicBlock *BB = BI->getParent(); - + DEBUG(dbgs() << "Converting 'icmp' chain with " << Values.size() << " cases into SWITCH. BB is:\n" << *BB); - + // If there are any extra values that couldn't be folded into the switch // then we evaluate them with an explicit branch first. Split the block // right before the condbr to handle it. @@ -2372,13 +2527,13 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD, Builder.CreateCondBr(ExtraCase, EdgeBB, NewBB); else Builder.CreateCondBr(ExtraCase, NewBB, EdgeBB); - + OldTI->eraseFromParent(); - + // If there are PHI nodes in EdgeBB, then we need to add a new entry to them // for the edge we just added. AddPredecessorToBlock(EdgeBB, BB, NewBB); - + DEBUG(dbgs() << " ** 'icmp' chain unhandled condition: " << *ExtraCase << "\nEXTRABB = " << *BB); BB = NewBB; @@ -2392,14 +2547,14 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD, TD->getIntPtrType(CompVal->getContext()), "magicptr"); } - + // Create the new switch instruction now. SwitchInst *New = Builder.CreateSwitch(CompVal, DefaultBB, Values.size()); // Add all of the 'cases' to the switch instruction. for (unsigned i = 0, e = Values.size(); i != e; ++i) New->addCase(Values[i], EdgeBB); - + // We added edges from PI to the EdgeBB. As such, if there were any // PHI nodes in EdgeBB, they need entries to be added corresponding to // the number of edges added. @@ -2410,10 +2565,10 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD, for (unsigned i = 0, e = Values.size()-1; i != e; ++i) PN->addIncoming(InVal, BB); } - + // Erase the old branch instruction.
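SimplifyBranchOnICmpChain's before/after, hand-written at the source level (the bitwise '|' mirrors the IR, where all the compares are evaluated and or'd before the single branch):

int before(int x) {
  if ((x == 0) | (x == 1) | (x == 7))
    return 1;
  return 0;
}

int after(int x) {
  switch (x) {          // one switch, one case per collected constant
  case 0: case 1: case 7:
    return 1;
  default:
    return 0;
  }
}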
EraseTerminatorInstAndDCECond(BI); - + DEBUG(dbgs() << " ** 'icmp' chain result is:\n" << *BB << '\n'); return true; } @@ -2467,7 +2622,7 @@ bool SimplifyCFGOpt::SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder) { bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) { BasicBlock *BB = RI->getParent(); if (!BB->getFirstNonPHIOrDbg()->isTerminator()) return false; - + // Find predecessors that end with branches. SmallVector<BasicBlock*, 8> UncondBranchPreds; SmallVector<BranchInst*, 8> CondBranchPreds; @@ -2481,7 +2636,7 @@ bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) { CondBranchPreds.push_back(BI); } } - + // If we found some, do the transformation! if (!UncondBranchPreds.empty() && DupRet) { while (!UncondBranchPreds.empty()) { @@ -2490,21 +2645,21 @@ bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) { << "INTO UNCOND BRANCH PRED: " << *Pred); (void)FoldReturnIntoUncondBranch(RI, BB, Pred); } - + // If we eliminated all predecessors of the block, delete the block now. if (pred_begin(BB) == pred_end(BB)) // We know there are no successors, so just nuke the block. BB->eraseFromParent(); - + return true; } - + // Check out all of the conditional branches going to this return // instruction. If any of them just select between returns, change the // branch itself into a select/return pair. while (!CondBranchPreds.empty()) { BranchInst *BI = CondBranchPreds.pop_back_val(); - + // Check to see if the non-BB successor is also a return block. if (isa<ReturnInst>(BI->getSuccessor(0)->getTerminator()) && isa<ReturnInst>(BI->getSuccessor(1)->getTerminator()) && @@ -2516,9 +2671,9 @@ bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) { bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { BasicBlock *BB = UI->getParent(); - + bool Changed = false; - + // If there are any instructions immediately before the unreachable that can // be removed, do so. while (UI != BB->begin()) { @@ -2558,11 +2713,11 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { BBI->eraseFromParent(); Changed = true; } - + // If the unreachable instruction is the first in the block, take a gander // at all of the predecessors of this instruction, and simplify them. if (&BB->front() != UI) return Changed; - + SmallVector<BasicBlock*, 8> Preds(pred_begin(BB), pred_end(BB)); for (unsigned i = 0, e = Preds.size(); i != e; ++i) { TerminatorInst *TI = Preds[i]->getTerminator(); @@ -2615,7 +2770,7 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { BasicBlock *MaxBlock = 0; for (std::map<BasicBlock*, std::pair<unsigned, unsigned> >::iterator I = Popularity.begin(), E = Popularity.end(); I != E; ++I) { - if (I->second.first > MaxPop || + if (I->second.first > MaxPop || (I->second.first == MaxPop && MaxIndex > I->second.second)) { MaxPop = I->second.first; MaxIndex = I->second.second; @@ -2627,13 +2782,13 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { // edges to it. SI->setDefaultDest(MaxBlock); Changed = true; - + // If MaxBlock has phinodes in it, remove MaxPop-1 entries from // it. if (isa<PHINode>(MaxBlock->begin())) for (unsigned i = 0; i != MaxPop-1; ++i) MaxBlock->removePredecessor(SI->getParent()); - + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) if (i.getCaseSuccessor() == MaxBlock) { @@ -2648,7 +2803,7 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { // place to note that the call does not throw though. 
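SimplifyReturn's first transformation above, sketched by hand: when return duplication (the DupRet option) is enabled, each unconditional-branch predecessor of a return block gets its own copy of the return, and the shared block dies once it has no predecessors left.

int before(int x) {
  int r;
  if (x > 0) { r = 1;  goto done; }
  if (x < 0) { r = -1; goto done; }
  r = 0;
done:
  return r;             // one return block with three predecessors
}

int after(int x) {
  if (x > 0) return 1;  // return folded into each predecessor
  if (x < 0) return -1;
  return 0;
}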
BranchInst *BI = Builder.CreateBr(II->getNormalDest()); II->removeFromParent(); // Take out of symbol table - + // Insert the call now... SmallVector<Value*, 8> Args(II->op_begin(), II->op_end()-3); Builder.SetInsertPoint(BI); @@ -2663,7 +2818,7 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { } } } - + // If this block is now dead, remove it. if (pred_begin(BB) == pred_end(BB) && BB != &BB->getParent()->getEntryBlock()) { @@ -2823,6 +2978,285 @@ static bool ForwardSwitchConditionToPHI(SwitchInst *SI) { return Changed; } +/// ValidLookupTableConstant - Return true if the backend will be able to handle +/// initializing an array of constants like C. +static bool ValidLookupTableConstant(Constant *C) { + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) + return CE->isGEPWithNoNotionalOverIndexing(); + + return isa<ConstantFP>(C) || + isa<ConstantInt>(C) || + isa<ConstantPointerNull>(C) || + isa<GlobalValue>(C) || + isa<UndefValue>(C); +} + +/// GetCaseResults - Try to determine the resulting constant values in phi +/// nodes at the common destination basic block for one of the case +/// destinations of a switch instruction. +static bool GetCaseResults(SwitchInst *SI, + BasicBlock *CaseDest, + BasicBlock **CommonDest, + SmallVector<std::pair<PHINode*,Constant*>, 4> &Res) { + // The block from which we enter the common destination. + BasicBlock *Pred = SI->getParent(); + + // If CaseDest is empty, continue to its successor. + if (CaseDest->getFirstNonPHIOrDbg() == CaseDest->getTerminator() && + !isa<PHINode>(CaseDest->begin())) { + + TerminatorInst *Terminator = CaseDest->getTerminator(); + if (Terminator->getNumSuccessors() != 1) + return false; + + Pred = CaseDest; + CaseDest = Terminator->getSuccessor(0); + } + + // If we did not have a CommonDest before, use the current one. + if (!*CommonDest) + *CommonDest = CaseDest; + // If the destination isn't the common one, abort. + if (CaseDest != *CommonDest) + return false; + + // Get the values for this case from phi nodes in the destination block. + BasicBlock::iterator I = (*CommonDest)->begin(); + while (PHINode *PHI = dyn_cast<PHINode>(I++)) { + int Idx = PHI->getBasicBlockIndex(Pred); + if (Idx == -1) + continue; + + Constant *ConstVal = dyn_cast<Constant>(PHI->getIncomingValue(Idx)); + if (!ConstVal) + return false; + + // Be conservative about which kinds of constants we support. + if (!ValidLookupTableConstant(ConstVal)) + return false; + + Res.push_back(std::make_pair(PHI, ConstVal)); + } + + return true; +} + +/// BuildLookupTable - Build a lookup table with the contents of Results, using +/// DefaultResult to fill the holes in the table. If the table ends up +/// containing the same result in each element, set *SingleResult to that value +/// and return NULL. +static GlobalVariable *BuildLookupTable(Module &M, + uint64_t TableSize, + ConstantInt *Offset, + const SmallVector<std::pair<ConstantInt*, Constant*>, 4>& Results, + Constant *DefaultResult, + Constant **SingleResult) { + assert(Results.size() && "Need values to build lookup table"); + assert(TableSize >= Results.size() && "Table needs to hold all values"); + + // If all values in the table are equal, this is that value. + Constant *SameResult = Results.begin()->second; + + // Build up the table contents.
+ std::vector<Constant*> TableContents(TableSize); + for (size_t I = 0, E = Results.size(); I != E; ++I) { + ConstantInt *CaseVal = Results[I].first; + Constant *CaseRes = Results[I].second; + + uint64_t Idx = (CaseVal->getValue() - Offset->getValue()).getLimitedValue(); + TableContents[Idx] = CaseRes; + + if (CaseRes != SameResult) + SameResult = NULL; + } + + // Fill in any holes in the table with the default result. + if (Results.size() < TableSize) { + for (unsigned i = 0; i < TableSize; ++i) { + if (!TableContents[i]) + TableContents[i] = DefaultResult; + } + + if (DefaultResult != SameResult) + SameResult = NULL; + } + + // Same result was used in the entire table; just return that. + if (SameResult) { + *SingleResult = SameResult; + return NULL; + } + + ArrayType *ArrayTy = ArrayType::get(DefaultResult->getType(), TableSize); + Constant *Initializer = ConstantArray::get(ArrayTy, TableContents); + + GlobalVariable *GV = new GlobalVariable(M, ArrayTy, /*constant=*/ true, + GlobalVariable::PrivateLinkage, + Initializer, + "switch.table"); + GV->setUnnamedAddr(true); + return GV; +} + +/// SwitchToLookupTable - If the switch is only used to initialize one or more +/// phi nodes in a common successor block with different constant values, +/// replace the switch with lookup tables. +static bool SwitchToLookupTable(SwitchInst *SI, + IRBuilder<> &Builder) { + assert(SI->getNumCases() > 1 && "Degenerate switch?"); + // FIXME: Handle unreachable cases. + + // FIXME: If the switch is too sparse for a lookup table, perhaps we could + // split off a dense part and build a lookup table for that. + + // FIXME: If the results are all integers and the lookup table would fit in a + // target-legal register, we should store them as a bitmap and use shift/mask + // to look up the result. + + // FIXME: This creates arrays of GEPs to constant strings, which means each + // GEP needs a runtime relocation in PIC code. We should just build one big + // string and lookup indices into that. + + // Ignore the switch if the number of cases is too small. + // This is similar to the check when building jump tables in + // SelectionDAGBuilder::handleJTSwitchCase. + // FIXME: Determine the best cut-off. + if (SI->getNumCases() < 4) + return false; + + // Figure out the corresponding result for each case value and phi node in the + // common destination, as well as the min and max case values. + assert(SI->case_begin() != SI->case_end()); + SwitchInst::CaseIt CI = SI->case_begin(); + ConstantInt *MinCaseVal = CI.getCaseValue(); + ConstantInt *MaxCaseVal = CI.getCaseValue(); + + BasicBlock *CommonDest = NULL; + typedef SmallVector<std::pair<ConstantInt*, Constant*>, 4> ResultListTy; + SmallDenseMap<PHINode*, ResultListTy> ResultLists; + SmallDenseMap<PHINode*, Constant*> DefaultResults; + SmallDenseMap<PHINode*, Type*> ResultTypes; + SmallVector<PHINode*, 4> PHIs; + + for (SwitchInst::CaseIt E = SI->case_end(); CI != E; ++CI) { + ConstantInt *CaseVal = CI.getCaseValue(); + if (CaseVal->getValue().slt(MinCaseVal->getValue())) + MinCaseVal = CaseVal; + if (CaseVal->getValue().sgt(MaxCaseVal->getValue())) + MaxCaseVal = CaseVal; + + // Resulting value at phi nodes for this case value. + typedef SmallVector<std::pair<PHINode*, Constant*>, 4> ResultsTy; + ResultsTy Results; + if (!GetCaseResults(SI, CI.getCaseSuccessor(), &CommonDest, Results)) + return false; + + // Append the result from this case to the list for each phi.
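BuildLookupTable's construction loop, restated as a standalone sketch on plain ints (illustrative names; the real code builds a ConstantArray and reports the single-result case through an out-parameter): each case result lands at its offset from the smallest case value, holes take the default, and a table whose slots all match needs no array at all.

#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

// Returns true when every slot holds the same value, in which case the
// caller can use Table[0] directly instead of emitting a table.
static bool buildTable(const std::vector<std::pair<uint64_t, int> > &Cases,
                       uint64_t MinCase, uint64_t TableSize, int DefaultRes,
                       std::vector<int> &Table) {
  Table.assign(TableSize, DefaultRes);           // holes get the default
  for (size_t i = 0, e = Cases.size(); i != e; ++i)
    Table[Cases[i].first - MinCase] = Cases[i].second;

  for (uint64_t i = 1; i < TableSize; ++i)
    if (Table[i] != Table[0])
      return false;
  return true;
}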
+ for (ResultsTy::iterator I = Results.begin(), E = Results.end(); I!=E; ++I) { + if (!ResultLists.count(I->first)) + PHIs.push_back(I->first); + ResultLists[I->first].push_back(std::make_pair(CaseVal, I->second)); + } + } + + // Get the resulting values for the default case. + SmallVector<std::pair<PHINode*, Constant*>, 4> DefaultResultsList; + if (!GetCaseResults(SI, SI->getDefaultDest(), &CommonDest, DefaultResultsList)) + return false; + for (size_t I = 0, E = DefaultResultsList.size(); I != E; ++I) { + PHINode *PHI = DefaultResultsList[I].first; + Constant *Result = DefaultResultsList[I].second; + DefaultResults[PHI] = Result; + ResultTypes[PHI] = Result->getType(); + } + + APInt RangeSpread = MaxCaseVal->getValue() - MinCaseVal->getValue(); + // The table density should be at least 40%. This is the same criterion as for + // jump tables, see SelectionDAGBuilder::handleJTSwitchCase. + // FIXME: Find the best cut-off. + // Be careful to avoid overflow in the density computation. + if (RangeSpread.zextOrSelf(64).ugt(UINT64_MAX / 4 - 1)) + return false; + uint64_t TableSize = RangeSpread.getLimitedValue() + 1; + if (SI->getNumCases() * 10 < TableSize * 4) + return false; + + // Build the lookup tables. + SmallDenseMap<PHINode*, GlobalVariable*> LookupTables; + SmallDenseMap<PHINode*, Constant*> SingleResults; + + Module &Mod = *CommonDest->getParent()->getParent(); + for (SmallVector<PHINode*, 4>::iterator I = PHIs.begin(), E = PHIs.end(); + I != E; ++I) { + PHINode *PHI = *I; + + Constant *SingleResult = NULL; + LookupTables[PHI] = BuildLookupTable(Mod, TableSize, MinCaseVal, + ResultLists[PHI], DefaultResults[PHI], + &SingleResult); + SingleResults[PHI] = SingleResult; + } + + // Create the BB that does the lookups. + BasicBlock *LookupBB = BasicBlock::Create(Mod.getContext(), + "switch.lookup", + CommonDest->getParent(), + CommonDest); + + // Check whether the condition value is within the case range, and branch to + // the new BB. + Builder.SetInsertPoint(SI); + Value *TableIndex = Builder.CreateSub(SI->getCondition(), MinCaseVal, + "switch.tableidx"); + Value *Cmp = Builder.CreateICmpULT(TableIndex, ConstantInt::get( + MinCaseVal->getType(), TableSize)); + Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest()); + + // Populate the BB that does the lookups. + Builder.SetInsertPoint(LookupBB); + bool ReturnedEarly = false; + for (SmallVector<PHINode*, 4>::iterator I = PHIs.begin(), E = PHIs.end(); + I != E; ++I) { + PHINode *PHI = *I; + // There was a single result for this phi; just use that. + if (Constant *SingleResult = SingleResults[PHI]) { + PHI->addIncoming(SingleResult, LookupBB); + continue; + } + + Value *GEPIndices[] = { Builder.getInt32(0), TableIndex }; + Value *GEP = Builder.CreateInBoundsGEP(LookupTables[PHI], GEPIndices, + "switch.gep"); + Value *Result = Builder.CreateLoad(GEP, "switch.load"); + + // If the result is only going to be used to return from the function, + // we want to do that right here. + if (PHI->hasOneUse() && isa<ReturnInst>(*PHI->use_begin())) { + if (CommonDest->getFirstNonPHIOrDbg() == CommonDest->getTerminator()) { + Builder.CreateRet(Result); + ReturnedEarly = true; + } + } + + if (!ReturnedEarly) + PHI->addIncoming(Result, LookupBB); + } + + if (!ReturnedEarly) + Builder.CreateBr(CommonDest); + + // Remove the switch.
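The profitability guard above, reduced to plain integers (a sketch mirroring the patch's checks, not shared code). The early test keeps the '+ 1' and '* 4' below from overflowing 64 bits:

#include <cstdint>

static bool worthALookupTable(uint64_t NumCases, uint64_t MinCase,
                              uint64_t MaxCase) {
  if (NumCases < 4)
    return false;                         // same cut-off as the pass
  uint64_t Spread = MaxCase - MinCase;
  if (Spread > UINT64_MAX / 4 - 1)
    return false;                         // density math would overflow
  uint64_t TableSize = Spread + 1;
  return NumCases * 10 >= TableSize * 4;  // density of at least 40%
}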
+ for (unsigned i = 0; i < SI->getNumSuccessors(); ++i) { + BasicBlock *Succ = SI->getSuccessor(i); + if (Succ == SI->getDefaultDest()) continue; + Succ->removePredecessor(SI->getParent()); + } + SI->eraseFromParent(); + + ++NumLookupTables; + return true; +} + bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { // If this switch is too complex to want to look at, ignore it. if (!isValueEqualityComparison(SI)) @@ -2862,13 +3296,16 @@ bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { if (ForwardSwitchConditionToPHI(SI)) return SimplifyCFG(BB) | true; + if (SwitchToLookupTable(SI, Builder)) + return SimplifyCFG(BB) | true; + return false; } bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) { BasicBlock *BB = IBI->getParent(); bool Changed = false; - + // Eliminate redundant destinations. SmallPtrSet<Value *, 8> Succs; for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) { @@ -2879,7 +3316,7 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) { --i; --e; Changed = true; } - } + } if (IBI->getNumDestinations() == 0) { // If the indirectbr has no successors, change it to unreachable. @@ -2887,14 +3324,14 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) { EraseTerminatorInstAndDCECond(IBI); return true; } - + if (IBI->getNumDestinations() == 1) { // If the indirectbr has one successor, change it to a direct branch. BranchInst::Create(IBI->getDestination(0), IBI); EraseTerminatorInstAndDCECond(IBI); return true; } - + if (SelectInst *SI = dyn_cast<SelectInst>(IBI->getAddress())) { if (SimplifyIndirectBrOnSelect(IBI, SI)) return SimplifyCFG(BB) | true; @@ -2904,13 +3341,13 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) { bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){ BasicBlock *BB = BI->getParent(); - + // If the Terminator is the only non-phi instruction, simplify the block. BasicBlock::iterator I = BB->getFirstNonPHIOrDbgOrLifetime(); if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() && TryToSimplifyUncondBranchFromEmptyBlock(BB)) return true; - + // If the only instruction in the block is a seteq/setne comparison // against a constant, try to simplify the block. if (ICmpInst *ICI = dyn_cast<ICmpInst>(I)) @@ -2921,7 +3358,7 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){ TryToSimplifyUncondBranchWithICmpInIt(ICI, TD, Builder)) return true; } - + // If this basic block is ONLY a compare and a branch, and if a predecessor // branches to us and our successor, fold the comparison into the // predecessor and use logical operations to update the incoming value @@ -2934,7 +3371,7 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { BasicBlock *BB = BI->getParent(); - + // Conditional branch if (isValueEqualityComparison(BI)) { // If we only have one predecessor, and if it is a branch on this value, @@ -2943,7 +3380,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { if (BasicBlock *OnlyPred = BB->getSinglePredecessor()) if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred, Builder)) return SimplifyCFG(BB) | true; - + // This block must be empty, except for the setcond inst, if it exists. // Ignore dbg intrinsics. 
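End to end, the new SwitchToLookupTable transformation turns a dense switch over constants into a bounds check plus an array load. A hand-written source-level picture (shrunk below the pass's four-case minimum for brevity; the real table is the private unnamed_addr global named switch.table, and the wrap-around of the unsigned subtraction is exactly what the ICmpULT bounds check exploits):

static const int Table[6] = {10, 20, 5, 5, 5, 30}; // holes hold the default

int before(int x) {
  switch (x) {
  case 0: return 10;
  case 1: return 20;
  case 5: return 30;
  default: return 5;
  }
}

int after(int x) {
  unsigned Idx = (unsigned)(x - 0);   // x - MinCaseVal ("switch.tableidx")
  if (Idx < 6)                        // TableSize bounds check
    return Table[Idx];                // "switch.load"
  return 5;                           // original default
}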
BasicBlock::iterator I = BB->begin(); @@ -2962,17 +3399,17 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { return SimplifyCFG(BB) | true; } } - + // Try to turn "br (X == 0 | X == 1), T, F" into a switch instruction. if (SimplifyBranchOnICmpChain(BI, TD, Builder)) return true; - + // If this basic block is ONLY a compare and a branch, and if a predecessor // branches to us and one of our successors, fold the comparison into the // predecessor and use logical operations to pick the right destination. if (FoldBranchToCommonDest(BI)) return SimplifyCFG(BB) | true; - + // We have a conditional branch to two blocks that are only reachable // from BI. We know that the condbr dominates the two blocks, so see if // there is any identical code in the "then" and "else" blocks. If so, we @@ -2999,14 +3436,14 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1))) return SimplifyCFG(BB) | true; } - + // If this is a branch on a phi node in the current block, thread control // through this block if any PHI node entries are constants. if (PHINode *PN = dyn_cast<PHINode>(BI->getCondition())) if (PN->getParent() == BI->getParent()) if (FoldCondBranchOnPHI(BI, TD)) return SimplifyCFG(BB) | true; - + // Scan predecessor blocks for conditional branches. for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator())) @@ -3114,7 +3551,7 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) { // if (MergeBlockIntoPredecessor(BB)) return true; - + IRBuilder<> Builder(BB); // If there is a trivial two-entry PHI node in this basic block, and we can diff --git a/lib/Transforms/Utils/SimplifyInstructions.cpp b/lib/Transforms/Utils/SimplifyInstructions.cpp index 81eb9e0..528e6a1 100644 --- a/lib/Transforms/Utils/SimplifyInstructions.cpp +++ b/lib/Transforms/Utils/SimplifyInstructions.cpp @@ -72,7 +72,7 @@ namespace { ++NumSimplified; Changed = true; } - Changed |= RecursivelyDeleteTriviallyDeadInstructions(I); + Changed |= RecursivelyDeleteTriviallyDeadInstructions(I, TLI); } // Place the list of instructions to simplify on the next loop iteration diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp index 62d23cb..c09dcd2 100644 --- a/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/lib/Transforms/Vectorize/BBVectorize.cpp @@ -601,7 +601,7 @@ namespace { // It is important to clean up here so that future iterations of this // function have less work to do. - (void) SimplifyInstructionsInBlock(&BB, TD); + (void) SimplifyInstructionsInBlock(&BB, TD, AA->getTargetLibraryInfo()); return true; }