author     Pirama Arumuga Nainar <pirama@google.com>  2015-04-08 08:55:49 -0700
committer  Pirama Arumuga Nainar <pirama@google.com>  2015-04-09 15:04:38 -0700
commit     4c5e43da7792f75567b693105cc53e3f1992ad98 (patch)
tree       1b2c9792582e12f5af0b1512e3094425f0dc0df9 /lib/Transforms
parent     c75239e6119d0f9a74c57099d91cbc9bde56bf33 (diff)
Update aosp/master llvm for rebase to r233350
Change-Id: I07d935f8793ee8ec6b7da003f6483046594bca49
Diffstat (limited to 'lib/Transforms')
-rw-r--r--  lib/Transforms/IPO/ArgumentPromotion.cpp | 80
-rw-r--r--  lib/Transforms/IPO/ConstantMerge.cpp | 25
-rw-r--r--  lib/Transforms/IPO/GlobalDCE.cpp | 26
-rw-r--r--  lib/Transforms/IPO/GlobalOpt.cpp | 106
-rw-r--r--  lib/Transforms/IPO/Inliner.cpp | 40
-rw-r--r--  lib/Transforms/IPO/LowerBitSets.cpp | 276
-rw-r--r--  lib/Transforms/IPO/MergeFunctions.cpp | 54
-rw-r--r--  lib/Transforms/IPO/PassManagerBuilder.cpp | 48
-rw-r--r--  lib/Transforms/InstCombine/InstCombineAddSub.cpp | 88
-rw-r--r--  lib/Transforms/InstCombine/InstCombineAndOrXor.cpp | 18
-rw-r--r--  lib/Transforms/InstCombine/InstCombineCalls.cpp | 142
-rw-r--r--  lib/Transforms/InstCombine/InstCombineCasts.cpp | 148
-rw-r--r--  lib/Transforms/InstCombine/InstCombineCompares.cpp | 63
-rw-r--r--  lib/Transforms/InstCombine/InstCombineInternal.h | 24
-rw-r--r--  lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp | 203
-rw-r--r--  lib/Transforms/InstCombine/InstCombineMulDivRem.cpp | 30
-rw-r--r--  lib/Transforms/InstCombine/InstCombinePHI.cpp | 8
-rw-r--r--  lib/Transforms/InstCombine/InstCombineSelect.cpp | 98
-rw-r--r--  lib/Transforms/InstCombine/InstCombineShifts.cpp | 25
-rw-r--r--  lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp | 148
-rw-r--r--  lib/Transforms/InstCombine/InstCombineVectorOps.cpp | 7
-rw-r--r--  lib/Transforms/InstCombine/InstructionCombining.cpp | 258
-rw-r--r--  lib/Transforms/Instrumentation/AddressSanitizer.cpp | 818
-rw-r--r--  lib/Transforms/Instrumentation/BoundsChecking.cpp | 26
-rw-r--r--  lib/Transforms/Instrumentation/DataFlowSanitizer.cpp | 24
-rw-r--r--  lib/Transforms/Instrumentation/GCOVProfiling.cpp | 39
-rw-r--r--  lib/Transforms/Instrumentation/MemorySanitizer.cpp | 57
-rw-r--r--  lib/Transforms/Instrumentation/SanitizerCoverage.cpp | 157
-rw-r--r--  lib/Transforms/Instrumentation/ThreadSanitizer.cpp | 62
-rw-r--r--  lib/Transforms/ObjCARC/ARCInstKind.cpp | 96
-rw-r--r--  lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h | 48
-rw-r--r--  lib/Transforms/ObjCARC/Android.mk | 1
-rw-r--r--  lib/Transforms/ObjCARC/BlotMapVector.h | 108
-rw-r--r--  lib/Transforms/ObjCARC/CMakeLists.txt | 1
-rw-r--r--  lib/Transforms/ObjCARC/DependencyAnalysis.cpp | 16
-rw-r--r--  lib/Transforms/ObjCARC/ObjCARC.h | 55
-rw-r--r--  lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp | 11
-rw-r--r--  lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h | 4
-rw-r--r--  lib/Transforms/ObjCARC/ObjCARCContract.cpp | 11
-rw-r--r--  lib/Transforms/ObjCARC/ObjCARCOpts.cpp | 1308
-rw-r--r--  lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp | 28
-rw-r--r--  lib/Transforms/ObjCARC/ProvenanceAnalysis.h | 5
-rw-r--r--  lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp | 4
-rw-r--r--  lib/Transforms/ObjCARC/PtrState.cpp | 404
-rw-r--r--  lib/Transforms/ObjCARC/PtrState.h | 210
-rw-r--r--  lib/Transforms/Scalar/AlignmentFromAssumptions.cpp | 13
-rw-r--r--  lib/Transforms/Scalar/Android.mk | 1
-rw-r--r--  lib/Transforms/Scalar/BDCE.cpp | 31
-rw-r--r--  lib/Transforms/Scalar/CMakeLists.txt | 1
-rw-r--r--  lib/Transforms/Scalar/ConstantHoisting.cpp | 1
-rw-r--r--  lib/Transforms/Scalar/ConstantProp.cpp | 4
-rw-r--r--  lib/Transforms/Scalar/CorrelatedValuePropagation.cpp | 6
-rw-r--r--  lib/Transforms/Scalar/DeadStoreElimination.cpp | 60
-rw-r--r--  lib/Transforms/Scalar/EarlyCSE.cpp | 18
-rw-r--r--  lib/Transforms/Scalar/GVN.cpp | 134
-rw-r--r--  lib/Transforms/Scalar/IndVarSimplify.cpp | 85
-rw-r--r--  lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp | 401
-rw-r--r--  lib/Transforms/Scalar/JumpThreading.cpp | 15
-rw-r--r--  lib/Transforms/Scalar/LICM.cpp | 79
-rw-r--r--  lib/Transforms/Scalar/LoadCombine.cpp | 37
-rw-r--r--  lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 37
-rw-r--r--  lib/Transforms/Scalar/LoopInstSimplify.cpp | 3
-rw-r--r--  lib/Transforms/Scalar/LoopInterchange.cpp | 1154
-rw-r--r--  lib/Transforms/Scalar/LoopRerollPass.cpp | 30
-rw-r--r--  lib/Transforms/Scalar/LoopRotation.cpp | 8
-rw-r--r--  lib/Transforms/Scalar/LoopStrengthReduce.cpp | 11
-rw-r--r--  lib/Transforms/Scalar/LoopUnrollPass.cpp | 19
-rw-r--r--  lib/Transforms/Scalar/LoopUnswitch.cpp | 4
-rw-r--r--  lib/Transforms/Scalar/MemCpyOptimizer.cpp | 86
-rw-r--r--  lib/Transforms/Scalar/MergedLoadStoreMotion.cpp | 3
-rw-r--r--  lib/Transforms/Scalar/Reassociate.cpp | 34
-rw-r--r--  lib/Transforms/Scalar/RewriteStatepointsForGC.cpp | 314
-rw-r--r--  lib/Transforms/Scalar/SCCP.cpp | 14
-rw-r--r--  lib/Transforms/Scalar/SROA.cpp | 89
-rw-r--r--  lib/Transforms/Scalar/SampleProfile.cpp | 3
-rw-r--r--  lib/Transforms/Scalar/Scalar.cpp | 2
-rw-r--r--  lib/Transforms/Scalar/ScalarReplAggregates.cpp | 190
-rw-r--r--  lib/Transforms/Scalar/Scalarizer.cpp | 26
-rw-r--r--  lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp | 71
-rw-r--r--  lib/Transforms/Scalar/SimplifyCFGPass.cpp | 18
-rw-r--r--  lib/Transforms/Scalar/Sink.cpp | 6
-rw-r--r--  lib/Transforms/Scalar/StraightLineStrengthReduce.cpp | 395
-rw-r--r--  lib/Transforms/Scalar/StructurizeCFG.cpp | 3
-rw-r--r--  lib/Transforms/Scalar/TailRecursionElimination.cpp | 70
-rw-r--r--  lib/Transforms/Utils/BuildLibCalls.cpp | 127
-rw-r--r--  lib/Transforms/Utils/CloneFunction.cpp | 77
-rw-r--r--  lib/Transforms/Utils/CodeExtractor.cpp | 30
-rw-r--r--  lib/Transforms/Utils/CtorUtils.cpp | 3
-rw-r--r--  lib/Transforms/Utils/InlineFunction.cpp | 132
-rw-r--r--  lib/Transforms/Utils/Local.cpp | 22
-rw-r--r--  lib/Transforms/Utils/LoopSimplify.cpp | 23
-rw-r--r--  lib/Transforms/Utils/LoopUnroll.cpp | 9
-rw-r--r--  lib/Transforms/Utils/LoopUnrollRuntime.cpp | 4
-rw-r--r--  lib/Transforms/Utils/LowerSwitch.cpp | 11
-rw-r--r--  lib/Transforms/Utils/PromoteMemoryToRegister.cpp | 5
-rw-r--r--  lib/Transforms/Utils/SSAUpdater.cpp | 4
-rw-r--r--  lib/Transforms/Utils/SimplifyCFG.cpp | 353
-rw-r--r--  lib/Transforms/Utils/SimplifyIndVar.cpp | 118
-rw-r--r--  lib/Transforms/Utils/SimplifyInstructions.cpp | 3
-rw-r--r--  lib/Transforms/Utils/SimplifyLibCalls.cpp | 264
-rw-r--r--  lib/Transforms/Utils/SymbolRewriter.cpp | 13
-rw-r--r--  lib/Transforms/Utils/ValueMapper.cpp | 6
-rw-r--r--  lib/Transforms/Vectorize/BBVectorize.cpp | 35
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorize.cpp | 412
-rw-r--r--  lib/Transforms/Vectorize/SLPVectorizer.cpp | 114
105 files changed, 6352 insertions(+), 4297 deletions(-)
diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp
index 7e48ce3..46480bd 100644
--- a/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -69,16 +69,15 @@ namespace {
bool runOnSCC(CallGraphSCC &SCC) override;
static char ID; // Pass identification, replacement for typeid
explicit ArgPromotion(unsigned maxElements = 3)
- : CallGraphSCCPass(ID), DL(nullptr), maxElements(maxElements) {
+ : CallGraphSCCPass(ID), maxElements(maxElements) {
initializeArgPromotionPass(*PassRegistry::getPassRegistry());
}
/// A vector used to hold the indices of a single GEP instruction
typedef std::vector<uint64_t> IndicesVector;
- const DataLayout *DL;
private:
- bool isDenselyPacked(Type *type);
+ bool isDenselyPacked(Type *type, const DataLayout &DL);
bool canPaddingBeAccessed(Argument *Arg);
CallGraphNode *PromoteArguments(CallGraphNode *CGN);
bool isSafeToPromoteArgument(Argument *Arg, bool isByVal) const;
@@ -109,9 +108,6 @@ Pass *llvm::createArgumentPromotionPass(unsigned maxElements) {
bool ArgPromotion::runOnSCC(CallGraphSCC &SCC) {
bool Changed = false, LocalChange;
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- DL = DLP ? &DLP->getDataLayout() : nullptr;
-
do { // Iterate until we stop promoting from this SCC.
LocalChange = false;
// Attempt to promote arguments from all functions in this SCC.
@@ -128,7 +124,7 @@ bool ArgPromotion::runOnSCC(CallGraphSCC &SCC) {
}
/// \brief Checks if a type could have padding bytes.
-bool ArgPromotion::isDenselyPacked(Type *type) {
+bool ArgPromotion::isDenselyPacked(Type *type, const DataLayout &DL) {
// There is no size information, so be conservative.
if (!type->isSized())
@@ -136,7 +132,7 @@ bool ArgPromotion::isDenselyPacked(Type *type) {
// If the alloc size is not equal to the storage size, then there are padding
// bytes. For x86_fp80 on x86-64, size: 80 alloc size: 128.
- if (!DL || DL->getTypeSizeInBits(type) != DL->getTypeAllocSizeInBits(type))
+ if (DL.getTypeSizeInBits(type) != DL.getTypeAllocSizeInBits(type))
return false;
if (!isa<CompositeType>(type))
@@ -144,19 +140,20 @@ bool ArgPromotion::isDenselyPacked(Type *type) {
// For homogenous sequential types, check for padding within members.
if (SequentialType *seqTy = dyn_cast<SequentialType>(type))
- return isa<PointerType>(seqTy) || isDenselyPacked(seqTy->getElementType());
+ return isa<PointerType>(seqTy) ||
+ isDenselyPacked(seqTy->getElementType(), DL);
// Check for padding within and between elements of a struct.
StructType *StructTy = cast<StructType>(type);
- const StructLayout *Layout = DL->getStructLayout(StructTy);
+ const StructLayout *Layout = DL.getStructLayout(StructTy);
uint64_t StartPos = 0;
for (unsigned i = 0, E = StructTy->getNumElements(); i < E; ++i) {
Type *ElTy = StructTy->getElementType(i);
- if (!isDenselyPacked(ElTy))
+ if (!isDenselyPacked(ElTy, DL))
return false;
if (StartPos != Layout->getElementOffsetInBits(i))
return false;
- StartPos += DL->getTypeAllocSizeInBits(ElTy);
+ StartPos += DL.getTypeAllocSizeInBits(ElTy);
}
return true;
@@ -236,6 +233,7 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
// IR, while in the callee the classification is determined dynamically based
// on the number of registers consumed so far.
if (F->isVarArg()) return nullptr;
+ const DataLayout &DL = F->getParent()->getDataLayout();
// Check to see which arguments are promotable. If an argument is promotable,
// add it to ArgsToPromote.
@@ -250,8 +248,8 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
// packed or if we can prove the padding bytes are never accessed. This does
// not apply to inalloca.
bool isSafeToPromote =
- PtrArg->hasByValAttr() &&
- (isDenselyPacked(AgTy) || !canPaddingBeAccessed(PtrArg));
+ PtrArg->hasByValAttr() &&
+ (isDenselyPacked(AgTy, DL) || !canPaddingBeAccessed(PtrArg));
if (isSafeToPromote) {
if (StructType *STy = dyn_cast<StructType>(AgTy)) {
if (maxElements > 0 && STy->getNumElements() > maxElements) {
@@ -310,9 +308,9 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
/// AllCallersPassInValidPointerForArgument - Return true if we can prove that
/// all callees pass in a valid pointer for the specified function argument.
-static bool AllCallersPassInValidPointerForArgument(Argument *Arg,
- const DataLayout *DL) {
+static bool AllCallersPassInValidPointerForArgument(Argument *Arg) {
Function *Callee = Arg->getParent();
+ const DataLayout &DL = Callee->getParent()->getDataLayout();
unsigned ArgNo = Arg->getArgNo();
@@ -430,7 +428,7 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg,
GEPIndicesSet ToPromote;
// If the pointer is always valid, any load with first index 0 is valid.
- if (isByValOrInAlloca || AllCallersPassInValidPointerForArgument(Arg, DL))
+ if (isByValOrInAlloca || AllCallersPassInValidPointerForArgument(Arg))
SafeToUnconditionallyLoad.insert(IndicesVector(1, 0));
// First, iterate the entry block and mark loads of (geps of) arguments as
@@ -586,7 +584,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
FunctionType *FTy = F->getFunctionType();
std::vector<Type*> Params;
- typedef std::set<IndicesVector> ScalarizeTable;
+ typedef std::set<std::pair<Type *, IndicesVector>> ScalarizeTable;
// ScalarizedElements - If we are promoting a pointer that has elements
// accessed out of it, keep track of which elements are accessed so that we
@@ -623,8 +621,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
// Simple byval argument? Just add all the struct element types.
Type *AgTy = cast<PointerType>(I->getType())->getElementType();
StructType *STy = cast<StructType>(AgTy);
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
- Params.push_back(STy->getElementType(i));
+ Params.insert(Params.end(), STy->element_begin(), STy->element_end());
++NumByValArgsPromoted;
} else if (!ArgsToPromote.count(I)) {
// Unchanged argument
@@ -647,7 +644,11 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
ScalarizeTable &ArgIndices = ScalarizedElements[I];
for (User *U : I->users()) {
Instruction *UI = cast<Instruction>(U);
- assert(isa<LoadInst>(UI) || isa<GetElementPtrInst>(UI));
+ Type *SrcTy;
+ if (LoadInst *L = dyn_cast<LoadInst>(UI))
+ SrcTy = L->getType();
+ else
+ SrcTy = cast<GetElementPtrInst>(UI)->getSourceElementType();
IndicesVector Indices;
Indices.reserve(UI->getNumOperands() - 1);
// Since loads will only have a single operand, and GEPs only a single
@@ -659,7 +660,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
// GEPs with a single 0 index can be merged with direct loads
if (Indices.size() == 1 && Indices.front() == 0)
Indices.clear();
- ArgIndices.insert(Indices);
+ ArgIndices.insert(std::make_pair(SrcTy, Indices));
LoadInst *OrigLoad;
if (LoadInst *L = dyn_cast<LoadInst>(UI))
OrigLoad = L;
@@ -673,11 +674,12 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
for (ScalarizeTable::iterator SI = ArgIndices.begin(),
E = ArgIndices.end(); SI != E; ++SI) {
// not allowed to dereference ->begin() if size() is 0
- Params.push_back(GetElementPtrInst::getIndexedType(I->getType(), *SI));
+ Params.push_back(
+ GetElementPtrInst::getIndexedType(I->getType(), SI->second));
assert(Params.back());
}
- if (ArgIndices.size() == 1 && ArgIndices.begin()->empty())
+ if (ArgIndices.size() == 1 && ArgIndices.begin()->second.empty())
++NumArgumentsPromoted;
else
++NumAggregatesPromoted;
@@ -768,9 +770,8 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), nullptr };
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i);
- Value *Idx = GetElementPtrInst::Create(*AI, Idxs,
- (*AI)->getName()+"."+utostr(i),
- Call);
+ Value *Idx = GetElementPtrInst::Create(
+ STy, *AI, Idxs, (*AI)->getName() + "." + utostr(i), Call);
// TODO: Tell AA about the new values?
Args.push_back(new LoadInst(Idx, Idx->getName()+".val", Call));
}
@@ -783,12 +784,13 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
for (ScalarizeTable::iterator SI = ArgIndices.begin(),
E = ArgIndices.end(); SI != E; ++SI) {
Value *V = *AI;
- LoadInst *OrigLoad = OriginalLoads[std::make_pair(I, *SI)];
- if (!SI->empty()) {
- Ops.reserve(SI->size());
+ LoadInst *OrigLoad = OriginalLoads[std::make_pair(I, SI->second)];
+ if (!SI->second.empty()) {
+ Ops.reserve(SI->second.size());
Type *ElTy = V->getType();
- for (IndicesVector::const_iterator II = SI->begin(),
- IE = SI->end(); II != IE; ++II) {
+ for (IndicesVector::const_iterator II = SI->second.begin(),
+ IE = SI->second.end();
+ II != IE; ++II) {
// Use i32 to index structs, and i64 for others (pointers/arrays).
// This satisfies GEP constraints.
Type *IdxTy = (ElTy->isStructTy() ?
@@ -799,7 +801,8 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
ElTy = cast<CompositeType>(ElTy)->getTypeAtIndex(*II);
}
// And create a GEP to extract those indices.
- V = GetElementPtrInst::Create(V, Ops, V->getName()+".idx", Call);
+ V = GetElementPtrInst::Create(SI->first, V, Ops,
+ V->getName() + ".idx", Call);
Ops.clear();
AA.copyValue(OrigLoad->getOperand(0), V);
}
@@ -903,10 +906,9 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i);
- Value *Idx =
- GetElementPtrInst::Create(TheAlloca, Idxs,
- TheAlloca->getName()+"."+Twine(i),
- InsertPt);
+ Value *Idx = GetElementPtrInst::Create(
+ AgTy, TheAlloca, Idxs, TheAlloca->getName() + "." + Twine(i),
+ InsertPt);
I2->setName(I->getName()+"."+Twine(i));
new StoreInst(I2++, Idx, InsertPt);
}
@@ -939,7 +941,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
while (!I->use_empty()) {
if (LoadInst *LI = dyn_cast<LoadInst>(I->user_back())) {
- assert(ArgIndices.begin()->empty() &&
+ assert(ArgIndices.begin()->second.empty() &&
"Load element should sort to front!");
I2->setName(I->getName()+".val");
LI->replaceAllUsesWith(I2);
@@ -961,7 +963,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
Function::arg_iterator TheArg = I2;
for (ScalarizeTable::iterator It = ArgIndices.begin();
- *It != Operands; ++It, ++TheArg) {
+ It->second != Operands; ++It, ++TheArg) {
assert(It != ArgIndices.end() && "GEP not handled??");
}
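
The change above is the recurring migration in this rebase: DataLayoutPass is gone, every Module now owns a DataLayout, and helpers take a const DataLayout& instead of a nullable pointer. A minimal sketch of the new idiom, mirroring the hunks above (hasPadding and returnTypeHasPadding are illustrative names, not code from this patch):

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Old: DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
//      const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
// New: fetch the layout from IR already in hand; it can never be null.
static bool hasPadding(Type *Ty, const DataLayout &DL) {
  // Assumes Ty is sized. Alloc size != store size implies padding bytes,
  // e.g. x86_fp80 on x86-64: size 80 bits, alloc size 128 bits.
  return DL.getTypeSizeInBits(Ty) != DL.getTypeAllocSizeInBits(Ty);
}

static bool returnTypeHasPadding(Function &F) {
  const DataLayout &DL = F.getParent()->getDataLayout();
  return F.getReturnType()->isSized() && hasPadding(F.getReturnType(), DL);
}
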
diff --git a/lib/Transforms/IPO/ConstantMerge.cpp b/lib/Transforms/IPO/ConstantMerge.cpp
index 0b6ade9..8ce7646 100644
--- a/lib/Transforms/IPO/ConstantMerge.cpp
+++ b/lib/Transforms/IPO/ConstantMerge.cpp
@@ -52,7 +52,6 @@ namespace {
// alignment to a concrete value.
unsigned getAlignment(GlobalVariable *GV) const;
- const DataLayout *DL;
};
}
@@ -89,32 +88,22 @@ static bool IsBetterCanonical(const GlobalVariable &A,
return A.hasUnnamedAddr();
}
-bool ConstantMerge::hasKnownAlignment(GlobalVariable *GV) const {
- return DL || GV->getAlignment() != 0;
-}
-
unsigned ConstantMerge::getAlignment(GlobalVariable *GV) const {
unsigned Align = GV->getAlignment();
if (Align)
return Align;
- if (DL)
- return DL->getPreferredAlignment(GV);
- return 0;
+ return GV->getParent()->getDataLayout().getPreferredAlignment(GV);
}
bool ConstantMerge::runOnModule(Module &M) {
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- DL = DLP ? &DLP->getDataLayout() : nullptr;
// Find all the globals that are marked "used". These cannot be merged.
SmallPtrSet<const GlobalValue*, 8> UsedGlobals;
FindUsedValues(M.getGlobalVariable("llvm.used"), UsedGlobals);
FindUsedValues(M.getGlobalVariable("llvm.compiler.used"), UsedGlobals);
-
- // Map unique <constants, has-unknown-alignment> pairs to globals. We don't
- // want to merge globals of unknown alignment with those of explicit
- // alignment. If we have DataLayout, we always know the alignment.
- DenseMap<PointerIntPair<Constant*, 1, bool>, GlobalVariable*> CMap;
+
+ // Map unique constants to globals.
+ DenseMap<Constant *, GlobalVariable *> CMap;
// Replacements - This vector contains a list of replacements to perform.
SmallVector<std::pair<GlobalVariable*, GlobalVariable*>, 32> Replacements;
@@ -156,8 +145,7 @@ bool ConstantMerge::runOnModule(Module &M) {
Constant *Init = GV->getInitializer();
// Check to see if the initializer is already known.
- PointerIntPair<Constant*, 1, bool> Pair(Init, hasKnownAlignment(GV));
- GlobalVariable *&Slot = CMap[Pair];
+ GlobalVariable *&Slot = CMap[Init];
// If this is the first constant we find or if the old one is local,
// replace with the current one. If the current is externally visible
@@ -188,8 +176,7 @@ bool ConstantMerge::runOnModule(Module &M) {
Constant *Init = GV->getInitializer();
// Check to see if the initializer is already known.
- PointerIntPair<Constant*, 1, bool> Pair(Init, hasKnownAlignment(GV));
- GlobalVariable *Slot = CMap[Pair];
+ GlobalVariable *Slot = CMap[Init];
if (!Slot || Slot == GV)
continue;
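
With a DataLayout always present, the "unknown alignment" bookkeeping above (hasKnownAlignment and the PointerIntPair map key) disappears entirely. A short sketch of the resulting query, following the new getAlignment:

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// An explicit alignment wins; otherwise the Module's DataLayout supplies a
// preferred alignment, so the answer is always a concrete value.
static unsigned alignmentOf(GlobalVariable *GV) {
  if (unsigned Align = GV->getAlignment())
    return Align;
  return GV->getParent()->getDataLayout().getPreferredAlignment(GV);
}
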
diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp
index 0c844fe..ba04c80 100644
--- a/lib/Transforms/IPO/GlobalDCE.cpp
+++ b/lib/Transforms/IPO/GlobalDCE.cpp
@@ -24,6 +24,7 @@
#include "llvm/Transforms/Utils/CtorUtils.h"
#include "llvm/Transforms/Utils/GlobalStatus.h"
#include "llvm/Pass.h"
+#include <unordered_map>
using namespace llvm;
#define DEBUG_TYPE "globaldce"
@@ -47,6 +48,7 @@ namespace {
private:
SmallPtrSet<GlobalValue*, 32> AliveGlobals;
SmallPtrSet<Constant *, 8> SeenConstants;
+ std::unordered_multimap<Comdat *, GlobalValue *> ComdatMembers;
/// GlobalIsNeeded - mark the specific global value as needed, and
/// recursively mark anything that it uses as also needed.
@@ -78,6 +80,17 @@ bool GlobalDCE::runOnModule(Module &M) {
// Remove empty functions from the global ctors list.
Changed |= optimizeGlobalCtorsList(M, isEmptyFunction);
+ // Collect the set of members for each comdat.
+ for (Function &F : M)
+ if (Comdat *C = F.getComdat())
+ ComdatMembers.insert(std::make_pair(C, &F));
+ for (GlobalVariable &GV : M.globals())
+ if (Comdat *C = GV.getComdat())
+ ComdatMembers.insert(std::make_pair(C, &GV));
+ for (GlobalAlias &GA : M.aliases())
+ if (Comdat *C = GA.getComdat())
+ ComdatMembers.insert(std::make_pair(C, &GA));
+
// Loop over the module, adding globals which are obviously necessary.
for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
Changed |= RemoveUnusedGlobalValue(*I);
@@ -177,6 +190,7 @@ bool GlobalDCE::runOnModule(Module &M) {
// Make sure that all memory is released
AliveGlobals.clear();
SeenConstants.clear();
+ ComdatMembers.clear();
return Changed;
}
@@ -188,17 +202,9 @@ void GlobalDCE::GlobalIsNeeded(GlobalValue *G) {
if (!AliveGlobals.insert(G).second)
return;
- Module *M = G->getParent();
if (Comdat *C = G->getComdat()) {
- for (Function &F : *M)
- if (F.getComdat() == C)
- GlobalIsNeeded(&F);
- for (GlobalVariable &GV : M->globals())
- if (GV.getComdat() == C)
- GlobalIsNeeded(&GV);
- for (GlobalAlias &GA : M->aliases())
- if (GA.getComdat() == C)
- GlobalIsNeeded(&GA);
+ for (auto &&CM : make_range(ComdatMembers.equal_range(C)))
+ GlobalIsNeeded(CM.second);
}
if (GlobalVariable *GV = dyn_cast<GlobalVariable>(G)) {
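
The GlobalDCE change trades three full-module scans per liveness query for one up-front pass that groups members by comdat. A self-contained sketch of the lookup structure, with plain std::string standing in for Comdat* and GlobalValue*:

#include <string>
#include <unordered_map>
#include <vector>

int main() {
  // Built once per module: comdat -> every member of that comdat.
  std::unordered_multimap<std::string, std::string> ComdatMembers;
  ComdatMembers.insert({"foo", "f"});
  ComdatMembers.insert({"foo", "gv"});
  ComdatMembers.insert({"bar", "ga"});

  // Per query: walk only the members of the one comdat in question.
  std::vector<std::string> Needed;
  auto Range = ComdatMembers.equal_range("foo");
  for (auto It = Range.first; It != Range.second; ++It)
    Needed.push_back(It->second); // marks "f" and "gv" as needed
  return Needed.size() == 2 ? 0 : 1;
}
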
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index 45e04f1..20b41fb 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -22,6 +22,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
@@ -38,7 +39,6 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/CtorUtils.h"
#include "llvm/Transforms/Utils/GlobalStatus.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
@@ -86,7 +86,6 @@ namespace {
const GlobalStatus &GS);
bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn);
- const DataLayout *DL;
TargetLibraryInfo *TLI;
SmallSet<const Comdat *, 8> NotDiscardableComdats;
};
@@ -269,7 +268,7 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV,
/// quick scan over the use list to clean up the easy and obvious cruft. This
/// returns true if it made a change.
static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
- const DataLayout *DL,
+ const DataLayout &DL,
TargetLibraryInfo *TLI) {
bool Changed = false;
// Note that we need to use a weak value handle for the worklist items. When
@@ -318,8 +317,8 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
// and will invalidate our notion of what Init is.
Constant *SubInit = nullptr;
if (!isa<ConstantExpr>(GEP->getOperand(0))) {
- ConstantExpr *CE =
- dyn_cast_or_null<ConstantExpr>(ConstantFoldInstruction(GEP, DL, TLI));
+ ConstantExpr *CE = dyn_cast_or_null<ConstantExpr>(
+ ConstantFoldInstruction(GEP, DL, TLI));
if (Init && CE && CE->getOpcode() == Instruction::GetElementPtr)
SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE);
@@ -580,8 +579,9 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
Idxs.push_back(NullInt);
for (unsigned i = 3, e = GEPI->getNumOperands(); i != e; ++i)
Idxs.push_back(GEPI->getOperand(i));
- NewPtr = GetElementPtrInst::Create(NewPtr, Idxs,
- GEPI->getName()+"."+Twine(Val),GEPI);
+ NewPtr = GetElementPtrInst::Create(
+ NewPtr->getType()->getPointerElementType(), NewPtr, Idxs,
+ GEPI->getName() + "." + Twine(Val), GEPI);
}
}
GEP->replaceAllUsesWith(NewPtr);
@@ -739,7 +739,7 @@ static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) {
/// if the loaded value is dynamically null, then we know that they cannot be
/// reachable with a null optimize away the load.
static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV,
- const DataLayout *DL,
+ const DataLayout &DL,
TargetLibraryInfo *TLI) {
bool Changed = false;
@@ -802,7 +802,7 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV,
/// ConstantPropUsersOf - Walk the use list of V, constant folding all of the
/// instructions that are foldable.
-static void ConstantPropUsersOf(Value *V, const DataLayout *DL,
+static void ConstantPropUsersOf(Value *V, const DataLayout &DL,
TargetLibraryInfo *TLI) {
for (Value::user_iterator UI = V->user_begin(), E = V->user_end(); UI != E; )
if (Instruction *I = dyn_cast<Instruction>(*UI++))
@@ -822,12 +822,10 @@ static void ConstantPropUsersOf(Value *V, const DataLayout *DL,
/// the specified malloc. Because it is always the result of the specified
/// malloc, there is no reason to actually DO the malloc. Instead, turn the
/// malloc into a global, and any loads of GV as uses of the new global.
-static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV,
- CallInst *CI,
- Type *AllocTy,
- ConstantInt *NElements,
- const DataLayout *DL,
- TargetLibraryInfo *TLI) {
+static GlobalVariable *
+OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,
+ ConstantInt *NElements, const DataLayout &DL,
+ TargetLibraryInfo *TLI) {
DEBUG(errs() << "PROMOTING GLOBAL: " << *GV << " CALL = " << *CI << '\n');
Type *GlobalType;
@@ -1167,7 +1165,8 @@ static Value *GetHeapSROAValue(Value *V, unsigned FieldNo,
InsertedScalarizedValues,
PHIsToRewrite),
LI->getName()+".f"+Twine(FieldNo), LI);
- } else if (PHINode *PN = dyn_cast<PHINode>(V)) {
+ } else {
+ PHINode *PN = cast<PHINode>(V);
// PN's type is pointer to struct. Make a new PHI of pointer to struct
// field.
@@ -1181,8 +1180,6 @@ static Value *GetHeapSROAValue(Value *V, unsigned FieldNo,
PN->getName()+".f"+Twine(FieldNo), PN);
Result = NewPN;
PHIsToRewrite.push_back(std::make_pair(PN, FieldNo));
- } else {
- llvm_unreachable("Unknown usable value");
}
return FieldVals[FieldNo] = Result;
@@ -1224,7 +1221,7 @@ static void RewriteHeapSROALoadUser(Instruction *LoadUser,
GEPIdx.push_back(GEPI->getOperand(1));
GEPIdx.append(GEPI->op_begin()+3, GEPI->op_end());
- Value *NGEPI = GetElementPtrInst::Create(NewPtr, GEPIdx,
+ Value *NGEPI = GetElementPtrInst::Create(GEPI->getResultElementType(), NewPtr, GEPIdx,
GEPI->getName(), GEPI);
GEPI->replaceAllUsesWith(NGEPI);
GEPI->eraseFromParent();
@@ -1271,7 +1268,7 @@ static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load,
/// PerformHeapAllocSRoA - CI is an allocation of an array of structures. Break
/// it up into multiple allocations of arrays of the fields.
static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
- Value *NElems, const DataLayout *DL,
+ Value *NElems, const DataLayout &DL,
const TargetLibraryInfo *TLI) {
DEBUG(dbgs() << "SROA HEAP ALLOC: " << *GV << " MALLOC = " << *CI << '\n');
Type *MAT = getMallocAllocatedType(CI, TLI);
@@ -1301,10 +1298,10 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
GV->getThreadLocalMode());
FieldGlobals.push_back(NGV);
- unsigned TypeSize = DL->getTypeAllocSize(FieldTy);
+ unsigned TypeSize = DL.getTypeAllocSize(FieldTy);
if (StructType *ST = dyn_cast<StructType>(FieldTy))
- TypeSize = DL->getStructLayout(ST)->getSizeInBytes();
- Type *IntPtrTy = DL->getIntPtrType(CI->getType());
+ TypeSize = DL.getStructLayout(ST)->getSizeInBytes();
+ Type *IntPtrTy = DL.getIntPtrType(CI->getType());
Value *NMI = CallInst::CreateMalloc(CI, IntPtrTy, FieldTy,
ConstantInt::get(IntPtrTy, TypeSize),
NElems, nullptr,
@@ -1459,16 +1456,12 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
/// TryToOptimizeStoreOfMallocToGlobal - This function is called when we see a
/// pointer global variable with a single value stored it that is a malloc or
/// cast of malloc.
-static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
- CallInst *CI,
+static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI,
Type *AllocTy,
AtomicOrdering Ordering,
Module::global_iterator &GVI,
- const DataLayout *DL,
+ const DataLayout &DL,
TargetLibraryInfo *TLI) {
- if (!DL)
- return false;
-
// If this is a malloc of an abstract type, don't touch it.
if (!AllocTy->isSized())
return false;
@@ -1504,7 +1497,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
// Restrict this transformation to only working on small allocations
// (2048 bytes currently), as we don't want to introduce a 16M global or
// something.
- if (NElements->getZExtValue() * DL->getTypeAllocSize(AllocTy) < 2048) {
+ if (NElements->getZExtValue() * DL.getTypeAllocSize(AllocTy) < 2048) {
GVI = OptimizeGlobalAddressOfMalloc(GV, CI, AllocTy, NElements, DL, TLI);
return true;
}
@@ -1534,8 +1527,8 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
// If this is a fixed size array, transform the Malloc to be an alloc of
// structs. malloc [100 x struct],1 -> malloc struct, 100
if (ArrayType *AT = dyn_cast<ArrayType>(getMallocAllocatedType(CI, TLI))) {
- Type *IntPtrTy = DL->getIntPtrType(CI->getType());
- unsigned TypeSize = DL->getStructLayout(AllocSTy)->getSizeInBytes();
+ Type *IntPtrTy = DL.getIntPtrType(CI->getType());
+ unsigned TypeSize = DL.getStructLayout(AllocSTy)->getSizeInBytes();
Value *AllocSize = ConstantInt::get(IntPtrTy, TypeSize);
Value *NumElements = ConstantInt::get(IntPtrTy, AT->getNumElements());
Instruction *Malloc = CallInst::CreateMalloc(CI, IntPtrTy, AllocSTy,
@@ -1563,7 +1556,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
AtomicOrdering Ordering,
Module::global_iterator &GVI,
- const DataLayout *DL,
+ const DataLayout &DL,
TargetLibraryInfo *TLI) {
// Ignore no-op GEPs and bitcasts.
StoredOnceVal = StoredOnceVal->stripPointerCasts();
@@ -1733,6 +1726,7 @@ bool GlobalOpt::ProcessGlobal(GlobalVariable *GV,
bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
Module::global_iterator &GVI,
const GlobalStatus &GS) {
+ auto &DL = GV->getParent()->getDataLayout();
// If this is a first class global and has only one accessing function
// and this function is main (which we know is not recursive), we replace
// the global with a local alloca in this function.
@@ -1804,12 +1798,10 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
++NumMarked;
return true;
} else if (!GV->getInitializer()->getType()->isSingleValueType()) {
- if (DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>()) {
- const DataLayout &DL = DLP->getDataLayout();
- if (GlobalVariable *FirstNewGV = SRAGlobal(GV, DL)) {
- GVI = FirstNewGV; // Don't skip the newly produced globals!
- return true;
- }
+ const DataLayout &DL = GV->getParent()->getDataLayout();
+ if (GlobalVariable *FirstNewGV = SRAGlobal(GV, DL)) {
+ GVI = FirstNewGV; // Don't skip the newly produced globals!
+ return true;
}
} else if (GS.StoredType == GlobalStatus::StoredOnce) {
// If the initial value for the global was an undef value, and if only
@@ -1954,6 +1946,7 @@ bool GlobalOpt::OptimizeGlobalVars(Module &M) {
// Simplify the initializer.
if (GV->hasInitializer())
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(GV->getInitializer())) {
+ auto &DL = M.getDataLayout();
Constant *New = ConstantFoldConstantExpression(CE, DL, TLI);
if (New && New != CE)
GV->setInitializer(New);
@@ -1971,9 +1964,8 @@ bool GlobalOpt::OptimizeGlobalVars(Module &M) {
static inline bool
isSimpleEnoughValueToCommit(Constant *C,
- SmallPtrSetImpl<Constant*> &SimpleConstants,
- const DataLayout *DL);
-
+ SmallPtrSetImpl<Constant *> &SimpleConstants,
+ const DataLayout &DL);
/// isSimpleEnoughValueToCommit - Return true if the specified constant can be
/// handled by the code generator. We don't want to generate something like:
@@ -1983,9 +1975,10 @@ isSimpleEnoughValueToCommit(Constant *C,
/// This function should be called if C was not found (but just got inserted)
/// in SimpleConstants to avoid having to rescan the same constants all the
/// time.
-static bool isSimpleEnoughValueToCommitHelper(Constant *C,
- SmallPtrSetImpl<Constant*> &SimpleConstants,
- const DataLayout *DL) {
+static bool
+isSimpleEnoughValueToCommitHelper(Constant *C,
+ SmallPtrSetImpl<Constant *> &SimpleConstants,
+ const DataLayout &DL) {
// Simple global addresses are supported, do not allow dllimport or
// thread-local globals.
if (auto *GV = dyn_cast<GlobalValue>(C))
@@ -2019,8 +2012,8 @@ static bool isSimpleEnoughValueToCommitHelper(Constant *C,
case Instruction::PtrToInt:
// int <=> ptr is fine if the int type is the same size as the
// pointer type.
- if (!DL || DL->getTypeSizeInBits(CE->getType()) !=
- DL->getTypeSizeInBits(CE->getOperand(0)->getType()))
+ if (DL.getTypeSizeInBits(CE->getType()) !=
+ DL.getTypeSizeInBits(CE->getOperand(0)->getType()))
return false;
return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL);
@@ -2042,8 +2035,8 @@ static bool isSimpleEnoughValueToCommitHelper(Constant *C,
static inline bool
isSimpleEnoughValueToCommit(Constant *C,
- SmallPtrSetImpl<Constant*> &SimpleConstants,
- const DataLayout *DL) {
+ SmallPtrSetImpl<Constant *> &SimpleConstants,
+ const DataLayout &DL) {
// If we already checked this constant, we win.
if (!SimpleConstants.insert(C).second)
return true;
@@ -2174,8 +2167,8 @@ namespace {
/// Once an evaluation call fails, the evaluation object should not be reused.
class Evaluator {
public:
- Evaluator(const DataLayout *DL, const TargetLibraryInfo *TLI)
- : DL(DL), TLI(TLI) {
+ Evaluator(const DataLayout &DL, const TargetLibraryInfo *TLI)
+ : DL(DL), TLI(TLI) {
ValueStack.emplace_back();
}
@@ -2249,7 +2242,7 @@ private:
/// simple enough to live in a static initializer of a global.
SmallPtrSet<Constant*, 8> SimpleConstants;
- const DataLayout *DL;
+ const DataLayout &DL;
const TargetLibraryInfo *TLI;
};
@@ -2498,9 +2491,9 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
Value *Ptr = PtrArg->stripPointerCasts();
if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) {
Type *ElemTy = cast<PointerType>(GV->getType())->getElementType();
- if (DL && !Size->isAllOnesValue() &&
+ if (!Size->isAllOnesValue() &&
Size->getValue().getLimitedValue() >=
- DL->getTypeStoreSize(ElemTy)) {
+ DL.getTypeStoreSize(ElemTy)) {
Invariants.insert(GV);
DEBUG(dbgs() << "Found a global var that is an invariant: " << *GV
<< "\n");
@@ -2689,7 +2682,7 @@ bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal,
/// EvaluateStaticConstructor - Evaluate static constructors in the function, if
/// we can. Return true if we can, false otherwise.
-static bool EvaluateStaticConstructor(Function *F, const DataLayout *DL,
+static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL,
const TargetLibraryInfo *TLI) {
// Call the function.
Evaluator Eval(DL, TLI);
@@ -3040,8 +3033,7 @@ bool GlobalOpt::OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) {
bool GlobalOpt::runOnModule(Module &M) {
bool Changed = false;
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- DL = DLP ? &DLP->getDataLayout() : nullptr;
+ auto &DL = M.getDataLayout();
TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
bool LocalChange = true;
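
One detail worth noting in isSimpleEnoughValueToCommitHelper above: with no null DataLayout to bail out on, the inttoptr/ptrtoint case reduces to a pure width comparison. A sketch under that assumption (isSameWidthCast is an illustrative name, not code from this patch):

#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
using namespace llvm;

// An int<->ptr constant cast is committable only when the integer type is
// exactly pointer-sized, so the round trip cannot drop bits.
static bool isSameWidthCast(ConstantExpr *CE, const DataLayout &DL) {
  return DL.getTypeSizeInBits(CE->getType()) ==
         DL.getTypeSizeInBits(CE->getOperand(0)->getType());
}
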
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index 305ad7a..3aa4ee5 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -20,6 +20,7 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DiagnosticInfo.h"
@@ -29,7 +30,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -72,8 +72,8 @@ Inliner::Inliner(char &ID, int Threshold, bool InsertLifetime)
InlineLimit : Threshold),
InsertLifetime(InsertLifetime) {}
-/// getAnalysisUsage - For this class, we declare that we require and preserve
-/// the call graph. If the derived class implements this method, it should
+/// For this class, we declare that we require and preserve the call graph.
+/// If the derived class implements this method, it should
/// always explicitly call the implementation here.
void Inliner::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<AliasAnalysis>();
@@ -111,18 +111,17 @@ static void AdjustCallerSSPLevel(Function *Caller, Function *Callee) {
Caller->addFnAttr(Attribute::StackProtect);
}
-/// InlineCallIfPossible - If it is possible to inline the specified call site,
+/// If it is possible to inline the specified call site,
/// do so and update the CallGraph for this operation.
///
/// This function also does some basic book-keeping to update the IR. The
/// InlinedArrayAllocas map keeps track of any allocas that are already
-/// available from other functions inlined into the caller. If we are able to
+/// available from other functions inlined into the caller. If we are able to
/// inline this call site we attempt to reuse already available allocas or add
/// any new allocas to the set if not possible.
static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI,
InlinedArrayAllocasTy &InlinedArrayAllocas,
- int InlineHistory, bool InsertLifetime,
- const DataLayout *DL) {
+ int InlineHistory, bool InsertLifetime) {
Function *Callee = CS.getCalledFunction();
Function *Caller = CS.getCaller();
@@ -198,11 +197,6 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI,
unsigned Align1 = AI->getAlignment(),
Align2 = AvailableAlloca->getAlignment();
- // If we don't have data layout information, and only one alloca is using
- // the target default, then we can't safely merge them because we can't
- // pick the greater alignment.
- if (!DL && (!Align1 || !Align2) && Align1 != Align2)
- continue;
// The available alloca has to be in the right function, not in some other
// function in this SCC.
@@ -223,8 +217,8 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI,
if (Align1 != Align2) {
if (!Align1 || !Align2) {
- assert(DL && "DataLayout required to compare default alignments");
- unsigned TypeAlign = DL->getABITypeAlignment(AI->getAllocatedType());
+ const DataLayout &DL = Caller->getParent()->getDataLayout();
+ unsigned TypeAlign = DL.getABITypeAlignment(AI->getAllocatedType());
Align1 = Align1 ? Align1 : TypeAlign;
Align2 = Align2 ? Align2 : TypeAlign;
@@ -300,8 +294,7 @@ static void emitAnalysis(CallSite CS, const Twine &Msg) {
emitOptimizationRemarkAnalysis(Ctx, DEBUG_TYPE, *Caller, DLoc, Msg);
}
-/// shouldInline - Return true if the inliner should attempt to inline
-/// at the given CallSite.
+/// Return true if the inliner should attempt to inline at the given CallSite.
bool Inliner::shouldInline(CallSite CS) {
InlineCost IC = getInlineCost(CS);
@@ -415,7 +408,7 @@ bool Inliner::shouldInline(CallSite CS) {
return true;
}
-/// InlineHistoryIncludes - Return true if the specified inline history ID
+/// Return true if the specified inline history ID
/// indicates an inline history that includes the specified function.
static bool InlineHistoryIncludes(Function *F, int InlineHistoryID,
const SmallVectorImpl<std::pair<Function*, int> > &InlineHistory) {
@@ -432,8 +425,6 @@ static bool InlineHistoryIncludes(Function *F, int InlineHistoryID,
bool Inliner::runOnSCC(CallGraphSCC &SCC) {
CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
const TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr;
AliasAnalysis *AA = &getAnalysis<AliasAnalysis>();
@@ -495,7 +486,7 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {
InlinedArrayAllocasTy InlinedArrayAllocas;
- InlineFunctionInfo InlineInfo(&CG, DL, AA, ACT);
+ InlineFunctionInfo InlineInfo(&CG, AA, ACT);
// Now that we have all of the call sites, loop over them and inline them if
// it looks profitable to do so.
@@ -553,7 +544,7 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {
// Attempt to inline the function.
if (!InlineCallIfPossible(CS, InlineInfo, InlinedArrayAllocas,
- InlineHistoryID, InsertLifetime, DL)) {
+ InlineHistoryID, InsertLifetime)) {
emitOptimizationRemarkMissed(CallerCtx, DEBUG_TYPE, *Caller, DLoc,
Twine(Callee->getName() +
" will not be inlined into " +
@@ -625,14 +616,13 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {
return Changed;
}
-// doFinalization - Remove now-dead linkonce functions at the end of
-// processing to avoid breaking the SCC traversal.
+/// Remove now-dead linkonce functions at the end of
+/// processing to avoid breaking the SCC traversal.
bool Inliner::doFinalization(CallGraph &CG) {
return removeDeadFunctions(CG);
}
-/// removeDeadFunctions - Remove dead functions that are not included in
-/// DNR (Do Not Remove) list.
+/// Remove dead functions that are not included in DNR (Do Not Remove) list.
bool Inliner::removeDeadFunctions(CallGraph &CG, bool AlwaysInlineOnly) {
SmallVector<CallGraphNode*, 16> FunctionsToRemove;
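
The alloca-merging hunk above shows why the Inliner no longer needs an optional DataLayout: a zero ("default") alignment can now always be resolved to the ABI type alignment before the two allocas are compared. A sketch of that resolution (mergedAlignment is an illustrative helper, not code from this patch):

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include <algorithm>
using namespace llvm;

// Resolve 0 (meaning "ABI default") to a concrete alignment, then keep the
// greater of the two so the merged alloca satisfies both users.
static unsigned mergedAlignment(AllocaInst *AI, AllocaInst *Available,
                                const DataLayout &DL) {
  unsigned TypeAlign = DL.getABITypeAlignment(AI->getAllocatedType());
  unsigned Align1 = AI->getAlignment() ? AI->getAlignment() : TypeAlign;
  unsigned Align2 =
      Available->getAlignment() ? Available->getAlignment() : TypeAlign;
  return std::max(Align1, Align2);
}
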
diff --git a/lib/Transforms/IPO/LowerBitSets.cpp b/lib/Transforms/IPO/LowerBitSets.cpp
index 0a22a80..fe00d92 100644
--- a/lib/Transforms/IPO/LowerBitSets.cpp
+++ b/lib/Transforms/IPO/LowerBitSets.cpp
@@ -16,6 +16,7 @@
#include "llvm/Transforms/IPO.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalVariable.h"
@@ -31,10 +32,17 @@ using namespace llvm;
#define DEBUG_TYPE "lowerbitsets"
-STATISTIC(NumBitSetsCreated, "Number of bitsets created");
+STATISTIC(ByteArraySizeBits, "Byte array size in bits");
+STATISTIC(ByteArraySizeBytes, "Byte array size in bytes");
+STATISTIC(NumByteArraysCreated, "Number of byte arrays created");
STATISTIC(NumBitSetCallsLowered, "Number of bitset calls lowered");
STATISTIC(NumBitSetDisjointSets, "Number of disjoint sets of bitsets");
+static cl::opt<bool> AvoidReuse(
+ "lowerbitsets-avoid-reuse",
+ cl::desc("Try to avoid reuse of byte array addresses using aliases"),
+ cl::Hidden, cl::init(true));
+
bool BitSetInfo::containsGlobalOffset(uint64_t Offset) const {
if (Offset < ByteOffset)
return false;
@@ -46,11 +54,11 @@ bool BitSetInfo::containsGlobalOffset(uint64_t Offset) const {
if (BitOffset >= BitSize)
return false;
- return (Bits[BitOffset / 8] >> (BitOffset % 8)) & 1;
+ return Bits.count(BitOffset);
}
bool BitSetInfo::containsValue(
- const DataLayout *DL,
+ const DataLayout &DL,
const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout, Value *V,
uint64_t COffset) const {
if (auto GV = dyn_cast<GlobalVariable>(V)) {
@@ -61,8 +69,8 @@ bool BitSetInfo::containsValue(
}
if (auto GEP = dyn_cast<GEPOperator>(V)) {
- APInt APOffset(DL->getPointerSizeInBits(0), 0);
- bool Result = GEP->accumulateConstantOffset(*DL, APOffset);
+ APInt APOffset(DL.getPointerSizeInBits(0), 0);
+ bool Result = GEP->accumulateConstantOffset(DL, APOffset);
if (!Result)
return false;
COffset += APOffset.getZExtValue();
@@ -101,18 +109,15 @@ BitSetInfo BitSetBuilder::build() {
BSI.ByteOffset = Min;
BSI.AlignLog2 = 0;
- // FIXME: Can probably do something smarter if all offsets are 0.
if (Mask != 0)
BSI.AlignLog2 = countTrailingZeros(Mask, ZB_Undefined);
// Build the compressed bitset while normalizing the offsets against the
// computed alignment.
BSI.BitSize = ((Max - Min) >> BSI.AlignLog2) + 1;
- uint64_t ByteSize = (BSI.BitSize + 7) / 8;
- BSI.Bits.resize(ByteSize);
for (uint64_t Offset : Offsets) {
Offset >>= BSI.AlignLog2;
- BSI.Bits[Offset / 8] |= 1 << (Offset % 8);
+ BSI.Bits.insert(Offset);
}
return BSI;
@@ -147,15 +152,47 @@ void GlobalLayoutBuilder::addFragment(const std::set<uint64_t> &F) {
FragmentMap[ObjIndex] = FragmentIndex;
}
+void ByteArrayBuilder::allocate(const std::set<uint64_t> &Bits,
+ uint64_t BitSize, uint64_t &AllocByteOffset,
+ uint8_t &AllocMask) {
+ // Find the smallest current allocation.
+ unsigned Bit = 0;
+ for (unsigned I = 1; I != BitsPerByte; ++I)
+ if (BitAllocs[I] < BitAllocs[Bit])
+ Bit = I;
+
+ AllocByteOffset = BitAllocs[Bit];
+
+ // Add our size to it.
+ unsigned ReqSize = AllocByteOffset + BitSize;
+ BitAllocs[Bit] = ReqSize;
+ if (Bytes.size() < ReqSize)
+ Bytes.resize(ReqSize);
+
+ // Set our bits.
+ AllocMask = 1 << Bit;
+ for (uint64_t B : Bits)
+ Bytes[AllocByteOffset + B] |= AllocMask;
+}
+
namespace {
+struct ByteArrayInfo {
+ std::set<uint64_t> Bits;
+ uint64_t BitSize;
+ GlobalVariable *ByteArray;
+ Constant *Mask;
+};
+
struct LowerBitSets : public ModulePass {
static char ID;
LowerBitSets() : ModulePass(ID) {
initializeLowerBitSetsPass(*PassRegistry::getPassRegistry());
}
- const DataLayout *DL;
+ Module *M;
+
+ bool LinkerSubsectionsViaSymbols;
IntegerType *Int1Ty;
IntegerType *Int8Ty;
IntegerType *Int32Ty;
@@ -169,20 +206,23 @@ struct LowerBitSets : public ModulePass {
// Mapping from bitset mdstrings to the call sites that test them.
DenseMap<MDString *, std::vector<CallInst *>> BitSetTestCallSites;
+ std::vector<ByteArrayInfo> ByteArrayInfos;
+
BitSetInfo
buildBitSet(MDString *BitSet,
const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout);
- Value *createBitSetTest(IRBuilder<> &B, const BitSetInfo &BSI,
- GlobalVariable *BitSetGlobal, Value *BitOffset);
+ ByteArrayInfo *createByteArray(BitSetInfo &BSI);
+ void allocateByteArrays();
+ Value *createBitSetTest(IRBuilder<> &B, BitSetInfo &BSI, ByteArrayInfo *&BAI,
+ Value *BitOffset);
Value *
- lowerBitSetCall(CallInst *CI, const BitSetInfo &BSI,
- GlobalVariable *BitSetGlobal, GlobalVariable *CombinedGlobal,
+ lowerBitSetCall(CallInst *CI, BitSetInfo &BSI, ByteArrayInfo *&BAI,
+ GlobalVariable *CombinedGlobal,
const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout);
- void buildBitSetsFromGlobals(Module &M,
- const std::vector<MDString *> &BitSets,
+ void buildBitSetsFromGlobals(const std::vector<MDString *> &BitSets,
const std::vector<GlobalVariable *> &Globals);
- bool buildBitSets(Module &M);
- bool eraseBitSetMetadata(Module &M);
+ bool buildBitSets();
+ bool eraseBitSetMetadata();
bool doInitialization(Module &M) override;
bool runOnModule(Module &M) override;
@@ -198,19 +238,21 @@ char LowerBitSets::ID = 0;
ModulePass *llvm::createLowerBitSetsPass() { return new LowerBitSets; }
-bool LowerBitSets::doInitialization(Module &M) {
- DL = M.getDataLayout();
- if (!DL)
- report_fatal_error("Data layout required");
+bool LowerBitSets::doInitialization(Module &Mod) {
+ M = &Mod;
+ const DataLayout &DL = Mod.getDataLayout();
- Int1Ty = Type::getInt1Ty(M.getContext());
- Int8Ty = Type::getInt8Ty(M.getContext());
- Int32Ty = Type::getInt32Ty(M.getContext());
+ Triple TargetTriple(M->getTargetTriple());
+ LinkerSubsectionsViaSymbols = TargetTriple.isMacOSX();
+
+ Int1Ty = Type::getInt1Ty(M->getContext());
+ Int8Ty = Type::getInt8Ty(M->getContext());
+ Int32Ty = Type::getInt32Ty(M->getContext());
Int32PtrTy = PointerType::getUnqual(Int32Ty);
- Int64Ty = Type::getInt64Ty(M.getContext());
- IntPtrTy = DL->getIntPtrType(M.getContext(), 0);
+ Int64Ty = Type::getInt64Ty(M->getContext());
+ IntPtrTy = DL.getIntPtrType(M->getContext(), 0);
- BitSetNM = M.getNamedMetadata("llvm.bitsets");
+ BitSetNM = M->getNamedMetadata("llvm.bitsets");
BitSetTestCallSites.clear();
@@ -259,52 +301,128 @@ static Value *createMaskedBitTest(IRBuilder<> &B, Value *Bits,
return B.CreateICmpNE(MaskedBits, ConstantInt::get(BitsType, 0));
}
+ByteArrayInfo *LowerBitSets::createByteArray(BitSetInfo &BSI) {
+ // Create globals to stand in for byte arrays and masks. These never actually
+ // get initialized, we RAUW and erase them later in allocateByteArrays() once
+ // we know the offset and mask to use.
+ auto ByteArrayGlobal = new GlobalVariable(
+ *M, Int8Ty, /*isConstant=*/true, GlobalValue::PrivateLinkage, nullptr);
+ auto MaskGlobal = new GlobalVariable(
+ *M, Int8Ty, /*isConstant=*/true, GlobalValue::PrivateLinkage, nullptr);
+
+ ByteArrayInfos.emplace_back();
+ ByteArrayInfo *BAI = &ByteArrayInfos.back();
+
+ BAI->Bits = BSI.Bits;
+ BAI->BitSize = BSI.BitSize;
+ BAI->ByteArray = ByteArrayGlobal;
+ BAI->Mask = ConstantExpr::getPtrToInt(MaskGlobal, Int8Ty);
+ return BAI;
+}
+
+void LowerBitSets::allocateByteArrays() {
+ std::stable_sort(ByteArrayInfos.begin(), ByteArrayInfos.end(),
+ [](const ByteArrayInfo &BAI1, const ByteArrayInfo &BAI2) {
+ return BAI1.BitSize > BAI2.BitSize;
+ });
+
+ std::vector<uint64_t> ByteArrayOffsets(ByteArrayInfos.size());
+
+ ByteArrayBuilder BAB;
+ for (unsigned I = 0; I != ByteArrayInfos.size(); ++I) {
+ ByteArrayInfo *BAI = &ByteArrayInfos[I];
+
+ uint8_t Mask;
+ BAB.allocate(BAI->Bits, BAI->BitSize, ByteArrayOffsets[I], Mask);
+
+ BAI->Mask->replaceAllUsesWith(ConstantInt::get(Int8Ty, Mask));
+ cast<GlobalVariable>(BAI->Mask->getOperand(0))->eraseFromParent();
+ }
+
+ Constant *ByteArrayConst = ConstantDataArray::get(M->getContext(), BAB.Bytes);
+ auto ByteArray =
+ new GlobalVariable(*M, ByteArrayConst->getType(), /*isConstant=*/true,
+ GlobalValue::PrivateLinkage, ByteArrayConst);
+
+ for (unsigned I = 0; I != ByteArrayInfos.size(); ++I) {
+ ByteArrayInfo *BAI = &ByteArrayInfos[I];
+
+ Constant *Idxs[] = {ConstantInt::get(IntPtrTy, 0),
+ ConstantInt::get(IntPtrTy, ByteArrayOffsets[I])};
+ Constant *GEP = ConstantExpr::getInBoundsGetElementPtr(ByteArray, Idxs);
+
+ // Create an alias instead of RAUW'ing the gep directly. On x86 this ensures
+ // that the pc-relative displacement is folded into the lea instead of the
+ // test instruction getting another displacement.
+ if (LinkerSubsectionsViaSymbols) {
+ BAI->ByteArray->replaceAllUsesWith(GEP);
+ } else {
+ GlobalAlias *Alias = GlobalAlias::create(
+ Int8Ty, 0, GlobalValue::PrivateLinkage, "bits", GEP, M);
+ BAI->ByteArray->replaceAllUsesWith(Alias);
+ }
+ BAI->ByteArray->eraseFromParent();
+ }
+
+ ByteArraySizeBits = BAB.BitAllocs[0] + BAB.BitAllocs[1] + BAB.BitAllocs[2] +
+ BAB.BitAllocs[3] + BAB.BitAllocs[4] + BAB.BitAllocs[5] +
+ BAB.BitAllocs[6] + BAB.BitAllocs[7];
+ ByteArraySizeBytes = BAB.Bytes.size();
+}
+
/// Build a test that bit BitOffset is set in BSI, where
/// BitSetGlobal is a global containing the bits in BSI.
-Value *LowerBitSets::createBitSetTest(IRBuilder<> &B, const BitSetInfo &BSI,
- GlobalVariable *BitSetGlobal,
- Value *BitOffset) {
- if (BSI.Bits.size() <= 8) {
+Value *LowerBitSets::createBitSetTest(IRBuilder<> &B, BitSetInfo &BSI,
+ ByteArrayInfo *&BAI, Value *BitOffset) {
+ if (BSI.BitSize <= 64) {
// If the bit set is sufficiently small, we can avoid a load by bit testing
// a constant.
IntegerType *BitsTy;
- if (BSI.Bits.size() <= 4)
+ if (BSI.BitSize <= 32)
BitsTy = Int32Ty;
else
BitsTy = Int64Ty;
uint64_t Bits = 0;
- for (auto I = BSI.Bits.rbegin(), E = BSI.Bits.rend(); I != E; ++I) {
- Bits <<= 8;
- Bits |= *I;
- }
+ for (auto Bit : BSI.Bits)
+ Bits |= uint64_t(1) << Bit;
Constant *BitsConst = ConstantInt::get(BitsTy, Bits);
return createMaskedBitTest(B, BitsConst, BitOffset);
} else {
- // TODO: We might want to use the memory variant of the bt instruction
- // with the previously computed bit offset at -Os. This instruction does
- // exactly what we want but has been benchmarked as being slower than open
- // coding the load+bt.
- Value *BitSetGlobalOffset =
- B.CreateLShr(BitOffset, ConstantInt::get(IntPtrTy, 5));
- Value *BitSetEntryAddr = B.CreateGEP(
- ConstantExpr::getBitCast(BitSetGlobal, Int32PtrTy), BitSetGlobalOffset);
- Value *BitSetEntry = B.CreateLoad(BitSetEntryAddr);
-
- return createMaskedBitTest(B, BitSetEntry, BitOffset);
+ if (!BAI) {
+ ++NumByteArraysCreated;
+ BAI = createByteArray(BSI);
+ }
+
+ Constant *ByteArray = BAI->ByteArray;
+ if (!LinkerSubsectionsViaSymbols && AvoidReuse) {
+ // Each use of the byte array uses a different alias. This makes the
+ // backend less likely to reuse previously computed byte array addresses,
+ // improving the security of the CFI mechanism based on this pass.
+ ByteArray = GlobalAlias::create(
+ BAI->ByteArray->getType()->getElementType(), 0,
+ GlobalValue::PrivateLinkage, "bits_use", ByteArray, M);
+ }
+
+ Value *ByteAddr = B.CreateGEP(ByteArray, BitOffset);
+ Value *Byte = B.CreateLoad(ByteAddr);
+
+ Value *ByteAndMask = B.CreateAnd(Byte, BAI->Mask);
+ return B.CreateICmpNE(ByteAndMask, ConstantInt::get(Int8Ty, 0));
}
}
/// Lower a llvm.bitset.test call to its implementation. Returns the value to
/// replace the call with.
Value *LowerBitSets::lowerBitSetCall(
- CallInst *CI, const BitSetInfo &BSI, GlobalVariable *BitSetGlobal,
+ CallInst *CI, BitSetInfo &BSI, ByteArrayInfo *&BAI,
GlobalVariable *CombinedGlobal,
const DenseMap<GlobalVariable *, uint64_t> &GlobalLayout) {
Value *Ptr = CI->getArgOperand(0);
+ const DataLayout &DL = M->getDataLayout();
if (BSI.containsValue(DL, GlobalLayout, Ptr))
- return ConstantInt::getTrue(BitSetGlobal->getParent()->getContext());
+ return ConstantInt::getTrue(CombinedGlobal->getParent()->getContext());
Constant *GlobalAsInt = ConstantExpr::getPtrToInt(CombinedGlobal, IntPtrTy);
Constant *OffsetedGlobalAsInt = ConstantExpr::getAdd(
@@ -336,8 +454,8 @@ Value *LowerBitSets::lowerBitSetCall(
Value *OffsetSHR =
B.CreateLShr(PtrOffset, ConstantInt::get(IntPtrTy, BSI.AlignLog2));
Value *OffsetSHL = B.CreateShl(
- PtrOffset, ConstantInt::get(IntPtrTy, DL->getPointerSizeInBits(0) -
- BSI.AlignLog2));
+ PtrOffset,
+ ConstantInt::get(IntPtrTy, DL.getPointerSizeInBits(0) - BSI.AlignLog2));
BitOffset = B.CreateOr(OffsetSHR, OffsetSHL);
}
@@ -353,7 +471,7 @@ Value *LowerBitSets::lowerBitSetCall(
// Now that we know that the offset is in range and aligned, load the
// appropriate bit from the bitset.
- Value *Bit = createBitSetTest(ThenB, BSI, BitSetGlobal, BitOffset);
+ Value *Bit = createBitSetTest(ThenB, BSI, BAI, BitOffset);
// The value we want is 0 if we came directly from the initial block
// (having failed the range or alignment checks), or the loaded bit if
@@ -368,14 +486,14 @@ Value *LowerBitSets::lowerBitSetCall(
/// Given a disjoint set of bitsets and globals, layout the globals, build the
/// bit sets and lower the llvm.bitset.test calls.
void LowerBitSets::buildBitSetsFromGlobals(
- Module &M,
const std::vector<MDString *> &BitSets,
const std::vector<GlobalVariable *> &Globals) {
// Build a new global with the combined contents of the referenced globals.
std::vector<Constant *> GlobalInits;
+ const DataLayout &DL = M->getDataLayout();
for (GlobalVariable *G : Globals) {
GlobalInits.push_back(G->getInitializer());
- uint64_t InitSize = DL->getTypeAllocSize(G->getInitializer()->getType());
+ uint64_t InitSize = DL.getTypeAllocSize(G->getInitializer()->getType());
// Compute the amount of padding required to align the next element to the
// next power of 2.
@@ -391,13 +509,13 @@ void LowerBitSets::buildBitSetsFromGlobals(
}
if (!GlobalInits.empty())
GlobalInits.pop_back();
- Constant *NewInit = ConstantStruct::getAnon(M.getContext(), GlobalInits);
+ Constant *NewInit = ConstantStruct::getAnon(M->getContext(), GlobalInits);
auto CombinedGlobal =
- new GlobalVariable(M, NewInit->getType(), /*isConstant=*/true,
+ new GlobalVariable(*M, NewInit->getType(), /*isConstant=*/true,
GlobalValue::PrivateLinkage, NewInit);
const StructLayout *CombinedGlobalLayout =
- DL->getStructLayout(cast<StructType>(NewInit->getType()));
+ DL.getStructLayout(cast<StructType>(NewInit->getType()));
// Compute the offsets of the original globals within the new global.
DenseMap<GlobalVariable *, uint64_t> GlobalLayout;
@@ -410,18 +528,12 @@ void LowerBitSets::buildBitSetsFromGlobals(
// Build the bitset.
BitSetInfo BSI = buildBitSet(BS, GlobalLayout);
- // Create a global in which to store it.
- ++NumBitSetsCreated;
- Constant *BitsConst = ConstantDataArray::get(M.getContext(), BSI.Bits);
- auto BitSetGlobal = new GlobalVariable(
- M, BitsConst->getType(), /*isConstant=*/true,
- GlobalValue::PrivateLinkage, BitsConst, BS->getString() + ".bits");
+ ByteArrayInfo *BAI = 0;
// Lower each call to llvm.bitset.test for this bitset.
for (CallInst *CI : BitSetTestCallSites[BS]) {
++NumBitSetCallsLowered;
- Value *Lowered =
- lowerBitSetCall(CI, BSI, BitSetGlobal, CombinedGlobal, GlobalLayout);
+ Value *Lowered = lowerBitSetCall(CI, BSI, BAI, CombinedGlobal, GlobalLayout);
CI->replaceAllUsesWith(Lowered);
CI->eraseFromParent();
}
@@ -436,20 +548,24 @@ void LowerBitSets::buildBitSetsFromGlobals(
ConstantInt::get(Int32Ty, I * 2)};
Constant *CombinedGlobalElemPtr =
ConstantExpr::getGetElementPtr(CombinedGlobal, CombinedGlobalIdxs);
- GlobalAlias *GAlias = GlobalAlias::create(
- Globals[I]->getType()->getElementType(),
- Globals[I]->getType()->getAddressSpace(), Globals[I]->getLinkage(),
- "", CombinedGlobalElemPtr, &M);
- GAlias->takeName(Globals[I]);
- Globals[I]->replaceAllUsesWith(GAlias);
+ if (LinkerSubsectionsViaSymbols) {
+ Globals[I]->replaceAllUsesWith(CombinedGlobalElemPtr);
+ } else {
+ GlobalAlias *GAlias = GlobalAlias::create(
+ Globals[I]->getType()->getElementType(),
+ Globals[I]->getType()->getAddressSpace(), Globals[I]->getLinkage(),
+ "", CombinedGlobalElemPtr, M);
+ GAlias->takeName(Globals[I]);
+ Globals[I]->replaceAllUsesWith(GAlias);
+ }
Globals[I]->eraseFromParent();
}
}
/// Lower all bit sets in this module.
-bool LowerBitSets::buildBitSets(Module &M) {
+bool LowerBitSets::buildBitSets() {
Function *BitSetTestFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::bitset_test));
+ M->getFunction(Intrinsic::getName(Intrinsic::bitset_test));
if (!BitSetTestFunc)
return false;
@@ -591,22 +707,24 @@ bool LowerBitSets::buildBitSets(Module &M) {
});
// Build the bitsets from this disjoint set.
- buildBitSetsFromGlobals(M, BitSets, OrderedGlobals);
+ buildBitSetsFromGlobals(BitSets, OrderedGlobals);
}
+ allocateByteArrays();
+
return true;
}
-bool LowerBitSets::eraseBitSetMetadata(Module &M) {
+bool LowerBitSets::eraseBitSetMetadata() {
if (!BitSetNM)
return false;
- M.eraseNamedMetadata(BitSetNM);
+ M->eraseNamedMetadata(BitSetNM);
return true;
}
bool LowerBitSets::runOnModule(Module &M) {
- bool Changed = buildBitSets(M);
- Changed |= eraseBitSetMetadata(M);
+ bool Changed = buildBitSets();
+ Changed |= eraseBitSetMetadata();
return Changed;
}
diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp
index b91ebf2..596674d 100644
--- a/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/lib/Transforms/IPO/MergeFunctions.cpp
@@ -127,9 +127,8 @@ namespace {
/// side of claiming that two functions are different).
class FunctionComparator {
public:
- FunctionComparator(const DataLayout *DL, const Function *F1,
- const Function *F2)
- : FnL(F1), FnR(F2), DL(DL) {}
+ FunctionComparator(const Function *F1, const Function *F2)
+ : FnL(F1), FnR(F2) {}
/// Test whether the two functions have equivalent behaviour.
int compare();
@@ -292,8 +291,7 @@ private:
/// Parts to be compared for each comparison stage,
/// most significant stage first:
/// 1. Address space. As numbers.
- /// 2. Constant offset, (if "DataLayout *DL" field is not NULL,
- /// using GEPOperator::accumulateConstantOffset method).
+ /// 2. Constant offset (using the GEPOperator::accumulateConstantOffset method).
/// 3. Pointer operand type (using cmpType method).
/// 4. Number of operands.
/// 5. Compare operands, using cmpValues method.
@@ -354,8 +352,6 @@ private:
// The two functions undergoing comparison.
const Function *FnL, *FnR;
- const DataLayout *DL;
-
/// Assign serial numbers to values from left function, and values from
/// right function.
/// Explanation:
@@ -394,14 +390,13 @@ private:
class FunctionNode {
AssertingVH<Function> F;
- const DataLayout *DL;
public:
- FunctionNode(Function *F, const DataLayout *DL) : F(F), DL(DL) {}
+ FunctionNode(Function *F) : F(F) {}
Function *getFunc() const { return F; }
void release() { F = 0; }
bool operator<(const FunctionNode &RHS) const {
- return (FunctionComparator(DL, F, RHS.getFunc()).compare()) == -1;
+ return (FunctionComparator(F, RHS.getFunc()).compare()) == -1;
}
};
}
@@ -620,10 +615,11 @@ int FunctionComparator::cmpTypes(Type *TyL, Type *TyR) const {
PointerType *PTyL = dyn_cast<PointerType>(TyL);
PointerType *PTyR = dyn_cast<PointerType>(TyR);
- if (DL) {
- if (PTyL && PTyL->getAddressSpace() == 0) TyL = DL->getIntPtrType(TyL);
- if (PTyR && PTyR->getAddressSpace() == 0) TyR = DL->getIntPtrType(TyR);
- }
+ const DataLayout &DL = FnL->getParent()->getDataLayout();
+ if (PTyL && PTyL->getAddressSpace() == 0)
+ TyL = DL.getIntPtrType(TyL);
+ if (PTyR && PTyR->getAddressSpace() == 0)
+ TyR = DL.getIntPtrType(TyR);
if (TyL == TyR)
return 0;
@@ -855,13 +851,12 @@ int FunctionComparator::cmpGEPs(const GEPOperator *GEPL,
// We can reduce the GEP down to the value in bytes
// added to the address.
- if (DL) {
- unsigned BitWidth = DL->getPointerSizeInBits(ASL);
- APInt OffsetL(BitWidth, 0), OffsetR(BitWidth, 0);
- if (GEPL->accumulateConstantOffset(*DL, OffsetL) &&
- GEPR->accumulateConstantOffset(*DL, OffsetR))
- return cmpAPInts(OffsetL, OffsetR);
- }
+ const DataLayout &DL = FnL->getParent()->getDataLayout();
+ unsigned BitWidth = DL.getPointerSizeInBits(ASL);
+ APInt OffsetL(BitWidth, 0), OffsetR(BitWidth, 0);
+ if (GEPL->accumulateConstantOffset(DL, OffsetL) &&
+ GEPR->accumulateConstantOffset(DL, OffsetR))
+ return cmpAPInts(OffsetL, OffsetR);
if (int Res = cmpNumbers((uint64_t)GEPL->getPointerOperand()->getType(),
(uint64_t)GEPR->getPointerOperand()->getType()))
@@ -1122,9 +1117,6 @@ private:
/// to modify it.
FnTreeType FnTree;
- /// DataLayout for more accurate GEP comparisons. May be NULL.
- const DataLayout *DL;
-
/// Whether or not the target supports global aliases.
bool HasGlobalAliases;
};
@@ -1152,8 +1144,8 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) {
for (std::vector<WeakVH>::iterator J = I; J != E && j < Max; ++J, ++j) {
Function *F1 = cast<Function>(*I);
Function *F2 = cast<Function>(*J);
- int Res1 = FunctionComparator(DL, F1, F2).compare();
- int Res2 = FunctionComparator(DL, F2, F1).compare();
+ int Res1 = FunctionComparator(F1, F2).compare();
+ int Res2 = FunctionComparator(F2, F1).compare();
// If F1 <= F2, then F2 >= F1, otherwise report failure.
if (Res1 != -Res2) {
@@ -1174,8 +1166,8 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) {
continue;
Function *F3 = cast<Function>(*K);
- int Res3 = FunctionComparator(DL, F1, F3).compare();
- int Res4 = FunctionComparator(DL, F2, F3).compare();
+ int Res3 = FunctionComparator(F1, F3).compare();
+ int Res4 = FunctionComparator(F2, F3).compare();
bool Transitive = true;
@@ -1212,8 +1204,6 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) {
bool MergeFunctions::runOnModule(Module &M) {
bool Changed = false;
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- DL = DLP ? &DLP->getDataLayout() : nullptr;
for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage())
@@ -1420,7 +1410,7 @@ void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) {
// that was already inserted.
bool MergeFunctions::insert(Function *NewFunction) {
std::pair<FnTreeType::iterator, bool> Result =
- FnTree.insert(FunctionNode(NewFunction, DL));
+ FnTree.insert(FunctionNode(NewFunction));
if (Result.second) {
DEBUG(dbgs() << "Inserting as unique: " << NewFunction->getName() << '\n');
@@ -1457,7 +1447,7 @@ bool MergeFunctions::insert(Function *NewFunction) {
void MergeFunctions::remove(Function *F) {
// We need to make sure we remove F, not a function "equal" to F per the
// function equality comparator.
- FnTreeType::iterator found = FnTree.find(FunctionNode(F, DL));
+ FnTreeType::iterator found = FnTree.find(FunctionNode(F));
size_t Erased = 0;
if (found != FnTree.end() && found->getFunc() == F) {
Erased = 1;
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 9a75050..d28d563 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -77,6 +77,10 @@ static cl::opt<bool>
EnableMLSM("mlsm", cl::init(true), cl::Hidden,
cl::desc("Enable motion of merged load and store"));
+static cl::opt<bool> EnableLoopInterchange(
+ "enable-loopinterchange", cl::init(false), cl::Hidden,
+ cl::desc("Enable the new, experimental LoopInterchange Pass"));
+
PassManagerBuilder::PassManagerBuilder() {
OptLevel = 2;
SizeLevel = 0;
@@ -93,7 +97,6 @@ PassManagerBuilder::PassManagerBuilder() {
DisableGVNLoadPRE = false;
VerifyInput = false;
VerifyOutput = false;
- StripDebug = false;
MergeFunctions = false;
}
@@ -239,6 +242,8 @@ void PassManagerBuilder::populateModulePassManager(
MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars
MPM.add(createLoopIdiomPass()); // Recognize idioms like memset.
MPM.add(createLoopDeletionPass()); // Delete dead loops
+ if (EnableLoopInterchange)
+ MPM.add(createLoopInterchangePass()); // Interchange loops
if (!DisableUnrollLoops)
MPM.add(createSimpleLoopUnrollPass()); // Unroll small loops
@@ -305,8 +310,7 @@ void PassManagerBuilder::populateModulePassManager(
// Re-rotate loops in all our loop nests. These may have fallen out of
// rotated form due to GVN or other transformations, and the vectorizer relies
// on the rotated form.
- if (ExtraVectorizerPasses)
- MPM.add(createLoopRotatePass());
+ MPM.add(createLoopRotatePass());
MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize));
// FIXME: Because of #pragma vectorize enable, the passes below are always
@@ -358,9 +362,20 @@ void PassManagerBuilder::populateModulePassManager(
MPM.add(createCFGSimplificationPass());
MPM.add(createInstructionCombiningPass());
- if (!DisableUnrollLoops)
+ if (!DisableUnrollLoops) {
MPM.add(createLoopUnrollPass()); // Unroll small loops
+ // This is a barrier pass to avoid combining the LICM pass and the loop
+ // unroll pass within the same loop pass manager.
+ MPM.add(createInstructionSimplifierPass());
+
+ // Runtime unrolling will introduce a runtime check in the loop prologue.
+ // If the unrolled loop is an inner loop, then the prologue will be inside
+ // the outer loop. The LICM pass can help to hoist the runtime check out
+ // if the checked value is loop invariant.
+ MPM.add(createLICMPass());
+ }
+
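// A hypothetical example of the interaction described above (not part of
// this patch): after the inner loop is runtime-unrolled, each outer
// iteration would re-evaluate a prologue/remainder check derived from
// `m`; since `m` is invariant in the outer loop, LICM can hoist it out.
void scale(float *a, int n, int m) {
  for (int i = 0; i < n; ++i)    // outer loop
    for (int j = 0; j < m; ++j)  // inner loop: runtime-unrolled
      a[i * m + j] *= 2.0f;      // remainder check on `m` is hoistable
}
int main() {
  float A[6] = {1, 2, 3, 4, 5, 6};
  scale(A, 2, 3);
  return A[5] == 12.0f ? 0 : 1;
}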
// After vectorization and unrolling, assume intrinsics may tell us more
// about pointer alignments.
MPM.add(createAlignmentFromAssumptionsPass());
@@ -454,6 +469,9 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
// More loops are countable; try to optimize them.
PM.add(createIndVarSimplifyPass());
PM.add(createLoopDeletionPass());
+ if (EnableLoopInterchange)
+ PM.add(createLoopInterchangePass());
+
PM.add(createLoopVectorizePass(true, LoopVectorize));
// More scalar chains could be vectorized due to more alias information
@@ -473,10 +491,10 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
addExtensionsToPM(EP_Peephole, PM);
PM.add(createJumpThreadingPass());
+}
- // Lower bitset metadata to bitsets.
- PM.add(createLowerBitSetsPass());
-
+void PassManagerBuilder::addLateLTOOptimizationPasses(
+ legacy::PassManagerBase &PM) {
// Delete basic blocks, which optimization passes may have killed.
PM.add(createCFGSimplificationPass());
@@ -496,19 +514,19 @@ void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) {
if (VerifyInput)
PM.add(createVerifierPass());
- if (StripDebug)
- PM.add(createStripSymbolsPass(true));
+ if (OptLevel > 1)
+ addLTOOptimizationPasses(PM);
- if (VerifyInput)
- PM.add(createDebugInfoVerifierPass());
+ // Lower bit sets to globals. This pass supports Clang's control flow
+ // integrity mechanisms (-fsanitize=cfi*) and needs to run at link time if CFI
+ // is enabled. The pass does nothing if CFI is disabled.
+ PM.add(createLowerBitSetsPass());
if (OptLevel != 0)
- addLTOOptimizationPasses(PM);
+ addLateLTOOptimizationPasses(PM);
- if (VerifyOutput) {
+ if (VerifyOutput)
PM.add(createVerifierPass());
- PM.add(createDebugInfoVerifierPass());
- }
}
inline PassManagerBuilder *unwrap(LLVMPassManagerBuilderRef P) {
diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 752f79d..c608f84 100644
--- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -891,7 +891,7 @@ static bool checkRippleForAdd(const APInt &Op0KnownZero,
/// This basically requires proving that the add in the original type would not
/// overflow to change the sign bit or have a carry out.
bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS,
- Instruction *CxtI) {
+ Instruction &CxtI) {
// There are different heuristics we can use for this. Here are some simple
// ones.
@@ -909,18 +909,18 @@ bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS,
//
// Since the carry into the most significant position is always equal to
// the carry out of the addition, there is no signed overflow.
- if (ComputeNumSignBits(LHS, 0, CxtI) > 1 &&
- ComputeNumSignBits(RHS, 0, CxtI) > 1)
+ if (ComputeNumSignBits(LHS, 0, &CxtI) > 1 &&
+ ComputeNumSignBits(RHS, 0, &CxtI) > 1)
return true;
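// Minimal exhaustive check of the sign-bit argument for i8 (illustrative,
// not part of this patch): any i8 value with at least two sign bits lies
// in [-64, 63], and every such pair sums without signed overflow.
#include <cassert>
int main() {
  for (int L = -64; L <= 63; ++L)
    for (int R = -64; R <= 63; ++R)
      assert(L + R >= -128 && L + R <= 127); // sum still fits in i8
}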
unsigned BitWidth = LHS->getType()->getScalarSizeInBits();
APInt LHSKnownZero(BitWidth, 0);
APInt LHSKnownOne(BitWidth, 0);
- computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, 0, CxtI);
+ computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, 0, &CxtI);
APInt RHSKnownZero(BitWidth, 0);
APInt RHSKnownOne(BitWidth, 0);
- computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, CxtI);
+ computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, &CxtI);
// Addition of two 2's complement numbers having opposite signs will never
// overflow.
@@ -943,21 +943,21 @@ bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS,
/// overflow to change the sign bit or have a carry out.
/// TODO: Handle this for Vectors.
bool InstCombiner::WillNotOverflowSignedSub(Value *LHS, Value *RHS,
- Instruction *CxtI) {
+ Instruction &CxtI) {
// If LHS and RHS each have at least two sign bits, the subtraction
// cannot overflow.
- if (ComputeNumSignBits(LHS, 0, CxtI) > 1 &&
- ComputeNumSignBits(RHS, 0, CxtI) > 1)
+ if (ComputeNumSignBits(LHS, 0, &CxtI) > 1 &&
+ ComputeNumSignBits(RHS, 0, &CxtI) > 1)
return true;
unsigned BitWidth = LHS->getType()->getScalarSizeInBits();
APInt LHSKnownZero(BitWidth, 0);
APInt LHSKnownOne(BitWidth, 0);
- computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, 0, CxtI);
+ computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, 0, &CxtI);
APInt RHSKnownZero(BitWidth, 0);
APInt RHSKnownOne(BitWidth, 0);
- computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, CxtI);
+ computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, &CxtI);
// Subtraction of two 2's complement numbers having identical signs will
// never overflow.
@@ -972,12 +972,14 @@ bool InstCombiner::WillNotOverflowSignedSub(Value *LHS, Value *RHS,
/// \brief Return true if we can prove that:
/// (sub LHS, RHS) === (sub nuw LHS, RHS)
bool InstCombiner::WillNotOverflowUnsignedSub(Value *LHS, Value *RHS,
- Instruction *CxtI) {
+ Instruction &CxtI) {
// If the LHS is negative and the RHS is non-negative, no unsigned wrap.
bool LHSKnownNonNegative, LHSKnownNegative;
bool RHSKnownNonNegative, RHSKnownNegative;
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, /*Depth=*/0, CxtI);
- ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, /*Depth=*/0, CxtI);
+ ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, /*Depth=*/0,
+ &CxtI);
+ ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, /*Depth=*/0,
+ &CxtI);
if (LHSKnownNegative && RHSKnownNonNegative)
return true;
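// Quick check of the claim for i8 (illustrative, not part of this patch):
// a "negative" LHS has its top bit set, so as an unsigned number it is at
// least 128, while a non-negative RHS is at most 127; LHS - RHS never wraps.
#include <cassert>
int main() {
  for (unsigned L = 128; L <= 255; ++L)  // sign bit set
    for (unsigned R = 0; R <= 127; ++R)  // sign bit clear
      assert(L >= R);                    // L - R cannot wrap
}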
@@ -1046,15 +1048,15 @@ static Value *checkForNegativeOperand(BinaryOperator &I,
}
Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
- bool Changed = SimplifyAssociativeOrCommutative(I);
- Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+ bool Changed = SimplifyAssociativeOrCommutative(I);
+ Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
- if (Value *V = SimplifyVectorOp(I))
- return ReplaceInstUsesWith(I, V);
+ if (Value *V = SimplifyVectorOp(I))
+ return ReplaceInstUsesWith(I, V);
- if (Value *V = SimplifyAddInst(LHS, RHS, I.hasNoSignedWrap(),
- I.hasNoUnsignedWrap(), DL, TLI, DT, AC))
- return ReplaceInstUsesWith(I, V);
+ if (Value *V = SimplifyAddInst(LHS, RHS, I.hasNoSignedWrap(),
+ I.hasNoUnsignedWrap(), DL, TLI, DT, AC))
+ return ReplaceInstUsesWith(I, V);
// (A*B)+(A*C) -> A*(B+C) etc
if (Value *V = SimplifyUsingDistributiveLaws(I))
@@ -1243,7 +1245,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
ConstantExpr::getTrunc(RHSC, LHSConv->getOperand(0)->getType());
if (LHSConv->hasOneUse() &&
ConstantExpr::getSExt(CI, I.getType()) == RHSC &&
- WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI, &I)) {
+ WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI, I)) {
// Insert the new, smaller add.
Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0),
CI, "addconv");
@@ -1256,10 +1258,11 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
// Only do this if x/y have the same type, if at least one of them has a
// single use (so we don't increase the number of sexts), and if the
// integer add will not overflow.
- if (LHSConv->getOperand(0)->getType()==RHSConv->getOperand(0)->getType()&&
+ if (LHSConv->getOperand(0)->getType() ==
+ RHSConv->getOperand(0)->getType() &&
(LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
WillNotOverflowSignedAdd(LHSConv->getOperand(0),
- RHSConv->getOperand(0), &I)) {
+ RHSConv->getOperand(0), I)) {
// Insert the new integer add.
Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0),
RHSConv->getOperand(0), "addconv");
@@ -1307,7 +1310,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
// TODO(jingyue): Consider WillNotOverflowSignedAdd and
// WillNotOverflowUnsignedAdd to reduce the number of invocations of
// computeKnownBits.
- if (!I.hasNoSignedWrap() && WillNotOverflowSignedAdd(LHS, RHS, &I)) {
+ if (!I.hasNoSignedWrap() && WillNotOverflowSignedAdd(LHS, RHS, I)) {
Changed = true;
I.setHasNoSignedWrap(true);
}
@@ -1371,7 +1374,7 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
ConstantExpr::getFPToSI(CFP, LHSConv->getOperand(0)->getType());
if (LHSConv->hasOneUse() &&
ConstantExpr::getSIToFP(CI, I.getType()) == CFP &&
- WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI, &I)) {
+ WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI, I)) {
// Insert the new integer add.
Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0),
CI, "addconv");
@@ -1384,10 +1387,11 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
// Only do this if x/y have the same type, if at least one of them has a
// single use (so we don't increase the number of int->fp conversions),
// and if the integer add will not overflow.
- if (LHSConv->getOperand(0)->getType()==RHSConv->getOperand(0)->getType()&&
+ if (LHSConv->getOperand(0)->getType() ==
+ RHSConv->getOperand(0)->getType() &&
(LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
WillNotOverflowSignedAdd(LHSConv->getOperand(0),
- RHSConv->getOperand(0), &I)) {
+ RHSConv->getOperand(0), I)) {
// Insert the new integer add.
Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0),
RHSConv->getOperand(0), "addconv");
@@ -1436,8 +1440,6 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
///
Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS,
Type *Ty) {
- assert(DL && "Must have target data info for this");
-
// If LHS is a gep based on RHS or RHS is a gep based on LHS, we can optimize
// this.
bool Swapped = false;
@@ -1662,26 +1664,24 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
// Optimize differences of pointers into the same array into a size. Consider:
// &A[10] - &A[0]: we should compile this to "10".
- if (DL) {
- Value *LHSOp, *RHSOp;
- if (match(Op0, m_PtrToInt(m_Value(LHSOp))) &&
- match(Op1, m_PtrToInt(m_Value(RHSOp))))
- if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType()))
- return ReplaceInstUsesWith(I, Res);
-
- // trunc(p)-trunc(q) -> trunc(p-q)
- if (match(Op0, m_Trunc(m_PtrToInt(m_Value(LHSOp)))) &&
- match(Op1, m_Trunc(m_PtrToInt(m_Value(RHSOp)))))
- if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType()))
- return ReplaceInstUsesWith(I, Res);
- }
+ Value *LHSOp, *RHSOp;
+ if (match(Op0, m_PtrToInt(m_Value(LHSOp))) &&
+ match(Op1, m_PtrToInt(m_Value(RHSOp))))
+ if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType()))
+ return ReplaceInstUsesWith(I, Res);
+
+ // trunc(p)-trunc(q) -> trunc(p-q)
+ if (match(Op0, m_Trunc(m_PtrToInt(m_Value(LHSOp)))) &&
+ match(Op1, m_Trunc(m_PtrToInt(m_Value(RHSOp)))))
+ if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType()))
+ return ReplaceInstUsesWith(I, Res);
bool Changed = false;
- if (!I.hasNoSignedWrap() && WillNotOverflowSignedSub(Op0, Op1, &I)) {
+ if (!I.hasNoSignedWrap() && WillNotOverflowSignedSub(Op0, Op1, I)) {
Changed = true;
I.setHasNoSignedWrap(true);
}
- if (!I.hasNoUnsignedWrap() && WillNotOverflowUnsignedSub(Op0, Op1, &I)) {
+ if (!I.hasNoUnsignedWrap() && WillNotOverflowUnsignedSub(Op0, Op1, I)) {
Changed = true;
I.setHasNoUnsignedWrap(true);
}
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 863eeaf..ee21c81 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -979,9 +979,9 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
// Make a constant range that's the intersection of the two icmp ranges.
// If the intersection is empty, we know that the result is false.
ConstantRange LHSRange =
- ConstantRange::makeICmpRegion(LHSCC, LHSCst->getValue());
+ ConstantRange::makeAllowedICmpRegion(LHSCC, LHSCst->getValue());
ConstantRange RHSRange =
- ConstantRange::makeICmpRegion(RHSCC, RHSCst->getValue());
+ ConstantRange::makeAllowedICmpRegion(RHSCC, RHSCst->getValue());
if (LHSRange.intersectWith(RHSRange).isEmptySet())
return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0);
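// Toy model of the fold above (illustrative, signed i8, non-wrapping ranges
// only; the real ConstantRange also handles wrapped sets): the allowed
// region of `x slt 0` is [-128, 0) and of `x sgt 5` is [6, 128); they do
// not intersect, so `(x < 0) && (x > 5)` folds to false.
#include <algorithm>
#include <cassert>
struct Range { int Lo, Hi; }; // half-open [Lo, Hi)
static bool emptyIntersect(Range A, Range B) {
  return std::max(A.Lo, B.Lo) >= std::min(A.Hi, B.Hi);
}
int main() {
  assert(emptyIntersect({-128, 0}, {6, 128}));   // x<0 && x>5 -> false
  assert(!emptyIntersect({-128, 10}, {6, 128})); // x<10 && x>5 may hold
}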
@@ -1709,15 +1709,17 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
Value *Mask = nullptr;
Value *Masked = nullptr;
if (LAnd->getOperand(0) == RAnd->getOperand(0) &&
- isKnownToBeAPowerOfTwo(LAnd->getOperand(1), false, 0, AC, CxtI, DT) &&
- isKnownToBeAPowerOfTwo(RAnd->getOperand(1), false, 0, AC, CxtI, DT)) {
+ isKnownToBeAPowerOfTwo(LAnd->getOperand(1), DL, false, 0, AC, CxtI,
+ DT) &&
+ isKnownToBeAPowerOfTwo(RAnd->getOperand(1), DL, false, 0, AC, CxtI,
+ DT)) {
Mask = Builder->CreateOr(LAnd->getOperand(1), RAnd->getOperand(1));
Masked = Builder->CreateAnd(LAnd->getOperand(0), Mask);
} else if (LAnd->getOperand(1) == RAnd->getOperand(1) &&
- isKnownToBeAPowerOfTwo(LAnd->getOperand(0), false, 0, AC, CxtI,
- DT) &&
- isKnownToBeAPowerOfTwo(RAnd->getOperand(0), false, 0, AC, CxtI,
- DT)) {
+ isKnownToBeAPowerOfTwo(LAnd->getOperand(0), DL, false, 0, AC,
+ CxtI, DT) &&
+ isKnownToBeAPowerOfTwo(RAnd->getOperand(0), DL, false, 0, AC,
+ CxtI, DT)) {
Mask = Builder->CreateOr(LAnd->getOperand(0), RAnd->getOperand(0));
Masked = Builder->CreateAnd(LAnd->getOperand(1), Mask);
}
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 05e7162..21243c2 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -15,7 +15,6 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/IR/CallSite.h"
-#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Statepoint.h"
@@ -61,8 +60,8 @@ static Type *reduceToSingleValueType(Type *T) {
}
Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
- unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, AC, MI, DT);
- unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, AC, MI, DT);
+ unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, MI, AC, DT);
+ unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, MI, AC, DT);
unsigned MinAlign = std::min(DstAlign, SrcAlign);
unsigned CopyAlign = MI->getAlignment();
@@ -108,7 +107,7 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
if (StrippedDest != MI->getArgOperand(0)) {
Type *SrcETy = cast<PointerType>(StrippedDest->getType())
->getElementType();
- if (DL && SrcETy->isSized() && DL->getTypeStoreSize(SrcETy) == Size) {
+ if (SrcETy->isSized() && DL.getTypeStoreSize(SrcETy) == Size) {
// The SrcETy might be something like {{{double}}} or [1 x double]. Rip
// down through these levels if so.
SrcETy = reduceToSingleValueType(SrcETy);
@@ -156,7 +155,7 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
}
Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
- unsigned Alignment = getKnownAlignment(MI->getDest(), DL, AC, MI, DT);
+ unsigned Alignment = getKnownAlignment(MI->getDest(), DL, MI, AC, DT);
if (MI->getAlignment() < Alignment) {
MI->setAlignment(ConstantInt::get(MI->getAlignmentType(),
Alignment, false));
@@ -198,6 +197,71 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
return nullptr;
}
+/// The shuffle mask for a perm2*128 selects any two halves of two 256-bit
+/// source vectors. If a zero bit is set in the immediate, that half of the
+/// mask is ignored and the corresponding half of the destination is cleared.
+static Value *SimplifyX86vperm2(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ if (auto CInt = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
+ VectorType *VecTy = cast<VectorType>(II.getType());
+ ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
+
+ // The immediate permute control byte looks like this:
+ // [1:0] - select 128 bits from sources for low half of destination
+ // [2] - ignore
+ // [3] - zero low half of destination
+ // [5:4] - select 128 bits from sources for high half of destination
+ // [6] - ignore
+ // [7] - zero high half of destination
+
+ uint8_t Imm = CInt->getZExtValue();
+
+ bool LowHalfZero = Imm & 0x08;
+ bool HighHalfZero = Imm & 0x80;
+
+ // If both zero mask bits are set, this was just a weird way to
+ // generate a zero vector.
+ if (LowHalfZero && HighHalfZero)
+ return ZeroVector;
+
+ // If 0 or 1 zero mask bits are set, this is a simple shuffle.
+ unsigned NumElts = VecTy->getNumElements();
+ unsigned HalfSize = NumElts / 2;
+ SmallVector<int, 8> ShuffleMask(NumElts);
+
+ // The high bit of the selection field chooses the 1st or 2nd operand.
+ bool LowInputSelect = Imm & 0x02;
+ bool HighInputSelect = Imm & 0x20;
+
+ // The low bit of the selection field chooses the low or high half
+ // of the selected operand.
+ bool LowHalfSelect = Imm & 0x01;
+ bool HighHalfSelect = Imm & 0x10;
+
+ // Determine which operand(s) are actually in use for this instruction.
+ Value *V0 = LowInputSelect ? II.getArgOperand(1) : II.getArgOperand(0);
+ Value *V1 = HighInputSelect ? II.getArgOperand(1) : II.getArgOperand(0);
+
+ // If needed, replace operands based on zero mask.
+ V0 = LowHalfZero ? ZeroVector : V0;
+ V1 = HighHalfZero ? ZeroVector : V1;
+
+ // Permute low half of result.
+ unsigned StartIndex = LowHalfSelect ? HalfSize : 0;
+ for (unsigned i = 0; i < HalfSize; ++i)
+ ShuffleMask[i] = StartIndex + i;
+
+ // Permute high half of result.
+ StartIndex = HighHalfSelect ? HalfSize : 0;
+ StartIndex += NumElts;
+ for (unsigned i = 0; i < HalfSize; ++i)
+ ShuffleMask[i + HalfSize] = StartIndex + i;
+
+ return Builder.CreateShuffleVector(V0, V1, ShuffleMask);
+ }
+ return nullptr;
+}
+
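// Standalone model of the lowering above (illustrative sketch, assuming
// the 4-element 256-bit case): index 8, one past the two concatenated
// inputs, stands in here for "element taken from the zero vector"; the
// pass itself instead substitutes a zero vector for the zeroed operand.
#include <array>
#include <cassert>
#include <cstdint>
static std::array<int, 4> perm2Mask(uint8_t Imm) {
  std::array<int, 4> Mask;
  for (int Half = 0; Half < 2; ++Half) {
    unsigned Sel = (Imm >> (4 * Half)) & 0x3; // field [1:0] or [5:4]
    bool Zero = Imm & (Half ? 0x80 : 0x08);   // zero bit 3 or bit 7
    for (int i = 0; i < 2; ++i)
      Mask[2 * Half + i] = Zero ? 8 : int(2 * Sel + i);
  }
  return Mask;
}
int main() {
  // Imm = 0x31: high half of op0 into the low half of the result,
  // high half of op1 into the high half of the result.
  assert(perm2Mask(0x31) == (std::array<int, 4>{2, 3, 6, 7}));
}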
/// visitCallInst - CallInst simplification. This mostly only handles folding
/// of intrinsic instructions. For normal calls, it allows visitCallSite to do
/// the heavy lifting.
@@ -386,7 +450,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// can prove that it will never overflow.
if (II->getIntrinsicID() == Intrinsic::sadd_with_overflow) {
Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1);
- if (WillNotOverflowSignedAdd(LHS, RHS, II)) {
+ if (WillNotOverflowSignedAdd(LHS, RHS, *II)) {
return CreateOverflowTuple(II, Builder->CreateNSWAdd(LHS, RHS), false);
}
}
@@ -407,11 +471,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
}
}
if (II->getIntrinsicID() == Intrinsic::ssub_with_overflow) {
- if (WillNotOverflowSignedSub(LHS, RHS, II)) {
+ if (WillNotOverflowSignedSub(LHS, RHS, *II)) {
return CreateOverflowTuple(II, Builder->CreateNSWSub(LHS, RHS), false);
}
} else {
- if (WillNotOverflowUnsignedSub(LHS, RHS, II)) {
+ if (WillNotOverflowUnsignedSub(LHS, RHS, *II)) {
return CreateOverflowTuple(II, Builder->CreateNUWSub(LHS, RHS), false);
}
}
@@ -452,7 +516,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
}
if (II->getIntrinsicID() == Intrinsic::smul_with_overflow) {
Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1);
- if (WillNotOverflowSignedMul(LHS, RHS, II)) {
+ if (WillNotOverflowSignedMul(LHS, RHS, *II)) {
return CreateOverflowTuple(II, Builder->CreateNSWMul(LHS, RHS), false);
}
}
@@ -544,7 +608,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::ppc_altivec_lvx:
case Intrinsic::ppc_altivec_lvxl:
// Turn PPC lvx -> load if the pointer is known aligned.
- if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, AC, II, DT) >=
+ if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, AC, DT) >=
16) {
Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0),
PointerType::getUnqual(II->getType()));
@@ -561,7 +625,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::ppc_altivec_stvx:
case Intrinsic::ppc_altivec_stvxl:
// Turn stvx -> store if the pointer is known aligned.
- if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, AC, II, DT) >=
+ if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, AC, DT) >=
16) {
Type *OpPtrTy =
PointerType::getUnqual(II->getArgOperand(0)->getType());
@@ -578,7 +642,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
}
case Intrinsic::ppc_qpx_qvlfs:
// Turn PPC QPX qvlfs -> load if the pointer is known aligned.
- if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, AC, II, DT) >=
+ if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, AC, DT) >=
16) {
Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0),
PointerType::getUnqual(II->getType()));
@@ -587,7 +651,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
case Intrinsic::ppc_qpx_qvlfd:
// Turn PPC QPX qvlfd -> load if the pointer is known aligned.
- if (getOrEnforceKnownAlignment(II->getArgOperand(0), 32, DL, AC, II, DT) >=
+ if (getOrEnforceKnownAlignment(II->getArgOperand(0), 32, DL, II, AC, DT) >=
32) {
Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0),
PointerType::getUnqual(II->getType()));
@@ -596,7 +660,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
case Intrinsic::ppc_qpx_qvstfs:
// Turn PPC QPX qvstfs -> store if the pointer is known aligned.
- if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, AC, II, DT) >=
+ if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, AC, DT) >=
16) {
Type *OpPtrTy =
PointerType::getUnqual(II->getArgOperand(0)->getType());
@@ -606,7 +670,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
case Intrinsic::ppc_qpx_qvstfd:
// Turn PPC QPX qvstfd -> store if the pointer is known aligned.
- if (getOrEnforceKnownAlignment(II->getArgOperand(1), 32, DL, AC, II, DT) >=
+ if (getOrEnforceKnownAlignment(II->getArgOperand(1), 32, DL, II, AC, DT) >=
32) {
Type *OpPtrTy =
PointerType::getUnqual(II->getArgOperand(0)->getType());
@@ -618,7 +682,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_sse2_storeu_pd:
case Intrinsic::x86_sse2_storeu_dq:
// Turn X86 storeu -> store if the pointer is known aligned.
- if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, AC, II, DT) >=
+ if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, AC, DT) >=
16) {
Type *OpPtrTy =
PointerType::getUnqual(II->getArgOperand(1)->getType());
@@ -735,9 +799,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
unsigned LowHalfElts = VWidth / 2;
APInt InputDemandedElts(APInt::getBitsSet(VWidth, 0, LowHalfElts));
APInt UndefElts(VWidth, 0);
- if (Value *TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0),
- InputDemandedElts,
- UndefElts)) {
+ if (Value *TmpV = SimplifyDemandedVectorElts(
+ II->getArgOperand(0), InputDemandedElts, UndefElts)) {
II->setArgOperand(0, TmpV);
return II;
}
@@ -906,6 +969,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return ReplaceInstUsesWith(CI, Shuffle);
}
+ case Intrinsic::x86_avx_vperm2f128_pd_256:
+ case Intrinsic::x86_avx_vperm2f128_ps_256:
+ case Intrinsic::x86_avx_vperm2f128_si_256:
+ case Intrinsic::x86_avx2_vperm2i128:
+ if (Value *V = SimplifyX86vperm2(*II, *Builder))
+ return ReplaceInstUsesWith(*II, V);
+ break;
+
case Intrinsic::ppc_altivec_vperm:
// Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
// Note that ppc_altivec_vperm has a big-endian bias, so when creating
@@ -945,12 +1016,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
unsigned Idx =
cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue();
Idx &= 31; // Match the hardware behavior.
- if (DL && DL->isLittleEndian())
+ if (DL.isLittleEndian())
Idx = 31 - Idx;
if (!ExtractedElts[Idx]) {
- Value *Op0ToUse = (DL && DL->isLittleEndian()) ? Op1 : Op0;
- Value *Op1ToUse = (DL && DL->isLittleEndian()) ? Op0 : Op1;
+ Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0;
+ Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1;
ExtractedElts[Idx] =
Builder->CreateExtractElement(Idx < 16 ? Op0ToUse : Op1ToUse,
Builder->getInt32(Idx&15));
@@ -979,7 +1050,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::arm_neon_vst2lane:
case Intrinsic::arm_neon_vst3lane:
case Intrinsic::arm_neon_vst4lane: {
- unsigned MemAlign = getKnownAlignment(II->getArgOperand(0), DL, AC, II, DT);
+ unsigned MemAlign = getKnownAlignment(II->getArgOperand(0), DL, II, AC, DT);
unsigned AlignArg = II->getNumArgOperands() - 1;
ConstantInt *IntrAlign = dyn_cast<ConstantInt>(II->getArgOperand(AlignArg));
if (IntrAlign && IntrAlign->getZExtValue() < MemAlign) {
@@ -1118,7 +1189,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
RHS->getType()->isPointerTy() &&
cast<Constant>(RHS)->isNullValue()) {
LoadInst* LI = cast<LoadInst>(LHS);
- if (isValidAssumeForContext(II, LI, DL, DT)) {
+ if (isValidAssumeForContext(II, LI, DT)) {
MDNode *MD = MDNode::get(II->getContext(), None);
LI->setMetadata(LLVMContext::MD_nonnull, MD);
return EraseInstFromFunction(*II);
@@ -1192,8 +1263,8 @@ Instruction *InstCombiner::visitInvokeInst(InvokeInst &II) {
/// isSafeToEliminateVarargsCast - If this cast does not affect the value
/// passed through the varargs area, we can eliminate the use of the cast.
static bool isSafeToEliminateVarargsCast(const CallSite CS,
- const CastInst * const CI,
- const DataLayout * const DL,
+ const DataLayout &DL,
+ const CastInst *const CI,
const int ix) {
if (!CI->isLosslessCast())
return false;
@@ -1217,7 +1288,7 @@ static bool isSafeToEliminateVarargsCast(const CallSite CS,
Type* DstTy = cast<PointerType>(CI->getType())->getElementType();
if (!SrcTy->isSized() || !DstTy->isSized())
return false;
- if (!DL || DL->getTypeAllocSize(SrcTy) != DL->getTypeAllocSize(DstTy))
+ if (DL.getTypeAllocSize(SrcTy) != DL.getTypeAllocSize(DstTy))
return false;
return true;
}
@@ -1226,7 +1297,7 @@ static bool isSafeToEliminateVarargsCast(const CallSite CS,
// Currently we're only working with the checking functions, memcpy_chk,
// mempcpy_chk, memmove_chk, memset_chk, strcpy_chk, stpcpy_chk, strncpy_chk,
// strcat_chk and strncat_chk.
-Instruction *InstCombiner::tryOptimizeCall(CallInst *CI, const DataLayout *DL) {
+Instruction *InstCombiner::tryOptimizeCall(CallInst *CI) {
if (!CI->getCalledFunction()) return nullptr;
auto InstCombineRAUW = [this](Instruction *From, Value *With) {
@@ -1391,7 +1462,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
for (CallSite::arg_iterator I = CS.arg_begin() + FTy->getNumParams(),
E = CS.arg_end(); I != E; ++I, ++ix) {
CastInst *CI = dyn_cast<CastInst>(*I);
- if (CI && isSafeToEliminateVarargsCast(CS, CI, DL, ix)) {
+ if (CI && isSafeToEliminateVarargsCast(CS, DL, CI, ix)) {
*I = CI->getOperand(0);
Changed = true;
}
@@ -1408,7 +1479,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
// this. None of these calls are seen as possibly dead so go ahead and
// delete the instruction now.
if (CallInst *CI = dyn_cast<CallInst>(CS.getInstruction())) {
- Instruction *I = tryOptimizeCall(CI, DL);
+ Instruction *I = tryOptimizeCall(CI);
// If we changed something return the result, etc. Otherwise let
// the fallthrough check.
if (I) return EraseInstFromFunction(*I);
@@ -1487,7 +1558,10 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
//
// into:
// call void @takes_i32_inalloca(i32* null)
- if (Callee->getAttributes().hasAttrSomewhere(Attribute::InAlloca))
+ //
+ // Similarly, avoid folding away bitcasts of byval calls.
+ if (Callee->getAttributes().hasAttrSomewhere(Attribute::InAlloca) ||
+ Callee->getAttributes().hasAttrSomewhere(Attribute::ByVal))
return false;
CallSite::arg_iterator AI = CS.arg_begin();
@@ -1512,12 +1586,12 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
CallerPAL.getParamAttributes(i + 1).hasAttribute(i + 1,
Attribute::ByVal)) {
PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy);
- if (!ParamPTy || !ParamPTy->getElementType()->isSized() || !DL)
+ if (!ParamPTy || !ParamPTy->getElementType()->isSized())
return false;
Type *CurElTy = ActTy->getPointerElementType();
- if (DL->getTypeAllocSize(CurElTy) !=
- DL->getTypeAllocSize(ParamPTy->getElementType()))
+ if (DL.getTypeAllocSize(CurElTy) !=
+ DL.getTypeAllocSize(ParamPTy->getElementType()))
return false;
}
}
diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 3e2b719..fe544c2 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -80,9 +80,6 @@ static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale,
/// try to eliminate the cast by moving the type information into the alloc.
Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
AllocaInst &AI) {
- // This requires DataLayout to get the alloca alignment and size information.
- if (!DL) return nullptr;
-
PointerType *PTy = cast<PointerType>(CI.getType());
BuilderTy AllocaBuilder(*Builder);
@@ -93,8 +90,8 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
Type *CastElTy = PTy->getElementType();
if (!AllocElTy->isSized() || !CastElTy->isSized()) return nullptr;
- unsigned AllocElTyAlign = DL->getABITypeAlignment(AllocElTy);
- unsigned CastElTyAlign = DL->getABITypeAlignment(CastElTy);
+ unsigned AllocElTyAlign = DL.getABITypeAlignment(AllocElTy);
+ unsigned CastElTyAlign = DL.getABITypeAlignment(CastElTy);
if (CastElTyAlign < AllocElTyAlign) return nullptr;
// If the allocation has multiple uses, only promote it if we are strictly
@@ -102,14 +99,14 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
// same, we open the door to infinite loops of various kinds.
if (!AI.hasOneUse() && CastElTyAlign == AllocElTyAlign) return nullptr;
- uint64_t AllocElTySize = DL->getTypeAllocSize(AllocElTy);
- uint64_t CastElTySize = DL->getTypeAllocSize(CastElTy);
+ uint64_t AllocElTySize = DL.getTypeAllocSize(AllocElTy);
+ uint64_t CastElTySize = DL.getTypeAllocSize(CastElTy);
if (CastElTySize == 0 || AllocElTySize == 0) return nullptr;
// If the allocation has multiple uses, only promote it if we're not
// shrinking the amount of memory being allocated.
- uint64_t AllocElTyStoreSize = DL->getTypeStoreSize(AllocElTy);
- uint64_t CastElTyStoreSize = DL->getTypeStoreSize(CastElTy);
+ uint64_t AllocElTyStoreSize = DL.getTypeStoreSize(AllocElTy);
+ uint64_t CastElTyStoreSize = DL.getTypeStoreSize(CastElTy);
if (!AI.hasOneUse() && CastElTyStoreSize < AllocElTyStoreSize) return nullptr;
// See if we can satisfy the modulus by pulling a scale out of the array
@@ -215,7 +212,8 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty,
PHINode *OPN = cast<PHINode>(I);
PHINode *NPN = PHINode::Create(Ty, OPN->getNumIncomingValues());
for (unsigned i = 0, e = OPN->getNumIncomingValues(); i != e; ++i) {
- Value *V =EvaluateInDifferentType(OPN->getIncomingValue(i), Ty, isSigned);
+ Value *V =
+ EvaluateInDifferentType(OPN->getIncomingValue(i), Ty, isSigned);
NPN->addIncoming(V, OPN->getIncomingBlock(i));
}
Res = NPN;
@@ -234,25 +232,22 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty,
/// This function is a wrapper around CastInst::isEliminableCastPair. It
/// simply extracts arguments and returns what that function returns.
static Instruction::CastOps
-isEliminableCastPair(
- const CastInst *CI, ///< The first cast instruction
- unsigned opcode, ///< The opcode of the second cast instruction
- Type *DstTy, ///< The target type for the second cast instruction
- const DataLayout *DL ///< The target data for pointer size
-) {
-
+isEliminableCastPair(const CastInst *CI, ///< First cast instruction
+ unsigned opcode, ///< Opcode for the second cast
+ Type *DstTy, ///< Target type for the second cast
+ const DataLayout &DL) {
Type *SrcTy = CI->getOperand(0)->getType(); // A from above
Type *MidTy = CI->getType(); // B from above
// Get the opcodes of the two Cast instructions
Instruction::CastOps firstOp = Instruction::CastOps(CI->getOpcode());
Instruction::CastOps secondOp = Instruction::CastOps(opcode);
- Type *SrcIntPtrTy = DL && SrcTy->isPtrOrPtrVectorTy() ?
- DL->getIntPtrType(SrcTy) : nullptr;
- Type *MidIntPtrTy = DL && MidTy->isPtrOrPtrVectorTy() ?
- DL->getIntPtrType(MidTy) : nullptr;
- Type *DstIntPtrTy = DL && DstTy->isPtrOrPtrVectorTy() ?
- DL->getIntPtrType(DstTy) : nullptr;
+ Type *SrcIntPtrTy =
+ SrcTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(SrcTy) : nullptr;
+ Type *MidIntPtrTy =
+ MidTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(MidTy) : nullptr;
+ Type *DstIntPtrTy =
+ DstTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(DstTy) : nullptr;
unsigned Res = CastInst::isEliminableCastPair(firstOp, secondOp, SrcTy, MidTy,
DstTy, SrcIntPtrTy, MidIntPtrTy,
DstIntPtrTy);
@@ -298,7 +293,7 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {
// eliminate it now.
if (CastInst *CSrc = dyn_cast<CastInst>(Src)) { // A->B->C cast
if (Instruction::CastOps opc =
- isEliminableCastPair(CSrc, CI.getOpcode(), CI.getType(), DL)) {
+ isEliminableCastPair(CSrc, CI.getOpcode(), CI.getType(), DL)) {
// The first cast (CSrc) is eliminable so we need to fix up or replace
// the second cast (CI). CSrc will then have a good chance of being dead.
return CastInst::Create(opc, CSrc->getOperand(0), CI.getType());
@@ -314,8 +309,7 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {
if (isa<PHINode>(Src)) {
// We don't do this if this would create a PHI node with an illegal type if
// it is currently legal.
- if (!Src->getType()->isIntegerTy() ||
- !CI.getType()->isIntegerTy() ||
+ if (!Src->getType()->isIntegerTy() || !CI.getType()->isIntegerTy() ||
ShouldChangeType(CI.getType(), Src->getType()))
if (Instruction *NV = FoldOpIntoPhi(CI))
return NV;
@@ -1419,18 +1413,15 @@ Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) {
// If the source integer type is not the intptr_t type for this target, do a
// trunc or zext to the intptr_t type, then inttoptr of it. This allows the
// cast to be exposed to other transforms.
-
- if (DL) {
- unsigned AS = CI.getAddressSpace();
- if (CI.getOperand(0)->getType()->getScalarSizeInBits() !=
- DL->getPointerSizeInBits(AS)) {
- Type *Ty = DL->getIntPtrType(CI.getContext(), AS);
- if (CI.getType()->isVectorTy()) // Handle vectors of pointers.
- Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements());
-
- Value *P = Builder->CreateZExtOrTrunc(CI.getOperand(0), Ty);
- return new IntToPtrInst(P, CI.getType());
- }
+ unsigned AS = CI.getAddressSpace();
+ if (CI.getOperand(0)->getType()->getScalarSizeInBits() !=
+ DL.getPointerSizeInBits(AS)) {
+ Type *Ty = DL.getIntPtrType(CI.getContext(), AS);
+ if (CI.getType()->isVectorTy()) // Handle vectors of pointers.
+ Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements());
+
+ Value *P = Builder->CreateZExtOrTrunc(CI.getOperand(0), Ty);
+ return new IntToPtrInst(P, CI.getType());
}
if (Instruction *I = commonCastTransforms(CI))
@@ -1460,32 +1451,33 @@ Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) {
return &CI;
}
- if (!DL)
- return commonCastTransforms(CI);
-
// If the GEP has a single use, and the base pointer is a bitcast, and the
// GEP computes a constant offset, see if we can convert these three
// instructions into fewer. This typically happens with unions and other
// non-type-safe code.
unsigned AS = GEP->getPointerAddressSpace();
- unsigned OffsetBits = DL->getPointerSizeInBits(AS);
+ unsigned OffsetBits = DL.getPointerSizeInBits(AS);
APInt Offset(OffsetBits, 0);
BitCastInst *BCI = dyn_cast<BitCastInst>(GEP->getOperand(0));
- if (GEP->hasOneUse() &&
- BCI &&
- GEP->accumulateConstantOffset(*DL, Offset)) {
+ if (GEP->hasOneUse() && BCI && GEP->accumulateConstantOffset(DL, Offset)) {
+ // FIXME: This is insufficiently tested - just a no-crash test
+ // (test/Transforms/InstCombine/2007-05-14-Crash.ll)
+ //
// Get the base pointer input of the bitcast, and the type it points to.
Value *OrigBase = BCI->getOperand(0);
SmallVector<Value*, 8> NewIndices;
- if (FindElementAtOffset(OrigBase->getType(),
- Offset.getSExtValue(),
+ if (FindElementAtOffset(OrigBase->getType(), Offset.getSExtValue(),
NewIndices)) {
+ // FIXME: This codepath is completely untested - could be unreachable
+ // for all I know.
// If we were able to index down into an element, create the GEP
// and bitcast the result. This eliminates one bitcast, potentially
// two.
- Value *NGEP = cast<GEPOperator>(GEP)->isInBounds() ?
- Builder->CreateInBoundsGEP(OrigBase, NewIndices) :
- Builder->CreateGEP(OrigBase, NewIndices);
+ Value *NGEP = cast<GEPOperator>(GEP)->isInBounds()
+ ? Builder->CreateInBoundsGEP(OrigBase, NewIndices)
+ : Builder->CreateGEP(
+ OrigBase->getType()->getPointerElementType(),
+ OrigBase, NewIndices);
NGEP->takeName(GEP);
if (isa<BitCastInst>(CI))
@@ -1504,16 +1496,13 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) {
// do a ptrtoint to intptr_t then do a trunc or zext. This allows the cast
// to be exposed to other transforms.
- if (!DL)
- return commonPointerCastTransforms(CI);
-
Type *Ty = CI.getType();
unsigned AS = CI.getPointerAddressSpace();
- if (Ty->getScalarSizeInBits() == DL->getPointerSizeInBits(AS))
+ if (Ty->getScalarSizeInBits() == DL.getPointerSizeInBits(AS))
return commonPointerCastTransforms(CI);
- Type *PtrTy = DL->getIntPtrType(CI.getContext(), AS);
+ Type *PtrTy = DL.getIntPtrType(CI.getContext(), AS);
if (Ty->isVectorTy()) // Handle vectors of pointers.
PtrTy = VectorType::get(PtrTy, Ty->getVectorNumElements());
@@ -1597,8 +1586,8 @@ static unsigned getTypeSizeIndex(unsigned Value, Type *Ty) {
/// This returns false if the pattern can't be matched or true if it can,
/// filling in Elements with the elements found here.
static bool CollectInsertionElements(Value *V, unsigned Shift,
- SmallVectorImpl<Value*> &Elements,
- Type *VecEltTy, InstCombiner &IC) {
+ SmallVectorImpl<Value *> &Elements,
+ Type *VecEltTy, bool isBigEndian) {
assert(isMultipleOfTypeSize(Shift, VecEltTy) &&
"Shift should be a multiple of the element type size");
@@ -1614,7 +1603,7 @@ static bool CollectInsertionElements(Value *V, unsigned Shift,
return true;
unsigned ElementIndex = getTypeSizeIndex(Shift, VecEltTy);
- if (IC.getDataLayout()->isBigEndian())
+ if (isBigEndian)
ElementIndex = Elements.size() - ElementIndex - 1;
// Fail if multiple elements are inserted into this slot.
@@ -1634,7 +1623,7 @@ static bool CollectInsertionElements(Value *V, unsigned Shift,
// it to the right type so it gets properly inserted.
if (NumElts == 1)
return CollectInsertionElements(ConstantExpr::getBitCast(C, VecEltTy),
- Shift, Elements, VecEltTy, IC);
+ Shift, Elements, VecEltTy, isBigEndian);
// Okay, this is a constant that covers multiple elements. Slice it up into
// pieces and insert each element-sized piece into the vector.
@@ -1649,7 +1638,8 @@ static bool CollectInsertionElements(Value *V, unsigned Shift,
Constant *Piece = ConstantExpr::getLShr(C, ConstantInt::get(C->getType(),
ShiftI));
Piece = ConstantExpr::getTrunc(Piece, ElementIntTy);
- if (!CollectInsertionElements(Piece, ShiftI, Elements, VecEltTy, IC))
+ if (!CollectInsertionElements(Piece, ShiftI, Elements, VecEltTy,
+ isBigEndian))
return false;
}
return true;
@@ -1662,28 +1652,28 @@ static bool CollectInsertionElements(Value *V, unsigned Shift,
switch (I->getOpcode()) {
default: return false; // Unhandled case.
case Instruction::BitCast:
- return CollectInsertionElements(I->getOperand(0), Shift,
- Elements, VecEltTy, IC);
+ return CollectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy,
+ isBigEndian);
case Instruction::ZExt:
if (!isMultipleOfTypeSize(
I->getOperand(0)->getType()->getPrimitiveSizeInBits(),
VecEltTy))
return false;
- return CollectInsertionElements(I->getOperand(0), Shift,
- Elements, VecEltTy, IC);
+ return CollectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy,
+ isBigEndian);
case Instruction::Or:
- return CollectInsertionElements(I->getOperand(0), Shift,
- Elements, VecEltTy, IC) &&
- CollectInsertionElements(I->getOperand(1), Shift,
- Elements, VecEltTy, IC);
+ return CollectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy,
+ isBigEndian) &&
+ CollectInsertionElements(I->getOperand(1), Shift, Elements, VecEltTy,
+ isBigEndian);
case Instruction::Shl: {
// Must be shifting by a constant that is a multiple of the element size.
ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1));
if (!CI) return false;
Shift += CI->getZExtValue();
if (!isMultipleOfTypeSize(Shift, VecEltTy)) return false;
- return CollectInsertionElements(I->getOperand(0), Shift,
- Elements, VecEltTy, IC);
+ return CollectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy,
+ isBigEndian);
}
}
@@ -1706,15 +1696,13 @@ static bool CollectInsertionElements(Value *V, unsigned Shift,
/// Into two insertelements that do "buildvector{%inc, %inc5}".
static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI,
InstCombiner &IC) {
- // We need to know the target byte order to perform this optimization.
- if (!IC.getDataLayout()) return nullptr;
-
VectorType *DestVecTy = cast<VectorType>(CI.getType());
Value *IntInput = CI.getOperand(0);
SmallVector<Value*, 8> Elements(DestVecTy->getNumElements());
if (!CollectInsertionElements(IntInput, 0, Elements,
- DestVecTy->getElementType(), IC))
+ DestVecTy->getElementType(),
+ IC.getDataLayout().isBigEndian()))
return nullptr;
// If we succeeded, we know that all of the elements are specified by Elements
@@ -1734,10 +1722,8 @@ static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI,
/// OptimizeIntToFloatBitCast - See if we can optimize an integer->float/double
/// bitcast. The various long double bitcasts can't get in here.
-static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){
- // We need to know the target byte order to perform this optimization.
- if (!IC.getDataLayout()) return nullptr;
-
+static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI, InstCombiner &IC,
+ const DataLayout &DL) {
Value *Src = CI.getOperand(0);
Type *DestTy = CI.getType();
@@ -1760,7 +1746,7 @@ static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){
}
unsigned Elt = 0;
- if (IC.getDataLayout()->isBigEndian())
+ if (DL.isBigEndian())
Elt = VecTy->getPrimitiveSizeInBits() / DestWidth - 1;
return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt));
}
@@ -1784,7 +1770,7 @@ static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){
}
unsigned Elt = ShAmt->getZExtValue() / DestWidth;
- if (IC.getDataLayout()->isBigEndian())
+ if (DL.isBigEndian())
Elt = VecTy->getPrimitiveSizeInBits() / DestWidth - 1 - Elt;
return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt));
}
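// Small numeric check of the element-index flip above (illustrative, not
// part of this patch): for a <4 x float> viewed as i128, the float covering
// bits [32, 64) is element 1 on a little-endian target but element
// 4 - 1 - 1 == 2 on a big-endian one.
#include <cassert>
int main() {
  const unsigned VecBits = 128, DestWidth = 32; // assumed <4 x float>
  const unsigned NumElts = VecBits / DestWidth;
  unsigned ShAmt = 32;                          // bits [32, 64)
  unsigned LittleElt = ShAmt / DestWidth;
  unsigned BigElt = NumElts - 1 - LittleElt;
  assert(LittleElt == 1 && BigElt == 2);
}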
@@ -1839,7 +1825,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
// Try to optimize int -> float bitcasts.
if ((DestTy->isFloatTy() || DestTy->isDoubleTy()) && isa<IntegerType>(SrcTy))
- if (Instruction *I = OptimizeIntToFloatBitCast(CI, *this))
+ if (Instruction *I = OptimizeIntToFloatBitCast(CI, *this, DL))
return I;
if (VectorType *DestVTy = dyn_cast<VectorType>(DestTy)) {
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index f48d89b..803b50a 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -229,10 +229,6 @@ static void ComputeUnsignedMinMaxValuesFromKnownBits(const APInt &KnownZero,
Instruction *InstCombiner::
FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
CmpInst &ICI, ConstantInt *AndCst) {
- // We need TD information to know the pointer size unless this is inbounds.
- if (!GEP->isInBounds() && !DL)
- return nullptr;
-
Constant *Init = GV->getInitializer();
if (!isa<ConstantArray>(Init) && !isa<ConstantDataArray>(Init))
return nullptr;
@@ -303,7 +299,6 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
// the array, this will fully represent all the comparison results.
uint64_t MagicBitvector = 0;
-
// Scan the array and see if one of our patterns matches.
Constant *CompareRHS = cast<Constant>(ICI.getOperand(1));
for (unsigned i = 0, e = ArrayElementCount; i != e; ++i) {
@@ -398,7 +393,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
// index down like the GEP would do implicitly. We don't have to do this for
// an inbounds GEP because the index can't be out of range.
if (!GEP->isInBounds()) {
- Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
+ Type *IntPtrTy = DL.getIntPtrType(GEP->getType());
unsigned PtrSize = IntPtrTy->getIntegerBitWidth();
if (Idx->getType()->getPrimitiveSizeInBits() > PtrSize)
Idx = Builder->CreateTrunc(Idx, IntPtrTy);
@@ -487,10 +482,8 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
// - Default to i32
if (ArrayElementCount <= Idx->getType()->getIntegerBitWidth())
Ty = Idx->getType();
- else if (DL)
- Ty = DL->getSmallestLegalIntType(Init->getContext(), ArrayElementCount);
- else if (ArrayElementCount <= 32)
- Ty = Type::getInt32Ty(Init->getContext());
+ else
+ Ty = DL.getSmallestLegalIntType(Init->getContext(), ArrayElementCount);
if (Ty) {
Value *V = Builder->CreateIntCast(Idx, Ty, false);
@@ -514,8 +507,8 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
///
/// If we can't emit an optimized form for this expression, this returns null.
///
-static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) {
- const DataLayout &DL = *IC.getDataLayout();
+static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC,
+ const DataLayout &DL) {
gep_type_iterator GTI = gep_type_begin(GEP);
// Check to see if this gep only has a single variable index. If so, and if
@@ -628,12 +621,12 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
RHS = RHS->stripPointerCasts();
Value *PtrBase = GEPLHS->getOperand(0);
- if (DL && PtrBase == RHS && GEPLHS->isInBounds()) {
+ if (PtrBase == RHS && GEPLHS->isInBounds()) {
// ((gep Ptr, OFFSET) cmp Ptr) ---> (OFFSET cmp 0).
// This transformation (ignoring the base and scales) is valid because we
// know pointers can't overflow since the gep is inbounds. See if we can
// output an optimized form.
- Value *Offset = EvaluateGEPOffsetExpression(GEPLHS, *this);
+ Value *Offset = EvaluateGEPOffsetExpression(GEPLHS, *this, DL);
// If not, synthesize the offset the hard way.
if (!Offset)
@@ -661,11 +654,11 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
// If we're comparing GEPs with two base pointers that only differ in type
// and both GEPs have only constant indices or just one use, then fold
// the compare with the adjusted indices.
- if (DL && GEPLHS->isInBounds() && GEPRHS->isInBounds() &&
+ if (GEPLHS->isInBounds() && GEPRHS->isInBounds() &&
(GEPLHS->hasAllConstantIndices() || GEPLHS->hasOneUse()) &&
(GEPRHS->hasAllConstantIndices() || GEPRHS->hasOneUse()) &&
PtrBase->stripPointerCasts() ==
- GEPRHS->getOperand(0)->stripPointerCasts()) {
+ GEPRHS->getOperand(0)->stripPointerCasts()) {
Value *LOffset = EmitGEPOffset(GEPLHS);
Value *ROffset = EmitGEPOffset(GEPRHS);
@@ -733,9 +726,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
// Only lower this if the icmp is the only user of the GEP or if we expect
// the result to fold to a constant!
- if (DL &&
- GEPsInBounds &&
- (isa<ConstantExpr>(GEPLHS) || GEPLHS->hasOneUse()) &&
+ if (GEPsInBounds && (isa<ConstantExpr>(GEPLHS) || GEPLHS->hasOneUse()) &&
(isa<ConstantExpr>(GEPRHS) || GEPRHS->hasOneUse())) {
// ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2)) ---> (OFFSET1 cmp OFFSET2)
Value *L = EmitGEPOffset(GEPLHS);
@@ -1928,8 +1919,8 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) {
// Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the
// integer type is the same size as the pointer type.
- if (DL && LHSCI->getOpcode() == Instruction::PtrToInt &&
- DL->getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth()) {
+ if (LHSCI->getOpcode() == Instruction::PtrToInt &&
+ DL.getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth()) {
Value *RHSOp = nullptr;
if (PtrToIntOperator *RHSC = dyn_cast<PtrToIntOperator>(ICI.getOperand(1))) {
Value *RHSCIOp = RHSC->getOperand(0);
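// Illustrative sketch of this fold (hypothetical IR, 64-bit pointers):
//   %pi = ptrtoint i8* %p to i64
//   %qi = ptrtoint i8* %q to i64
//   %c  = icmp eq i64 %pi, %qi
// becomes a direct pointer compare:
//   %c  = icmp eq i8* %p, %q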
@@ -2660,8 +2651,8 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
unsigned BitWidth = 0;
if (Ty->isIntOrIntVectorTy())
BitWidth = Ty->getScalarSizeInBits();
- else if (DL) // Pointers require DL info to get their size.
- BitWidth = DL->getTypeSizeInBits(Ty->getScalarType());
+ else // Get pointer size.
+ BitWidth = DL.getTypeSizeInBits(Ty->getScalarType());
bool isSignBit = false;
@@ -2774,8 +2765,8 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
Op0KnownZero, Op0KnownOne, 0))
return &I;
if (SimplifyDemandedBits(I.getOperandUse(1),
- APInt::getAllOnesValue(BitWidth),
- Op1KnownZero, Op1KnownOne, 0))
+ APInt::getAllOnesValue(BitWidth), Op1KnownZero,
+ Op1KnownOne, 0))
return &I;
// Given the known and unknown bits, compute a range that the LHS could be
@@ -3094,9 +3085,8 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
}
case Instruction::IntToPtr:
// icmp pred inttoptr(X), null -> icmp pred X, 0
- if (RHSC->isNullValue() && DL &&
- DL->getIntPtrType(RHSC->getType()) ==
- LHSI->getOperand(0)->getType())
+ if (RHSC->isNullValue() &&
+ DL.getIntPtrType(RHSC->getType()) == LHSI->getOperand(0)->getType())
return new ICmpInst(I.getPredicate(), LHSI->getOperand(0),
Constant::getNullValue(LHSI->getOperand(0)->getType()));
break;
@@ -3428,7 +3418,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
// if A is a power of 2.
if (match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) &&
match(Op1, m_Zero()) &&
- isKnownToBeAPowerOfTwo(A, false, 0, AC, &I, DT) && I.isEquality())
+ isKnownToBeAPowerOfTwo(A, DL, false, 0, AC, &I, DT) && I.isEquality())
return new ICmpInst(I.getInversePredicate(),
Builder->CreateAnd(A, B),
Op1);
@@ -3563,6 +3553,21 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
}
}
+ // (A << C) == (B << C) --> ((A^B) & (~0U >> C)) == 0
+ if (match(Op0, m_OneUse(m_Shl(m_Value(A), m_ConstantInt(Cst1)))) &&
+ match(Op1, m_OneUse(m_Shl(m_Value(B), m_Specific(Cst1))))) {
+ unsigned TypeBits = Cst1->getBitWidth();
+ unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits);
+ if (ShAmt < TypeBits && ShAmt != 0) {
+ Value *Xor = Builder->CreateXor(A, B, I.getName() + ".unshifted");
+ APInt AndVal = APInt::getLowBitsSet(TypeBits, TypeBits - ShAmt);
+ Value *And = Builder->CreateAnd(Xor, Builder->getInt(AndVal),
+ I.getName() + ".mask");
+ return new ICmpInst(I.getPredicate(), And,
+ Constant::getNullValue(Cst1->getType()));
+ }
+ }
+
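// Illustrative sketch of the new fold (hypothetical IR, i32 with C == 3):
//   %s1 = shl i32 %a, 3
//   %s2 = shl i32 %b, 3
//   %c  = icmp eq i32 %s1, %s2
// becomes a compare of the 29 bits that survive the shift:
//   %x = xor i32 %a, %b                ; the ".unshifted" value
//   %m = and i32 %x, 536870911         ; ~0u >> 3, the ".mask" value
//   %c = icmp eq i32 %m, 0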
// Transform "icmp eq (trunc (lshr(X, cst1)), cst" to
// "icmp (and X, mask), cst"
uint64_t ShAmt = 0;
diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h
index 2fd5318..fb2321d 100644
--- a/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -158,10 +158,10 @@ private:
AssumptionCache *AC;
TargetLibraryInfo *TLI;
DominatorTree *DT;
+ const DataLayout &DL;
// Optional analyses. When non-null, these can both be used to do better
// combining and will be updated to reflect any changes.
- const DataLayout *DL;
LoopInfo *LI;
bool MadeIRChange;
@@ -169,7 +169,7 @@ private:
public:
InstCombiner(InstCombineWorklist &Worklist, BuilderTy *Builder,
bool MinimizeSize, AssumptionCache *AC, TargetLibraryInfo *TLI,
- DominatorTree *DT, const DataLayout *DL, LoopInfo *LI)
+ DominatorTree *DT, const DataLayout &DL, LoopInfo *LI)
: Worklist(Worklist), Builder(Builder), MinimizeSize(MinimizeSize),
AC(AC), TLI(TLI), DT(DT), DL(DL), LI(LI), MadeIRChange(false) {}
@@ -180,7 +180,7 @@ public:
AssumptionCache *getAssumptionCache() const { return AC; }
- const DataLayout *getDataLayout() const { return DL; }
+ const DataLayout &getDataLayout() const { return DL; }
DominatorTree *getDominatorTree() const { return DT; }
@@ -330,17 +330,17 @@ private:
Type *Ty);
Instruction *visitCallSite(CallSite CS);
- Instruction *tryOptimizeCall(CallInst *CI, const DataLayout *DL);
+ Instruction *tryOptimizeCall(CallInst *CI);
bool transformConstExprCastCall(CallSite CS);
Instruction *transformCallThroughTrampoline(CallSite CS,
IntrinsicInst *Tramp);
Instruction *transformZExtICmp(ICmpInst *ICI, Instruction &CI,
bool DoXform = true);
Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI);
- bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS, Instruction *CxtI);
- bool WillNotOverflowSignedSub(Value *LHS, Value *RHS, Instruction *CxtI);
- bool WillNotOverflowUnsignedSub(Value *LHS, Value *RHS, Instruction *CxtI);
- bool WillNotOverflowSignedMul(Value *LHS, Value *RHS, Instruction *CxtI);
+ bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS, Instruction &CxtI);
+ bool WillNotOverflowSignedSub(Value *LHS, Value *RHS, Instruction &CxtI);
+ bool WillNotOverflowUnsignedSub(Value *LHS, Value *RHS, Instruction &CxtI);
+ bool WillNotOverflowSignedMul(Value *LHS, Value *RHS, Instruction &CxtI);
Value *EmitGEPOffset(User *GEP);
Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN);
Value *EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask);
@@ -372,6 +372,10 @@ public:
/// I to the worklist, replace all uses of I with the new value, then return
/// I, so that the inst combiner will know that I was modified.
Instruction *ReplaceInstUsesWith(Instruction &I, Value *V) {
+ // If there are no uses to replace, then we return nullptr to indicate that
+ // no changes were made to the program.
+ if (I.use_empty()) return nullptr;
+
Worklist.AddUsersToWorkList(I); // Add all modified instrs to worklist.
// If we are replacing the instruction with itself, this must be in a
@@ -423,7 +427,7 @@ public:
}
void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne,
- unsigned Depth = 0, Instruction *CxtI = nullptr) const {
+ unsigned Depth, Instruction *CxtI) const {
return llvm::computeKnownBits(V, KnownZero, KnownOne, DL, Depth, AC, CxtI,
DT);
}
@@ -468,7 +472,7 @@ private:
/// bits.
Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, APInt &KnownZero,
APInt &KnownOne, unsigned Depth,
- Instruction *CxtI = nullptr);
+ Instruction *CxtI);
bool SimplifyDemandedBits(Use &U, APInt DemandedMask, APInt &KnownZero,
APInt &KnownOne, unsigned Depth = 0);
/// Helper routine of SimplifyDemandedUseBits. It tries to simplify demanded
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index b9eb986..6b0f268 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -164,62 +164,75 @@ isOnlyCopiedFromConstantGlobal(AllocaInst *AI,
return nullptr;
}
-Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
- // Ensure that the alloca array size argument has type intptr_t, so that
- // any casting is exposed early.
- if (DL) {
- Type *IntPtrTy = DL->getIntPtrType(AI.getType());
- if (AI.getArraySize()->getType() != IntPtrTy) {
- Value *V = Builder->CreateIntCast(AI.getArraySize(),
- IntPtrTy, false);
- AI.setOperand(0, V);
- return &AI;
- }
+static Instruction *simplifyAllocaArraySize(InstCombiner &IC, AllocaInst &AI) {
+ // Check for array size of 1 (scalar allocation).
+ if (!AI.isArrayAllocation()) {
+ // i32 1 is the canonical array size for scalar allocations.
+ if (AI.getArraySize()->getType()->isIntegerTy(32))
+ return nullptr;
+
+ // Canonicalize it.
+ Value *V = IC.Builder->getInt32(1);
+ AI.setOperand(0, V);
+ return &AI;
}
// Convert: alloca Ty, C - where C is a constant != 1 into: alloca [C x Ty], 1
- if (AI.isArrayAllocation()) { // Check C != 1
- if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) {
- Type *NewTy =
- ArrayType::get(AI.getAllocatedType(), C->getZExtValue());
- AllocaInst *New = Builder->CreateAlloca(NewTy, nullptr, AI.getName());
- New->setAlignment(AI.getAlignment());
-
- // Scan to the end of the allocation instructions, to skip over a block of
- // allocas if possible...also skip interleaved debug info
- //
- BasicBlock::iterator It = New;
- while (isa<AllocaInst>(*It) || isa<DbgInfoIntrinsic>(*It)) ++It;
-
- // Now that I is pointing to the first non-allocation-inst in the block,
- // insert our getelementptr instruction...
- //
- Type *IdxTy = DL
- ? DL->getIntPtrType(AI.getType())
- : Type::getInt64Ty(AI.getContext());
- Value *NullIdx = Constant::getNullValue(IdxTy);
- Value *Idx[2] = { NullIdx, NullIdx };
- Instruction *GEP =
+ if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) {
+ Type *NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue());
+ AllocaInst *New = IC.Builder->CreateAlloca(NewTy, nullptr, AI.getName());
+ New->setAlignment(AI.getAlignment());
+
+ // Scan to the end of the allocation instructions, to skip over a block of
+ // allocas if possible; also skip interleaved debug info
+ //
+ BasicBlock::iterator It = New;
+ while (isa<AllocaInst>(*It) || isa<DbgInfoIntrinsic>(*It))
+ ++It;
+
+ // Now that It is pointing to the first non-allocation-inst in the block,
+ // insert our getelementptr instruction...
+ //
+ Type *IdxTy = IC.getDataLayout().getIntPtrType(AI.getType());
+ Value *NullIdx = Constant::getNullValue(IdxTy);
+ Value *Idx[2] = {NullIdx, NullIdx};
+ Instruction *GEP =
GetElementPtrInst::CreateInBounds(New, Idx, New->getName() + ".sub");
- InsertNewInstBefore(GEP, *It);
+ IC.InsertNewInstBefore(GEP, *It);
- // Now make everything use the getelementptr instead of the original
- // allocation.
- return ReplaceInstUsesWith(AI, GEP);
- } else if (isa<UndefValue>(AI.getArraySize())) {
- return ReplaceInstUsesWith(AI, Constant::getNullValue(AI.getType()));
- }
+ // Now make everything use the getelementptr instead of the original
+ // allocation.
+ return IC.ReplaceInstUsesWith(AI, GEP);
}
- if (DL && AI.getAllocatedType()->isSized()) {
+ if (isa<UndefValue>(AI.getArraySize()))
+ return IC.ReplaceInstUsesWith(AI, Constant::getNullValue(AI.getType()));
+
+ // Ensure that the alloca array size argument has type intptr_t, so that
+ // any casting is exposed early.
+ Type *IntPtrTy = IC.getDataLayout().getIntPtrType(AI.getType());
+ if (AI.getArraySize()->getType() != IntPtrTy) {
+ Value *V = IC.Builder->CreateIntCast(AI.getArraySize(), IntPtrTy, false);
+ AI.setOperand(0, V);
+ return &AI;
+ }
+
+ return nullptr;
+}
+
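// Illustrative sketch of the canonicalization this helper performs
// (hypothetical IR; textual GEP syntax approximate for this revision):
//   %p = alloca i32, i32 4, align 4
// becomes an array allocation plus a ".sub" GEP that replaces all uses:
//   %1     = alloca [4 x i32], align 4
//   %1.sub = getelementptr inbounds [4 x i32], [4 x i32]* %1, i64 0, i64 0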
+Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
+ if (auto *I = simplifyAllocaArraySize(*this, AI))
+ return I;
+
+ if (AI.getAllocatedType()->isSized()) {
// If the alignment is 0 (unspecified), assign it the preferred alignment.
if (AI.getAlignment() == 0)
- AI.setAlignment(DL->getPrefTypeAlignment(AI.getAllocatedType()));
+ AI.setAlignment(DL.getPrefTypeAlignment(AI.getAllocatedType()));
// Move all alloca's of zero byte objects to the entry block and merge them
// together. Note that we only do this for alloca's, because malloc should
// allocate and return a unique pointer, even for a zero byte allocation.
- if (DL->getTypeAllocSize(AI.getAllocatedType()) == 0) {
+ if (DL.getTypeAllocSize(AI.getAllocatedType()) == 0) {
// For a zero sized alloca there is no point in doing an array allocation.
// This is helpful if the array size is a complicated expression not used
// elsewhere.
@@ -237,7 +250,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
// dominance as the array size was forced to a constant earlier already.
AllocaInst *EntryAI = dyn_cast<AllocaInst>(FirstInst);
if (!EntryAI || !EntryAI->getAllocatedType()->isSized() ||
- DL->getTypeAllocSize(EntryAI->getAllocatedType()) != 0) {
+ DL.getTypeAllocSize(EntryAI->getAllocatedType()) != 0) {
AI.moveBefore(FirstInst);
return &AI;
}
@@ -246,7 +259,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
// assign it the preferred alignment.
if (EntryAI->getAlignment() == 0)
EntryAI->setAlignment(
- DL->getPrefTypeAlignment(EntryAI->getAllocatedType()));
+ DL.getPrefTypeAlignment(EntryAI->getAllocatedType()));
// Replace this zero-sized alloca with the one at the start of the entry
// block after ensuring that the address will be aligned enough for both
// types.
@@ -270,7 +283,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
SmallVector<Instruction *, 4> ToDelete;
if (MemTransferInst *Copy = isOnlyCopiedFromConstantGlobal(&AI, ToDelete)) {
unsigned SourceAlign = getOrEnforceKnownAlignment(
- Copy->getSource(), AI.getAlignment(), DL, AC, &AI, DT);
+ Copy->getSource(), AI.getAlignment(), DL, &AI, AC, DT);
if (AI.getAlignment() <= SourceAlign) {
DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n');
DEBUG(dbgs() << " memcpy = " << *Copy << '\n');
@@ -439,22 +452,22 @@ static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) {
return nullptr;
Type *Ty = LI.getType();
+ const DataLayout &DL = IC.getDataLayout();
// Try to canonicalize loads which are only ever stored to operate over
// integers instead of any other type. We only do this when the loaded type
// is sized and has a size exactly the same as its store size and the store
// size is a legal integer type.
- const DataLayout *DL = IC.getDataLayout();
- if (!Ty->isIntegerTy() && Ty->isSized() && DL &&
- DL->isLegalInteger(DL->getTypeStoreSizeInBits(Ty)) &&
- DL->getTypeStoreSizeInBits(Ty) == DL->getTypeSizeInBits(Ty)) {
+ if (!Ty->isIntegerTy() && Ty->isSized() &&
+ DL.isLegalInteger(DL.getTypeStoreSizeInBits(Ty)) &&
+ DL.getTypeStoreSizeInBits(Ty) == DL.getTypeSizeInBits(Ty)) {
if (std::all_of(LI.user_begin(), LI.user_end(), [&LI](User *U) {
auto *SI = dyn_cast<StoreInst>(U);
return SI && SI->getPointerOperand() != &LI;
})) {
LoadInst *NewLoad = combineLoadToNewType(
IC, LI,
- Type::getIntNTy(LI.getContext(), DL->getTypeStoreSizeInBits(Ty)));
+ Type::getIntNTy(LI.getContext(), DL.getTypeStoreSizeInBits(Ty)));
// Replace all the stores with stores of the newly loaded value.
for (auto UI = LI.user_begin(), UE = LI.user_end(); UI != UE;) {
auto *SI = cast<StoreInst>(*UI++);
@@ -489,7 +502,7 @@ static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) {
//
// FIXME: This should probably live in ValueTracking (or similar).
static bool isObjectSizeLessThanOrEq(Value *V, uint64_t MaxSize,
- const DataLayout *DL) {
+ const DataLayout &DL) {
SmallPtrSet<Value *, 4> Visited;
SmallVector<Value *, 4> Worklist(1, V);
@@ -529,7 +542,7 @@ static bool isObjectSizeLessThanOrEq(Value *V, uint64_t MaxSize,
if (!CS)
return false;
- uint64_t TypeSize = DL->getTypeAllocSize(AI->getAllocatedType());
+ uint64_t TypeSize = DL.getTypeAllocSize(AI->getAllocatedType());
// Make sure that, even if the multiplication below would wrap as a
// uint64_t, we still do the right thing.
if ((CS->getValue().zextOrSelf(128)*APInt(128, TypeSize)).ugt(MaxSize))
@@ -541,7 +554,7 @@ static bool isObjectSizeLessThanOrEq(Value *V, uint64_t MaxSize,
if (!GV->hasDefinitiveInitializer() || !GV->isConstant())
return false;
- uint64_t InitSize = DL->getTypeAllocSize(GV->getType()->getElementType());
+ uint64_t InitSize = DL.getTypeAllocSize(GV->getType()->getElementType());
if (InitSize > MaxSize)
return false;
continue;
@@ -570,8 +583,7 @@ static bool isObjectSizeLessThanOrEq(Value *V, uint64_t MaxSize,
// offsets those indices implied.
static bool canReplaceGEPIdxWithZero(InstCombiner &IC, GetElementPtrInst *GEPI,
Instruction *MemI, unsigned &Idx) {
- const DataLayout *DL = IC.getDataLayout();
- if (GEPI->getNumOperands() < 2 || !DL)
+ if (GEPI->getNumOperands() < 2)
return false;
// Find the first non-zero index of a GEP. If all indices are zero, return
@@ -603,7 +615,8 @@ static bool canReplaceGEPIdxWithZero(InstCombiner &IC, GetElementPtrInst *GEPI,
GetElementPtrInst::getIndexedType(GEPI->getOperand(0)->getType(), Ops);
if (!AllocTy || !AllocTy->isSized())
return false;
- uint64_t TyAllocSize = DL->getTypeAllocSize(AllocTy);
+ const DataLayout &DL = IC.getDataLayout();
+ uint64_t TyAllocSize = DL.getTypeAllocSize(AllocTy);
// If there are more indices after the one we might replace with a zero, make
// sure they're all non-negative. If any of them are negative, the overall
@@ -665,18 +678,16 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
return Res;
// Attempt to improve the alignment.
- if (DL) {
- unsigned KnownAlign = getOrEnforceKnownAlignment(
- Op, DL->getPrefTypeAlignment(LI.getType()), DL, AC, &LI, DT);
- unsigned LoadAlign = LI.getAlignment();
- unsigned EffectiveLoadAlign = LoadAlign != 0 ? LoadAlign :
- DL->getABITypeAlignment(LI.getType());
-
- if (KnownAlign > EffectiveLoadAlign)
- LI.setAlignment(KnownAlign);
- else if (LoadAlign == 0)
- LI.setAlignment(EffectiveLoadAlign);
- }
+ unsigned KnownAlign = getOrEnforceKnownAlignment(
+ Op, DL.getPrefTypeAlignment(LI.getType()), DL, &LI, AC, DT);
+ unsigned LoadAlign = LI.getAlignment();
+ unsigned EffectiveLoadAlign =
+ LoadAlign != 0 ? LoadAlign : DL.getABITypeAlignment(LI.getType());
+
+ if (KnownAlign > EffectiveLoadAlign)
+ LI.setAlignment(KnownAlign);
+ else if (LoadAlign == 0)
+ LI.setAlignment(EffectiveLoadAlign);
// Replace GEP indices if possible.
if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Op, LI)) {
@@ -738,8 +749,8 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
if (SelectInst *SI = dyn_cast<SelectInst>(Op)) {
// load (select (Cond, &V1, &V2)) --> select(Cond, load &V1, load &V2).
unsigned Align = LI.getAlignment();
- if (isSafeToLoadUnconditionally(SI->getOperand(1), SI, Align, DL) &&
- isSafeToLoadUnconditionally(SI->getOperand(2), SI, Align, DL)) {
+ if (isSafeToLoadUnconditionally(SI->getOperand(1), SI, Align) &&
+ isSafeToLoadUnconditionally(SI->getOperand(2), SI, Align)) {
LoadInst *V1 = Builder->CreateLoad(SI->getOperand(1),
SI->getOperand(1)->getName()+".val");
LoadInst *V2 = Builder->CreateLoad(SI->getOperand(2),
@@ -807,6 +818,30 @@ static bool combineStoreToValueType(InstCombiner &IC, StoreInst &SI) {
return false;
}
+static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) {
+ // FIXME: With some care we could probably handle both volatile and atomic
+ // stores here, but it isn't clear that this is important.
+ if (!SI.isSimple())
+ return false;
+
+ Value *V = SI.getValueOperand();
+ Type *T = V->getType();
+
+ if (!T->isAggregateType())
+ return false;
+
+ if (StructType *ST = dyn_cast<StructType>(T)) {
+ // If the struct only has one element, we unpack.
+ if (ST->getNumElements() == 1) {
+ V = IC.Builder->CreateExtractValue(V, 0);
+ combineStoreToNewValue(IC, SI, V);
+ return true;
+ }
+ }
+
+ return false;
+}
+
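// Illustrative sketch of the unpacking above (hypothetical IR): a store of a
// single-element struct
//   store { float } %agg, { float }* %p
// becomes, roughly, a scalar store through a recast pointer:
//   %x = extractvalue { float } %agg, 0
//   %q = bitcast { float }* %p to float*
//   store float %x, float* %q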
/// equivalentAddressValues - Test if A and B will obviously have the same
/// value. This includes recognizing that %t0 and %t1 will have the same
/// value in code like this:
@@ -845,18 +880,20 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
return EraseInstFromFunction(SI);
// Attempt to improve the alignment.
- if (DL) {
- unsigned KnownAlign = getOrEnforceKnownAlignment(
- Ptr, DL->getPrefTypeAlignment(Val->getType()), DL, AC, &SI, DT);
- unsigned StoreAlign = SI.getAlignment();
- unsigned EffectiveStoreAlign = StoreAlign != 0 ? StoreAlign :
- DL->getABITypeAlignment(Val->getType());
-
- if (KnownAlign > EffectiveStoreAlign)
- SI.setAlignment(KnownAlign);
- else if (StoreAlign == 0)
- SI.setAlignment(EffectiveStoreAlign);
- }
+ unsigned KnownAlign = getOrEnforceKnownAlignment(
+ Ptr, DL.getPrefTypeAlignment(Val->getType()), DL, &SI, AC, DT);
+ unsigned StoreAlign = SI.getAlignment();
+ unsigned EffectiveStoreAlign =
+ StoreAlign != 0 ? StoreAlign : DL.getABITypeAlignment(Val->getType());
+
+ if (KnownAlign > EffectiveStoreAlign)
+ SI.setAlignment(KnownAlign);
+ else if (StoreAlign == 0)
+ SI.setAlignment(EffectiveStoreAlign);
+
+ // Try to canonicalize the stored type.
+ if (unpackStoreToAggregate(*this, SI))
+ return EraseInstFromFunction(SI);
// Replace GEP indices if possible.
if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Ptr, SI)) {
diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index c48e3c9..35513f1 100644
--- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -26,7 +26,7 @@ using namespace PatternMatch;
/// where it is known to be non-zero. If this allows us to simplify the
/// computation, do so and return the new operand, otherwise return null.
static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC,
- Instruction *CxtI) {
+ Instruction &CxtI) {
// If V has multiple uses, then we would have to do more analysis to determine
// if this is safe. For example, the use could be in dynamically unreached
// code.
@@ -47,8 +47,8 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC,
// inexact. Similarly for <<.
if (BinaryOperator *I = dyn_cast<BinaryOperator>(V))
if (I->isLogicalShift() &&
- isKnownToBeAPowerOfTwo(I->getOperand(0), false, 0,
- IC.getAssumptionCache(), CxtI,
+ isKnownToBeAPowerOfTwo(I->getOperand(0), IC.getDataLayout(), false, 0,
+ IC.getAssumptionCache(), &CxtI,
IC.getDominatorTree())) {
// We know that this is an exact/nuw shift and that the input is a
// non-zero context as well.
@@ -126,7 +126,7 @@ static Constant *getLogBase2Vector(ConstantDataVector *CV) {
/// \brief Return true if we can prove that:
/// (mul LHS, RHS) === (mul nsw LHS, RHS)
bool InstCombiner::WillNotOverflowSignedMul(Value *LHS, Value *RHS,
- Instruction *CxtI) {
+ Instruction &CxtI) {
// Multiplying n * m significant bits yields a result of n + m significant
// bits. If the total number of significant bits does not exceed the
// result bit width (minus 1), there is no overflow.
@@ -137,8 +137,8 @@ bool InstCombiner::WillNotOverflowSignedMul(Value *LHS, Value *RHS,
// Note that underestimating the number of sign bits gives a more
// conservative answer.
- unsigned SignBits = ComputeNumSignBits(LHS, 0, CxtI) +
- ComputeNumSignBits(RHS, 0, CxtI);
+ unsigned SignBits =
+ ComputeNumSignBits(LHS, 0, &CxtI) + ComputeNumSignBits(RHS, 0, &CxtI);
// First handle the easy case: if we have enough sign bits there's
// definitely no overflow.
@@ -157,8 +157,8 @@ bool InstCombiner::WillNotOverflowSignedMul(Value *LHS, Value *RHS,
// For simplicity we just check if at least one side is not negative.
bool LHSNonNegative, LHSNegative;
bool RHSNonNegative, RHSNegative;
- ComputeSignBit(LHS, LHSNonNegative, LHSNegative, /*Depth=*/0, CxtI);
- ComputeSignBit(RHS, RHSNonNegative, RHSNegative, /*Depth=*/0, CxtI);
+ ComputeSignBit(LHS, LHSNonNegative, LHSNegative, /*Depth=*/0, &CxtI);
+ ComputeSignBit(RHS, RHSNonNegative, RHSNegative, /*Depth=*/0, &CxtI);
if (LHSNonNegative || RHSNonNegative)
return true;
}
@@ -375,7 +375,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
}
}
- if (!I.hasNoSignedWrap() && WillNotOverflowSignedMul(Op0, Op1, &I)) {
+ if (!I.hasNoSignedWrap() && WillNotOverflowSignedMul(Op0, Op1, I)) {
Changed = true;
I.setHasNoSignedWrap(true);
}
@@ -422,7 +422,7 @@ static bool isFiniteNonZeroFp(Constant *C) {
if (C->getType()->isVectorTy()) {
for (unsigned I = 0, E = C->getType()->getVectorNumElements(); I != E;
++I) {
- ConstantFP *CFP = dyn_cast<ConstantFP>(C->getAggregateElement(I));
+ ConstantFP *CFP = dyn_cast_or_null<ConstantFP>(C->getAggregateElement(I));
if (!CFP || !CFP->getValueAPF().isFiniteNonZero())
return false;
}
@@ -437,7 +437,7 @@ static bool isNormalFp(Constant *C) {
if (C->getType()->isVectorTy()) {
for (unsigned I = 0, E = C->getType()->getVectorNumElements(); I != E;
++I) {
- ConstantFP *CFP = dyn_cast<ConstantFP>(C->getAggregateElement(I));
+ ConstantFP *CFP = dyn_cast_or_null<ConstantFP>(C->getAggregateElement(I));
if (!CFP || !CFP->getValueAPF().isNormal())
return false;
}
@@ -780,7 +780,7 @@ Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
// The RHS is known non-zero.
- if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, &I)) {
+ if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, I)) {
I.setOperand(1, V);
return &I;
}
@@ -1155,7 +1155,7 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) {
return BO;
}
- if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, AC, &I, DT)) {
+ if (isKnownToBeAPowerOfTwo(Op1, DL, /*OrZero*/ true, 0, AC, &I, DT)) {
// X sdiv (1 << Y) -> X udiv (1 << Y) ( -> X u>> Y)
// Safe because the only negative value (1 << Y) can take on is
// INT_MIN, and X sdiv INT_MIN == X udiv INT_MIN == 0 if X doesn't have
@@ -1338,7 +1338,7 @@ Instruction *InstCombiner::commonIRemTransforms(BinaryOperator &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
// The RHS is known non-zero.
- if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, &I)) {
+ if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, I)) {
I.setOperand(1, V);
return &I;
}
@@ -1385,7 +1385,7 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) {
I.getType());
// X urem Y -> X and Y-1, where Y is a power of 2,
- if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, AC, &I, DT)) {
+ if (isKnownToBeAPowerOfTwo(Op1, DL, /*OrZero*/ true, 0, AC, &I, DT)) {
Constant *N1 = Constant::getAllOnesValue(I.getType());
Value *Add = Builder->CreateAdd(Op1, N1);
return BinaryOperator::CreateAnd(Op0, Add);
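// Illustrative sketch (hypothetical IR): when %y is known to be a power of
// two (or zero, in which case the urem is undefined anyway),
//   %r = urem i32 %x, %y
// becomes
//   %m = add i32 %y, -1
//   %r = and i32 %x, %m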
diff --git a/lib/Transforms/InstCombine/InstCombinePHI.cpp b/lib/Transforms/InstCombine/InstCombinePHI.cpp
index 0e73db8..ca2caed 100644
--- a/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -15,7 +15,6 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/IR/DataLayout.h"
using namespace llvm;
#define DEBUG_TYPE "instcombine"
@@ -231,7 +230,8 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) {
Value *Base = FixedOperands[0];
GetElementPtrInst *NewGEP =
- GetElementPtrInst::Create(Base, makeArrayRef(FixedOperands).slice(1));
+ GetElementPtrInst::Create(FirstInst->getSourceElementType(), Base,
+ makeArrayRef(FixedOperands).slice(1));
if (AllInBounds) NewGEP->setIsInBounds();
NewGEP->setDebugLoc(FirstInst->getDebugLoc());
return NewGEP;
@@ -891,8 +891,8 @@ Instruction *InstCombiner::visitPHINode(PHINode &PN) {
// it is only used by trunc or trunc(lshr) operations. If so, we split the
// PHI into the various pieces being extracted. This sort of thing is
// introduced when SROA promotes an aggregate to a single large integer type.
- if (PN.getType()->isIntegerTy() && DL &&
- !DL->isLegalInteger(PN.getType()->getPrimitiveSizeInBits()))
+ if (PN.getType()->isIntegerTy() &&
+ !DL.isLegalInteger(PN.getType()->getPrimitiveSizeInBits()))
if (Instruction *Res = SliceUpIllegalIntegerPHI(PN))
return Res;
diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp
index dd0e65f..b28611f 100644
--- a/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -312,9 +312,9 @@ Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal,
/// SimplifyWithOpReplaced - See if V simplifies when its operand Op is
/// replaced with RepOp.
static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
- const DataLayout *TD,
const TargetLibraryInfo *TLI,
- DominatorTree *DT, AssumptionCache *AC) {
+ const DataLayout &DL, DominatorTree *DT,
+ AssumptionCache *AC) {
// Trivial replacement.
if (V == Op)
return RepOp;
@@ -326,18 +326,18 @@ static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
// If this is a binary operator, try to simplify it with the replaced op.
if (BinaryOperator *B = dyn_cast<BinaryOperator>(I)) {
if (B->getOperand(0) == Op)
- return SimplifyBinOp(B->getOpcode(), RepOp, B->getOperand(1), TD, TLI);
+ return SimplifyBinOp(B->getOpcode(), RepOp, B->getOperand(1), DL, TLI);
if (B->getOperand(1) == Op)
- return SimplifyBinOp(B->getOpcode(), B->getOperand(0), RepOp, TD, TLI);
+ return SimplifyBinOp(B->getOpcode(), B->getOperand(0), RepOp, DL, TLI);
}
// Same for CmpInsts.
if (CmpInst *C = dyn_cast<CmpInst>(I)) {
if (C->getOperand(0) == Op)
- return SimplifyCmpInst(C->getPredicate(), RepOp, C->getOperand(1), TD,
+ return SimplifyCmpInst(C->getPredicate(), RepOp, C->getOperand(1), DL,
TLI, DT, AC);
if (C->getOperand(1) == Op)
- return SimplifyCmpInst(C->getPredicate(), C->getOperand(0), RepOp, TD,
+ return SimplifyCmpInst(C->getPredicate(), C->getOperand(0), RepOp, DL,
TLI, DT, AC);
}
@@ -361,14 +361,14 @@ static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
if (ConstOps.size() == I->getNumOperands()) {
if (CmpInst *C = dyn_cast<CmpInst>(I))
return ConstantFoldCompareInstOperands(C->getPredicate(), ConstOps[0],
- ConstOps[1], TD, TLI);
+ ConstOps[1], DL, TLI);
if (LoadInst *LI = dyn_cast<LoadInst>(I))
if (!LI->isVolatile())
- return ConstantFoldLoadFromConstPtr(ConstOps[0], TD);
+ return ConstantFoldLoadFromConstPtr(ConstOps[0], DL);
- return ConstantFoldInstOperands(I->getOpcode(), I->getType(),
- ConstOps, TD, TLI);
+ return ConstantFoldInstOperands(I->getOpcode(), I->getType(), ConstOps,
+ DL, TLI);
}
}
@@ -635,25 +635,25 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI,
// arms of the select. See if substituting this value into the arm and
// simplifying the result yields the same value as the other arm.
if (Pred == ICmpInst::ICMP_EQ) {
- if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, DL, TLI, DT, AC) ==
+ if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, TLI, DL, DT, AC) ==
TrueVal ||
- SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, DL, TLI, DT, AC) ==
+ SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, TLI, DL, DT, AC) ==
TrueVal)
return ReplaceInstUsesWith(SI, FalseVal);
- if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, DL, TLI, DT, AC) ==
+ if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, TLI, DL, DT, AC) ==
FalseVal ||
- SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, DL, TLI, DT, AC) ==
+ SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, TLI, DL, DT, AC) ==
FalseVal)
return ReplaceInstUsesWith(SI, FalseVal);
} else if (Pred == ICmpInst::ICMP_NE) {
- if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, DL, TLI, DT, AC) ==
+ if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, TLI, DL, DT, AC) ==
FalseVal ||
- SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, DL, TLI, DT, AC) ==
+ SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, TLI, DL, DT, AC) ==
FalseVal)
return ReplaceInstUsesWith(SI, TrueVal);
- if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, DL, TLI, DT, AC) ==
+ if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, TLI, DL, DT, AC) ==
TrueVal ||
- SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, DL, TLI, DT, AC) ==
+ SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, TLI, DL, DT, AC) ==
TrueVal)
return ReplaceInstUsesWith(SI, TrueVal);
}
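// Illustrative sketch of the substitution above (hypothetical IR): in
//   %c = icmp eq i32 %a, %b
//   %s = select i1 %c, i32 %b, i32 %a
// replacing %a with %b in the false arm yields the true arm, so the select
// folds away and all uses of %s are replaced by %a (when %c is true,
// %a == %b anyway).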
@@ -927,7 +927,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
return BinaryOperator::CreateAnd(NotCond, FalseVal);
}
if (ConstantInt *C = dyn_cast<ConstantInt>(FalseVal)) {
- if (C->getZExtValue() == false) {
+ if (!C->getZExtValue()) {
// Change: A = select B, C, false --> A = and B, C
return BinaryOperator::CreateAnd(CondVal, TrueVal);
}
@@ -1203,37 +1203,41 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
return NV;
if (SelectInst *TrueSI = dyn_cast<SelectInst>(TrueVal)) {
- // select(C, select(C, a, b), c) -> select(C, a, c)
- if (TrueSI->getCondition() == CondVal) {
- if (SI.getTrueValue() == TrueSI->getTrueValue())
- return nullptr;
- SI.setOperand(1, TrueSI->getTrueValue());
- return &SI;
- }
- // select(C0, select(C1, a, b), b) -> select(C0&C1, a, b)
- // We choose this as normal form to enable folding on the And and shortening
- // paths for the values (this helps GetUnderlyingObjects() for example).
- if (TrueSI->getFalseValue() == FalseVal && TrueSI->hasOneUse()) {
- Value *And = Builder->CreateAnd(CondVal, TrueSI->getCondition());
- SI.setOperand(0, And);
- SI.setOperand(1, TrueSI->getTrueValue());
- return &SI;
+ if (TrueSI->getCondition()->getType() == CondVal->getType()) {
+ // select(C, select(C, a, b), c) -> select(C, a, c)
+ if (TrueSI->getCondition() == CondVal) {
+ if (SI.getTrueValue() == TrueSI->getTrueValue())
+ return nullptr;
+ SI.setOperand(1, TrueSI->getTrueValue());
+ return &SI;
+ }
+ // select(C0, select(C1, a, b), b) -> select(C0&C1, a, b)
+ // We choose this as the normal form to enable folding on the And and to
+ // shorten paths for the values (this helps GetUnderlyingObjects(), for
+ // example).
+ if (TrueSI->getFalseValue() == FalseVal && TrueSI->hasOneUse()) {
+ Value *And = Builder->CreateAnd(CondVal, TrueSI->getCondition());
+ SI.setOperand(0, And);
+ SI.setOperand(1, TrueSI->getTrueValue());
+ return &SI;
+ }
}
}
if (SelectInst *FalseSI = dyn_cast<SelectInst>(FalseVal)) {
- // select(C, a, select(C, b, c)) -> select(C, a, c)
- if (FalseSI->getCondition() == CondVal) {
- if (SI.getFalseValue() == FalseSI->getFalseValue())
- return nullptr;
- SI.setOperand(2, FalseSI->getFalseValue());
- return &SI;
- }
- // select(C0, a, select(C1, a, b)) -> select(C0|C1, a, b)
- if (FalseSI->getTrueValue() == TrueVal && FalseSI->hasOneUse()) {
- Value *Or = Builder->CreateOr(CondVal, FalseSI->getCondition());
- SI.setOperand(0, Or);
- SI.setOperand(2, FalseSI->getFalseValue());
- return &SI;
+ if (FalseSI->getCondition()->getType() == CondVal->getType()) {
+ // select(C, a, select(C, b, c)) -> select(C, a, c)
+ if (FalseSI->getCondition() == CondVal) {
+ if (SI.getFalseValue() == FalseSI->getFalseValue())
+ return nullptr;
+ SI.setOperand(2, FalseSI->getFalseValue());
+ return &SI;
+ }
+ // select(C0, a, select(C1, a, b)) -> select(C0|C1, a, b)
+ if (FalseSI->getTrueValue() == TrueVal && FalseSI->hasOneUse()) {
+ Value *Or = Builder->CreateOr(CondVal, FalseSI->getCondition());
+ SI.setOperand(0, Or);
+ SI.setOperand(2, FalseSI->getFalseValue());
+ return &SI;
+ }
}
}
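// Illustrative sketch of the one-use fold above (hypothetical IR):
//   %inner = select i1 %c1, i32 %a, i32 %b      ; single use
//   %outer = select i1 %c0, i32 %inner, i32 %b
// becomes
//   %and   = and i1 %c0, %c1
//   %outer = select i1 %and, i32 %a, i32 %b
// The new condition-type check merely ensures %c0 and %c1 are both scalar i1
// (or both the same vector of i1) before they are and'ed/or'ed together.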
diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp
index b4976e0..a414ec6 100644
--- a/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -187,7 +187,7 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift,
/// GetShiftedValue - When CanEvaluateShifted returned true for an expression,
/// this function inserts the new computation that produces the shifted value.
static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
- InstCombiner &IC) {
+ InstCombiner &IC, const DataLayout &DL) {
// We can always evaluate constants shifted.
if (Constant *C = dyn_cast<Constant>(V)) {
if (isLeftShift)
@@ -196,8 +196,7 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
V = IC.Builder->CreateLShr(C, NumBits);
// If we got a constantexpr back, try to simplify it with DL info.
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
- V = ConstantFoldConstantExpression(CE, IC.getDataLayout(),
- IC.getTargetLibraryInfo());
+ V = ConstantFoldConstantExpression(CE, DL, IC.getTargetLibraryInfo());
return V;
}
@@ -210,8 +209,10 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
case Instruction::Or:
case Instruction::Xor:
// Bitwise operators can all be arbitrarily evaluated shifted.
- I->setOperand(0, GetShiftedValue(I->getOperand(0), NumBits,isLeftShift,IC));
- I->setOperand(1, GetShiftedValue(I->getOperand(1), NumBits,isLeftShift,IC));
+ I->setOperand(
+ 0, GetShiftedValue(I->getOperand(0), NumBits, isLeftShift, IC, DL));
+ I->setOperand(
+ 1, GetShiftedValue(I->getOperand(1), NumBits, isLeftShift, IC, DL));
return I;
case Instruction::Shl: {
@@ -297,8 +298,10 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
}
case Instruction::Select:
- I->setOperand(1, GetShiftedValue(I->getOperand(1), NumBits,isLeftShift,IC));
- I->setOperand(2, GetShiftedValue(I->getOperand(2), NumBits,isLeftShift,IC));
+ I->setOperand(
+ 1, GetShiftedValue(I->getOperand(1), NumBits, isLeftShift, IC, DL));
+ I->setOperand(
+ 2, GetShiftedValue(I->getOperand(2), NumBits, isLeftShift, IC, DL));
return I;
case Instruction::PHI: {
// We can change a phi if we can change all operands. Note that we never
@@ -306,8 +309,8 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
// instructions with a single use.
PHINode *PN = cast<PHINode>(I);
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- PN->setIncomingValue(i, GetShiftedValue(PN->getIncomingValue(i),
- NumBits, isLeftShift, IC));
+ PN->setIncomingValue(i, GetShiftedValue(PN->getIncomingValue(i), NumBits,
+ isLeftShift, IC, DL));
return PN;
}
}
@@ -337,8 +340,8 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
DEBUG(dbgs() << "ICE: GetShiftedValue propagating shift through expression"
" to eliminate shift:\n IN: " << *Op0 << "\n SH: " << I <<"\n");
- return ReplaceInstUsesWith(I,
- GetShiftedValue(Op0, COp1->getZExtValue(), isLeftShift, *this));
+ return ReplaceInstUsesWith(
+ I, GetShiftedValue(Op0, COp1->getZExtValue(), isLeftShift, *this, DL));
}
// See if we can simplify any instructions used by the instruction whose sole
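// Illustrative sketch of GetShiftedValue (hypothetical IR): for
//   %s = lshr i32 %x, 5
//   %t = or i32 %s, 7
//   %r = shl i32 %t, 5
// the shift is pushed through the expression tree and disappears:
//   %m = and i32 %x, -32        ; shl(lshr(%x, 5), 5) clears the low 5 bits
//   %r = or i32 %m, 224         ; 7 << 5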
diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index c5603aa..cd391d0 100644
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -13,7 +13,6 @@
//===----------------------------------------------------------------------===//
#include "InstCombineInternal.h"
-#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
@@ -70,8 +69,8 @@ bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) {
APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
APInt DemandedMask(APInt::getAllOnesValue(BitWidth));
- Value *V = SimplifyDemandedUseBits(&Inst, DemandedMask,
- KnownZero, KnownOne, 0, &Inst);
+ Value *V = SimplifyDemandedUseBits(&Inst, DemandedMask, KnownZero, KnownOne,
+ 0, &Inst);
if (!V) return false;
if (V == &Inst) return true;
ReplaceInstUsesWith(Inst, V);
@@ -84,9 +83,9 @@ bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) {
bool InstCombiner::SimplifyDemandedBits(Use &U, APInt DemandedMask,
APInt &KnownZero, APInt &KnownOne,
unsigned Depth) {
- Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask,
- KnownZero, KnownOne, Depth,
- dyn_cast<Instruction>(U.getUser()));
+ Value *NewVal =
+ SimplifyDemandedUseBits(U.get(), DemandedMask, KnownZero, KnownOne, Depth,
+ dyn_cast<Instruction>(U.getUser()));
if (!NewVal) return false;
U = NewVal;
return true;
@@ -122,15 +121,12 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
assert(Depth <= 6 && "Limit Search Depth");
uint32_t BitWidth = DemandedMask.getBitWidth();
Type *VTy = V->getType();
- assert((DL || !VTy->isPointerTy()) &&
- "SimplifyDemandedBits needs to know bit widths!");
- assert((!DL || DL->getTypeSizeInBits(VTy->getScalarType()) == BitWidth) &&
- (!VTy->isIntOrIntVectorTy() ||
- VTy->getScalarSizeInBits() == BitWidth) &&
- KnownZero.getBitWidth() == BitWidth &&
- KnownOne.getBitWidth() == BitWidth &&
- "Value *V, DemandedMask, KnownZero and KnownOne "
- "must have same BitWidth");
+ assert(
+ (!VTy->isIntOrIntVectorTy() || VTy->getScalarSizeInBits() == BitWidth) &&
+ KnownZero.getBitWidth() == BitWidth &&
+ KnownOne.getBitWidth() == BitWidth &&
+ "Value *V, DemandedMask, KnownZero and KnownOne "
+ "must have same BitWidth");
if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
// We know all of the bits for a constant!
KnownOne = CI->getValue() & DemandedMask;
@@ -174,9 +170,9 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// this instruction has a simpler value in that context.
if (I->getOpcode() == Instruction::And) {
// If either the LHS or the RHS are Zero, the result is zero.
- computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1,
+ computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth + 1,
CxtI);
- computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1,
+ computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth + 1,
CxtI);
// If all of the demanded bits are known 1 on one side, return the other.
@@ -198,9 +194,9 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// only bits from X or Y are demanded.
// If either the LHS or the RHS are One, the result is One.
- computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1,
+ computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth + 1,
CxtI);
- computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1,
+ computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth + 1,
CxtI);
// If all of the demanded bits are known zero on one side, return the
@@ -225,9 +221,9 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// We can simplify (X^Y) -> X or Y in the user's context if we know that
// only bits from X or Y are demanded.
- computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1,
+ computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth + 1,
CxtI);
- computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1,
+ computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth + 1,
CxtI);
// If all of the demanded bits are known zero on one side, return the
@@ -256,10 +252,10 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
break;
case Instruction::And:
// If either the LHS or the RHS are Zero, the result is zero.
- if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask,
- RHSKnownZero, RHSKnownOne, Depth+1) ||
+ if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, RHSKnownZero,
+ RHSKnownOne, Depth + 1) ||
SimplifyDemandedBits(I->getOperandUse(0), DemandedMask & ~RHSKnownZero,
- LHSKnownZero, LHSKnownOne, Depth+1))
+ LHSKnownZero, LHSKnownOne, Depth + 1))
return I;
assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
@@ -294,10 +290,10 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
break;
case Instruction::Or:
// If either the LHS or the RHS are One, the result is One.
- if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask,
- RHSKnownZero, RHSKnownOne, Depth+1) ||
+ if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, RHSKnownZero,
+ RHSKnownOne, Depth + 1) ||
SimplifyDemandedBits(I->getOperandUse(0), DemandedMask & ~RHSKnownOne,
- LHSKnownZero, LHSKnownOne, Depth+1))
+ LHSKnownZero, LHSKnownOne, Depth + 1))
return I;
assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
@@ -336,10 +332,10 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
KnownOne = RHSKnownOne | LHSKnownOne;
break;
case Instruction::Xor: {
- if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask,
- RHSKnownZero, RHSKnownOne, Depth+1) ||
- SimplifyDemandedBits(I->getOperandUse(0), DemandedMask,
- LHSKnownZero, LHSKnownOne, Depth+1))
+ if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, RHSKnownZero,
+ RHSKnownOne, Depth + 1) ||
+ SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, LHSKnownZero,
+ LHSKnownOne, Depth + 1))
return I;
assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
@@ -423,10 +419,10 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
break;
}
case Instruction::Select:
- if (SimplifyDemandedBits(I->getOperandUse(2), DemandedMask,
- RHSKnownZero, RHSKnownOne, Depth+1) ||
- SimplifyDemandedBits(I->getOperandUse(1), DemandedMask,
- LHSKnownZero, LHSKnownOne, Depth+1))
+ if (SimplifyDemandedBits(I->getOperandUse(2), DemandedMask, RHSKnownZero,
+ RHSKnownOne, Depth + 1) ||
+ SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, LHSKnownZero,
+ LHSKnownOne, Depth + 1))
return I;
assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
@@ -445,8 +441,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
DemandedMask = DemandedMask.zext(truncBf);
KnownZero = KnownZero.zext(truncBf);
KnownOne = KnownOne.zext(truncBf);
- if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask,
- KnownZero, KnownOne, Depth+1))
+ if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, KnownZero,
+ KnownOne, Depth + 1))
return I;
DemandedMask = DemandedMask.trunc(BitWidth);
KnownZero = KnownZero.trunc(BitWidth);
@@ -471,8 +467,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// Don't touch a vector-to-scalar bitcast.
return nullptr;
- if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask,
- KnownZero, KnownOne, Depth+1))
+ if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, KnownZero,
+ KnownOne, Depth + 1))
return I;
assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
break;
@@ -483,8 +479,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
DemandedMask = DemandedMask.trunc(SrcBitWidth);
KnownZero = KnownZero.trunc(SrcBitWidth);
KnownOne = KnownOne.trunc(SrcBitWidth);
- if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask,
- KnownZero, KnownOne, Depth+1))
+ if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, KnownZero,
+ KnownOne, Depth + 1))
return I;
DemandedMask = DemandedMask.zext(BitWidth);
KnownZero = KnownZero.zext(BitWidth);
@@ -510,8 +506,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
InputDemandedBits = InputDemandedBits.trunc(SrcBitWidth);
KnownZero = KnownZero.trunc(SrcBitWidth);
KnownOne = KnownOne.trunc(SrcBitWidth);
- if (SimplifyDemandedBits(I->getOperandUse(0), InputDemandedBits,
- KnownZero, KnownOne, Depth+1))
+ if (SimplifyDemandedBits(I->getOperandUse(0), InputDemandedBits, KnownZero,
+ KnownOne, Depth + 1))
return I;
InputDemandedBits = InputDemandedBits.zext(BitWidth);
KnownZero = KnownZero.zext(BitWidth);
@@ -552,7 +548,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// Find information about known zero/one bits in the input.
if (SimplifyDemandedBits(I->getOperandUse(0), InDemandedBits,
- LHSKnownZero, LHSKnownOne, Depth+1))
+ LHSKnownZero, LHSKnownOne, Depth + 1))
return I;
// If the RHS of the add has bits set that can't affect the input, reduce
@@ -602,9 +598,9 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// significant bit and all those below it.
APInt DemandedFromOps(APInt::getLowBitsSet(BitWidth, BitWidth-NLZ));
if (SimplifyDemandedBits(I->getOperandUse(0), DemandedFromOps,
- LHSKnownZero, LHSKnownOne, Depth+1) ||
+ LHSKnownZero, LHSKnownOne, Depth + 1) ||
SimplifyDemandedBits(I->getOperandUse(1), DemandedFromOps,
- LHSKnownZero, LHSKnownOne, Depth+1))
+ LHSKnownZero, LHSKnownOne, Depth + 1))
return I;
}
}
@@ -619,9 +615,9 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
uint32_t NLZ = DemandedMask.countLeadingZeros();
APInt DemandedFromOps(APInt::getLowBitsSet(BitWidth, BitWidth-NLZ));
if (SimplifyDemandedBits(I->getOperandUse(0), DemandedFromOps,
- LHSKnownZero, LHSKnownOne, Depth+1) ||
+ LHSKnownZero, LHSKnownOne, Depth + 1) ||
SimplifyDemandedBits(I->getOperandUse(1), DemandedFromOps,
- LHSKnownZero, LHSKnownOne, Depth+1))
+ LHSKnownZero, LHSKnownOne, Depth + 1))
return I;
}
@@ -662,8 +658,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
else if (IOp->hasNoUnsignedWrap())
DemandedMaskIn |= APInt::getHighBitsSet(BitWidth, ShiftAmt);
- if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn,
- KnownZero, KnownOne, Depth+1))
+ if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn, KnownZero,
+ KnownOne, Depth + 1))
return I;
assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
KnownZero <<= ShiftAmt;
@@ -686,8 +682,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
if (cast<LShrOperator>(I)->isExact())
DemandedMaskIn |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
- if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn,
- KnownZero, KnownOne, Depth+1))
+ if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn, KnownZero,
+ KnownOne, Depth + 1))
return I;
assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
KnownZero = APIntOps::lshr(KnownZero, ShiftAmt);
@@ -731,8 +727,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
if (cast<AShrOperator>(I)->isExact())
DemandedMaskIn |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
- if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn,
- KnownZero, KnownOne, Depth+1))
+ if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn, KnownZero,
+ KnownOne, Depth + 1))
return I;
assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
// Compute the new bits that are at the top now.
@@ -772,8 +768,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
APInt LowBits = RA - 1;
APInt Mask2 = LowBits | APInt::getSignBit(BitWidth);
- if (SimplifyDemandedBits(I->getOperandUse(0), Mask2,
- LHSKnownZero, LHSKnownOne, Depth+1))
+ if (SimplifyDemandedBits(I->getOperandUse(0), Mask2, LHSKnownZero,
+ LHSKnownOne, Depth + 1))
return I;
// The low bits of LHS are unchanged by the srem.
@@ -798,7 +794,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// remainder is zero.
if (DemandedMask.isNegative() && KnownZero.isNonNegative()) {
APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0);
- computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1,
+ computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth + 1,
CxtI);
// If it's known zero, our sign bit is also zero.
if (LHSKnownZero.isNegative())
@@ -808,10 +804,10 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
case Instruction::URem: {
APInt KnownZero2(BitWidth, 0), KnownOne2(BitWidth, 0);
APInt AllOnes = APInt::getAllOnesValue(BitWidth);
- if (SimplifyDemandedBits(I->getOperandUse(0), AllOnes,
- KnownZero2, KnownOne2, Depth+1) ||
- SimplifyDemandedBits(I->getOperandUse(1), AllOnes,
- KnownZero2, KnownOne2, Depth+1))
+ if (SimplifyDemandedBits(I->getOperandUse(0), AllOnes, KnownZero2,
+ KnownOne2, Depth + 1) ||
+ SimplifyDemandedBits(I->getOperandUse(1), AllOnes, KnownZero2,
+ KnownOne2, Depth + 1))
return I;
unsigned Leaders = KnownZero2.countLeadingOnes();
@@ -1051,7 +1047,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
// Note that we can't propagate undef elt info, because we don't know
// which elt is getting updated.
TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts,
- UndefElts2, Depth+1);
+ UndefElts2, Depth + 1);
if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
break;
}
@@ -1069,7 +1065,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
APInt DemandedElts2 = DemandedElts;
DemandedElts2.clearBit(IdxNo);
TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts2,
- UndefElts, Depth+1);
+ UndefElts, Depth + 1);
if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
// The inserted element is defined.
@@ -1097,12 +1093,12 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
APInt UndefElts4(LHSVWidth, 0);
TmpV = SimplifyDemandedVectorElts(I->getOperand(0), LeftDemanded,
- UndefElts4, Depth+1);
+ UndefElts4, Depth + 1);
if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
APInt UndefElts3(LHSVWidth, 0);
TmpV = SimplifyDemandedVectorElts(I->getOperand(1), RightDemanded,
- UndefElts3, Depth+1);
+ UndefElts3, Depth + 1);
if (TmpV) { I->setOperand(1, TmpV); MadeChange = true; }
bool NewUndefElts = false;
@@ -1152,12 +1148,12 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
}
}
- TmpV = SimplifyDemandedVectorElts(I->getOperand(1), LeftDemanded,
- UndefElts, Depth+1);
+ TmpV = SimplifyDemandedVectorElts(I->getOperand(1), LeftDemanded, UndefElts,
+ Depth + 1);
if (TmpV) { I->setOperand(1, TmpV); MadeChange = true; }
TmpV = SimplifyDemandedVectorElts(I->getOperand(2), RightDemanded,
- UndefElts2, Depth+1);
+ UndefElts2, Depth + 1);
if (TmpV) { I->setOperand(2, TmpV); MadeChange = true; }
// Output elements are undefined if both are undefined.
@@ -1204,7 +1200,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
// div/rem demand all inputs, because they don't want divide by zero.
TmpV = SimplifyDemandedVectorElts(I->getOperand(0), InputDemandedElts,
- UndefElts2, Depth+1);
+ UndefElts2, Depth + 1);
if (TmpV) {
I->setOperand(0, TmpV);
MadeChange = true;
@@ -1238,11 +1234,11 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
case Instruction::Sub:
case Instruction::Mul:
// These binary ops demand all elements of both inputs.
- TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts,
- UndefElts, Depth+1);
+ TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts, UndefElts,
+ Depth + 1);
if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
TmpV = SimplifyDemandedVectorElts(I->getOperand(1), DemandedElts,
- UndefElts2, Depth+1);
+ UndefElts2, Depth + 1);
if (TmpV) { I->setOperand(1, TmpV); MadeChange = true; }
// Output elements are undefined if both are undefined. Consider things
@@ -1251,8 +1247,8 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
break;
case Instruction::FPTrunc:
case Instruction::FPExt:
- TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts,
- UndefElts, Depth+1);
+ TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts, UndefElts,
+ Depth + 1);
if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
break;
@@ -1273,10 +1269,10 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
case Intrinsic::x86_sse2_min_sd:
case Intrinsic::x86_sse2_max_sd:
TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
- UndefElts, Depth+1);
+ UndefElts, Depth + 1);
if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
TmpV = SimplifyDemandedVectorElts(II->getArgOperand(1), DemandedElts,
- UndefElts2, Depth+1);
+ UndefElts2, Depth + 1);
if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; }
// If only the low elt is demanded and this is a scalarizable intrinsic,
diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index e07efb5..b6beb65 100644
--- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -202,8 +202,8 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
APInt UndefElts(VectorWidth, 0);
APInt DemandedMask(VectorWidth, 0);
DemandedMask.setBit(IndexVal);
- if (Value *V = SimplifyDemandedVectorElts(EI.getOperand(0),
- DemandedMask, UndefElts)) {
+ if (Value *V = SimplifyDemandedVectorElts(EI.getOperand(0), DemandedMask,
+ UndefElts)) {
EI.setOperand(0, V);
return &EI;
}
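// Illustrative sketch of this demanded-elements query (hypothetical IR):
//   %v = insertelement <4 x float> %w, float %f, i32 2
//   %e = extractelement <4 x float> %v, i32 0
// Only lane 0 is demanded, so the insert into lane 2 is dead and the
// extract reads straight from %w:
//   %e = extractelement <4 x float> %w, i32 0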
@@ -733,7 +733,8 @@ static Value *BuildNew(Instruction *I, ArrayRef<Value*> NewOps) {
case Instruction::GetElementPtr: {
Value *Ptr = NewOps[0];
ArrayRef<Value*> Idx = NewOps.slice(1);
- GetElementPtrInst *GEP = GetElementPtrInst::Create(Ptr, Idx, "", I);
+ GetElementPtrInst *GEP = GetElementPtrInst::Create(
+ cast<GetElementPtrInst>(I)->getSourceElementType(), Ptr, Idx, "", I);
GEP->setIsInBounds(cast<GetElementPtrInst>(I)->isInBounds());
return GEP;
}
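// Note on the API shape used above (a sketch, not part of this patch):
// GetElementPtrInst::Create now takes the source element type explicitly
// instead of deriving it from the pointer operand, i.e. roughly
//   GetElementPtrInst::Create(SrcEltTy, Ptr, IdxList, "name", InsertBefore);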
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index 88fcd53..90551e4 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -57,6 +57,7 @@
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
@@ -75,7 +76,7 @@ STATISTIC(NumFactor , "Number of factorizations");
STATISTIC(NumReassoc , "Number of reassociations");
Value *InstCombiner::EmitGEPOffset(User *GEP) {
- return llvm::EmitGEPOffset(Builder, *getDataLayout(), GEP);
+ return llvm::EmitGEPOffset(Builder, DL, GEP);
}
/// ShouldChangeType - Return true if it is desirable to convert a computation
@@ -84,13 +85,10 @@ Value *InstCombiner::EmitGEPOffset(User *GEP) {
bool InstCombiner::ShouldChangeType(Type *From, Type *To) const {
assert(From->isIntegerTy() && To->isIntegerTy());
- // If we don't have DL, we don't know if the source/dest are legal.
- if (!DL) return false;
-
unsigned FromWidth = From->getPrimitiveSizeInBits();
unsigned ToWidth = To->getPrimitiveSizeInBits();
- bool FromLegal = DL->isLegalInteger(FromWidth);
- bool ToLegal = DL->isLegalInteger(ToWidth);
+ bool FromLegal = DL.isLegalInteger(FromWidth);
+ bool ToLegal = DL.isLegalInteger(ToWidth);
// If this is a legal integer from type, and the result would be an illegal
// type, don't do the transformation.
@@ -445,7 +443,7 @@ getBinOpsForFactorization(Instruction::BinaryOps TopLevelOpcode,
/// This tries to simplify binary operations by factorizing out common terms
/// (e.g. "(A*B)+(A*C)" -> "A*(B+C)").
static Value *tryFactorization(InstCombiner::BuilderTy *Builder,
- const DataLayout *DL, BinaryOperator &I,
+ const DataLayout &DL, BinaryOperator &I,
Instruction::BinaryOps InnerOpcode, Value *A,
Value *B, Value *C, Value *D) {
@@ -872,12 +870,9 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
/// will land us at the specified offset. If so, fill them into NewIndices and
/// return the resultant element type, otherwise return null.
Type *InstCombiner::FindElementAtOffset(Type *PtrTy, int64_t Offset,
- SmallVectorImpl<Value*> &NewIndices) {
+ SmallVectorImpl<Value *> &NewIndices) {
assert(PtrTy->isPtrOrPtrVectorTy());
- if (!DL)
- return nullptr;
-
Type *Ty = PtrTy->getPointerElementType();
if (!Ty->isSized())
return nullptr;
@@ -885,9 +880,9 @@ Type *InstCombiner::FindElementAtOffset(Type *PtrTy, int64_t Offset,
// Start with the index over the outer type. Note that the type size
// might be zero (even if the offset isn't zero) if the indexed type
// is something like [0 x {int, int}]
- Type *IntPtrTy = DL->getIntPtrType(PtrTy);
+ Type *IntPtrTy = DL.getIntPtrType(PtrTy);
int64_t FirstIdx = 0;
- if (int64_t TySize = DL->getTypeAllocSize(Ty)) {
+ if (int64_t TySize = DL.getTypeAllocSize(Ty)) {
FirstIdx = Offset/TySize;
Offset -= FirstIdx*TySize;
@@ -905,11 +900,11 @@ Type *InstCombiner::FindElementAtOffset(Type *PtrTy, int64_t Offset,
// Index into the types. If we fail, set OrigBase to null.
while (Offset) {
// Indexing into tail padding between struct/array elements.
- if (uint64_t(Offset*8) >= DL->getTypeSizeInBits(Ty))
+ if (uint64_t(Offset * 8) >= DL.getTypeSizeInBits(Ty))
return nullptr;
if (StructType *STy = dyn_cast<StructType>(Ty)) {
- const StructLayout *SL = DL->getStructLayout(STy);
+ const StructLayout *SL = DL.getStructLayout(STy);
assert(Offset < (int64_t)SL->getSizeInBytes() &&
"Offset must stay within the indexed type");
@@ -920,7 +915,7 @@ Type *InstCombiner::FindElementAtOffset(Type *PtrTy, int64_t Offset,
Offset -= SL->getElementOffset(Elt);
Ty = STy->getElementType(Elt);
} else if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) {
- uint64_t EltSize = DL->getTypeAllocSize(AT->getElementType());
+ uint64_t EltSize = DL.getTypeAllocSize(AT->getElementType());
assert(EltSize && "Cannot index into a zero-sized array");
NewIndices.push_back(ConstantInt::get(IntPtrTy,Offset/EltSize));
Offset %= EltSize;
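To make the index walk above concrete, here is a self-contained arithmetic sketch of the same decomposition. The sizes are assumptions for illustration: a typical data layout where {i32, i32} has alloc size 8 and element offsets {0, 4}.

#include <cstdint>
#include <cstdio>

int main() {
  // Decompose byte offset 20 over [0 x {i32, i32}], as the loop above does.
  int64_t Offset = 20;
  int64_t TySize = 8;                  // alloc size of {i32, i32} (assumed)
  int64_t FirstIdx = Offset / TySize;  // index over the outer type -> 2
  Offset -= FirstIdx * TySize;         // offset left inside the struct -> 4
  unsigned Elt = Offset >= 4 ? 1 : 0;  // struct element offsets are {0, 4}
  Offset -= Elt ? 4 : 0;               // -> 0, the walk terminates
  std::printf("indices: %lld, %u\n", (long long)FirstIdx, Elt);
  return 0;
}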
@@ -1214,7 +1209,8 @@ Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) {
// It may not be safe to reorder shuffles and things like div, urem, etc.
// because we may trap when executing those ops on unknown vector elements.
// See PR20059.
- if (!isSafeToSpeculativelyExecute(&Inst, DL)) return nullptr;
+ if (!isSafeToSpeculativelyExecute(&Inst))
+ return nullptr;
unsigned VWidth = cast<VectorType>(Inst.getType())->getNumElements();
Value *LHS = Inst.getOperand(0), *RHS = Inst.getOperand(1);
@@ -1300,37 +1296,37 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// Eliminate unneeded casts for indices, and replace indices which displace
// by multiples of a zero size type with zero.
- if (DL) {
- bool MadeChange = false;
- Type *IntPtrTy = DL->getIntPtrType(GEP.getPointerOperandType());
-
- gep_type_iterator GTI = gep_type_begin(GEP);
- for (User::op_iterator I = GEP.op_begin() + 1, E = GEP.op_end();
- I != E; ++I, ++GTI) {
- // Skip indices into struct types.
- SequentialType *SeqTy = dyn_cast<SequentialType>(*GTI);
- if (!SeqTy) continue;
-
- // If the element type has zero size then any index over it is equivalent
- // to an index of zero, so replace it with zero if it is not zero already.
- if (SeqTy->getElementType()->isSized() &&
- DL->getTypeAllocSize(SeqTy->getElementType()) == 0)
- if (!isa<Constant>(*I) || !cast<Constant>(*I)->isNullValue()) {
- *I = Constant::getNullValue(IntPtrTy);
- MadeChange = true;
- }
+ bool MadeChange = false;
+ Type *IntPtrTy = DL.getIntPtrType(GEP.getPointerOperandType());
+
+ gep_type_iterator GTI = gep_type_begin(GEP);
+ for (User::op_iterator I = GEP.op_begin() + 1, E = GEP.op_end(); I != E;
+ ++I, ++GTI) {
+ // Skip indices into struct types.
+ SequentialType *SeqTy = dyn_cast<SequentialType>(*GTI);
+ if (!SeqTy)
+ continue;
- Type *IndexTy = (*I)->getType();
- if (IndexTy != IntPtrTy) {
- // If we are using a wider index than needed for this platform, shrink
- // it to what we need. If narrower, sign-extend it to what we need.
- // This explicit cast can make subsequent optimizations more obvious.
- *I = Builder->CreateIntCast(*I, IntPtrTy, true);
+ // If the element type has zero size then any index over it is equivalent
+ // to an index of zero, so replace it with zero if it is not zero already.
+ if (SeqTy->getElementType()->isSized() &&
+ DL.getTypeAllocSize(SeqTy->getElementType()) == 0)
+ if (!isa<Constant>(*I) || !cast<Constant>(*I)->isNullValue()) {
+ *I = Constant::getNullValue(IntPtrTy);
MadeChange = true;
}
+
+ Type *IndexTy = (*I)->getType();
+ if (IndexTy != IntPtrTy) {
+ // If we are using a wider index than needed for this platform, shrink
+ // it to what we need. If narrower, sign-extend it to what we need.
+ // This explicit cast can make subsequent optimizations more obvious.
+ *I = Builder->CreateIntCast(*I, IntPtrTy, true);
+ MadeChange = true;
}
- if (MadeChange) return &GEP;
}
+ if (MadeChange)
+ return &GEP;
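A compilable sketch of the cast this loop inserts; the helper name is ours, and Builder/IntPtrTy stand in for the members used above. On a 64-bit target this turns an i32 GEP index into a sign-extended i64, so later offset arithmetic happens in one width.

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Widen (sext) or narrow (trunc) a GEP index to the pointer-sized integer.
static Value *canonicalizeGEPIndex(IRBuilder<> &Builder, Value *Idx,
                                   Type *IntPtrTy) {
  if (Idx->getType() == IntPtrTy)
    return Idx;
  return Builder.CreateIntCast(Idx, IntPtrTy, /*isSigned=*/true);
}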
// Check to see if the inputs to the PHI node are getelementptr instructions.
if (PHINode *PN = dyn_cast<PHINode>(PtrOp)) {
@@ -1338,6 +1334,15 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
if (!Op1)
return nullptr;
+ // Don't fold a GEP into itself through a PHI node. This can only happen
+ // through the back-edge of a loop. Folding a GEP into itself means that
+ // the value of the previous iteration needs to be stored in the meantime,
+ // thus requiring an additional register variable to be live, but not
+ // actually achieving anything (the GEP still needs to be executed once per
+ // loop iteration).
+ if (Op1 == &GEP)
+ return nullptr;
+
signed DI = -1;
for (auto I = PN->op_begin()+1, E = PN->op_end(); I !=E; ++I) {
@@ -1345,6 +1350,10 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
if (!Op2 || Op1->getNumOperands() != Op2->getNumOperands())
return nullptr;
+ // As for Op1 above, don't try to fold a GEP into itself.
+ if (Op2 == &GEP)
+ return nullptr;
+
// Keep track of the type as we walk the GEP.
Type *CurTy = Op1->getOperand(0)->getType()->getScalarType();
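The two early-outs added above amount to the following standalone predicate (a sketch; the function name is ours, not the patch's):

#include "llvm/IR/Instructions.h"
using namespace llvm;

// True if the PHI feeding this GEP carries the GEP's own value around a
// loop back-edge; folding then only extends a live range without removing
// the per-iteration GEP.
static bool gepFeedsItselfThroughPHI(const GetElementPtrInst &GEP,
                                     const PHINode &PN) {
  for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
    if (PN.getIncomingValue(i) == &GEP)
      return true;
  return false;
}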
@@ -1481,19 +1490,22 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
}
if (!Indices.empty())
- return (GEP.isInBounds() && Src->isInBounds()) ?
- GetElementPtrInst::CreateInBounds(Src->getOperand(0), Indices,
- GEP.getName()) :
- GetElementPtrInst::Create(Src->getOperand(0), Indices, GEP.getName());
+ return GEP.isInBounds() && Src->isInBounds()
+ ? GetElementPtrInst::CreateInBounds(
+ Src->getSourceElementType(), Src->getOperand(0), Indices,
+ GEP.getName())
+ : GetElementPtrInst::Create(Src->getSourceElementType(),
+ Src->getOperand(0), Indices,
+ GEP.getName());
}
- if (DL && GEP.getNumIndices() == 1) {
+ if (GEP.getNumIndices() == 1) {
unsigned AS = GEP.getPointerAddressSpace();
if (GEP.getOperand(1)->getType()->getScalarSizeInBits() ==
- DL->getPointerSizeInBits(AS)) {
+ DL.getPointerSizeInBits(AS)) {
Type *PtrTy = GEP.getPointerOperandType();
Type *Ty = PtrTy->getPointerElementType();
- uint64_t TyAllocSize = DL->getTypeAllocSize(Ty);
+ uint64_t TyAllocSize = DL.getTypeAllocSize(Ty);
bool Matched = false;
uint64_t C;
@@ -1562,8 +1574,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
if (CATy->getElementType() == StrippedPtrTy->getElementType()) {
// -> GEP i8* X, ...
SmallVector<Value*, 8> Idx(GEP.idx_begin()+1, GEP.idx_end());
- GetElementPtrInst *Res =
- GetElementPtrInst::Create(StrippedPtr, Idx, GEP.getName());
+ GetElementPtrInst *Res = GetElementPtrInst::Create(
+ StrippedPtrTy->getElementType(), StrippedPtr, Idx, GEP.getName());
Res->setIsInBounds(GEP.isInBounds());
if (StrippedPtrTy->getAddressSpace() == GEP.getAddressSpace())
return Res;
@@ -1599,9 +1611,12 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// %0 = GEP [10 x i8] addrspace(1)* X, ...
// addrspacecast i8 addrspace(1)* %0 to i8*
SmallVector<Value*, 8> Idx(GEP.idx_begin(), GEP.idx_end());
- Value *NewGEP = GEP.isInBounds() ?
- Builder->CreateInBoundsGEP(StrippedPtr, Idx, GEP.getName()) :
- Builder->CreateGEP(StrippedPtr, Idx, GEP.getName());
+ Value *NewGEP =
+ GEP.isInBounds()
+ ? Builder->CreateInBoundsGEP(StrippedPtr, Idx,
+ GEP.getName())
+ : Builder->CreateGEP(StrippedPtrTy->getElementType(),
+ StrippedPtr, Idx, GEP.getName());
return new AddrSpaceCastInst(NewGEP, GEP.getType());
}
}
@@ -1612,14 +1627,16 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// into: %t1 = getelementptr [2 x i32]* %str, i32 0, i32 %V; bitcast
Type *SrcElTy = StrippedPtrTy->getElementType();
Type *ResElTy = PtrOp->getType()->getPointerElementType();
- if (DL && SrcElTy->isArrayTy() &&
- DL->getTypeAllocSize(SrcElTy->getArrayElementType()) ==
- DL->getTypeAllocSize(ResElTy)) {
- Type *IdxType = DL->getIntPtrType(GEP.getType());
+ if (SrcElTy->isArrayTy() &&
+ DL.getTypeAllocSize(SrcElTy->getArrayElementType()) ==
+ DL.getTypeAllocSize(ResElTy)) {
+ Type *IdxType = DL.getIntPtrType(GEP.getType());
Value *Idx[2] = { Constant::getNullValue(IdxType), GEP.getOperand(1) };
- Value *NewGEP = GEP.isInBounds() ?
- Builder->CreateInBoundsGEP(StrippedPtr, Idx, GEP.getName()) :
- Builder->CreateGEP(StrippedPtr, Idx, GEP.getName());
+ Value *NewGEP =
+ GEP.isInBounds()
+ ? Builder->CreateInBoundsGEP(StrippedPtr, Idx, GEP.getName())
+ : Builder->CreateGEP(StrippedPtrTy->getElementType(),
+ StrippedPtr, Idx, GEP.getName());
// V and GEP are both pointer types --> BitCast
return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP,
@@ -1630,11 +1647,11 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// %V = mul i64 %N, 4
// %t = getelementptr i8* bitcast (i32* %arr to i8*), i32 %V
// into: %t1 = getelementptr i32* %arr, i32 %N; bitcast
- if (DL && ResElTy->isSized() && SrcElTy->isSized()) {
+ if (ResElTy->isSized() && SrcElTy->isSized()) {
// Check that changing the type amounts to dividing the index by a scale
// factor.
- uint64_t ResSize = DL->getTypeAllocSize(ResElTy);
- uint64_t SrcSize = DL->getTypeAllocSize(SrcElTy);
+ uint64_t ResSize = DL.getTypeAllocSize(ResElTy);
+ uint64_t SrcSize = DL.getTypeAllocSize(SrcElTy);
if (ResSize && SrcSize % ResSize == 0) {
Value *Idx = GEP.getOperand(1);
unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits();
@@ -1642,7 +1659,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// Earlier transforms ensure that the index has type IntPtrType, which
// considerably simplifies the logic by eliminating implicit casts.
- assert(Idx->getType() == DL->getIntPtrType(GEP.getType()) &&
+ assert(Idx->getType() == DL.getIntPtrType(GEP.getType()) &&
"Index not cast to pointer width?");
bool NSW;
@@ -1650,9 +1667,12 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// Successfully decomposed Idx as NewIdx * Scale, form a new GEP.
// If the multiplication NewIdx * Scale may overflow then the new
// GEP may not be "inbounds".
- Value *NewGEP = GEP.isInBounds() && NSW ?
- Builder->CreateInBoundsGEP(StrippedPtr, NewIdx, GEP.getName()) :
- Builder->CreateGEP(StrippedPtr, NewIdx, GEP.getName());
+ Value *NewGEP =
+ GEP.isInBounds() && NSW
+ ? Builder->CreateInBoundsGEP(StrippedPtr, NewIdx,
+ GEP.getName())
+ : Builder->CreateGEP(StrippedPtrTy->getElementType(),
+ StrippedPtr, NewIdx, GEP.getName());
// The NewGEP must be pointer typed, so must the old one -> BitCast
return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP,
@@ -1665,13 +1685,12 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// getelementptr i8* bitcast ([100 x double]* X to i8*), i32 %tmp
// (where tmp = 8*tmp2) into:
// getelementptr [100 x double]* %arr, i32 0, i32 %tmp2; bitcast
- if (DL && ResElTy->isSized() && SrcElTy->isSized() &&
- SrcElTy->isArrayTy()) {
+ if (ResElTy->isSized() && SrcElTy->isSized() && SrcElTy->isArrayTy()) {
// Check that changing to the array element type amounts to dividing the
// index by a scale factor.
- uint64_t ResSize = DL->getTypeAllocSize(ResElTy);
- uint64_t ArrayEltSize
- = DL->getTypeAllocSize(SrcElTy->getArrayElementType());
+ uint64_t ResSize = DL.getTypeAllocSize(ResElTy);
+ uint64_t ArrayEltSize =
+ DL.getTypeAllocSize(SrcElTy->getArrayElementType());
if (ResSize && ArrayEltSize % ResSize == 0) {
Value *Idx = GEP.getOperand(1);
unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits();
@@ -1679,7 +1698,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// Earlier transforms ensure that the index has type IntPtrType, which
// considerably simplifies the logic by eliminating implicit casts.
- assert(Idx->getType() == DL->getIntPtrType(GEP.getType()) &&
+ assert(Idx->getType() == DL.getIntPtrType(GEP.getType()) &&
"Index not cast to pointer width?");
bool NSW;
@@ -1688,13 +1707,12 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// If the multiplication NewIdx * Scale may overflow then the new
// GEP may not be "inbounds".
Value *Off[2] = {
- Constant::getNullValue(DL->getIntPtrType(GEP.getType())),
- NewIdx
- };
+ Constant::getNullValue(DL.getIntPtrType(GEP.getType())),
+ NewIdx};
Value *NewGEP = GEP.isInBounds() && NSW ?
Builder->CreateInBoundsGEP(StrippedPtr, Off, GEP.getName()) :
- Builder->CreateGEP(StrippedPtr, Off, GEP.getName());
+ Builder->CreateGEP(SrcElTy, StrippedPtr, Off, GEP.getName());
// The NewGEP must be pointer typed, so must the old one -> BitCast
return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP,
GEP.getType());
@@ -1704,9 +1722,6 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
}
}
- if (!DL)
- return nullptr;
-
// addrspacecast between types is canonicalized as a bitcast, then an
// addrspacecast. To take advantage of the below bitcast + struct GEP, look
// through the addrspacecast.
@@ -1727,10 +1742,10 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
if (BitCastInst *BCI = dyn_cast<BitCastInst>(PtrOp)) {
Value *Operand = BCI->getOperand(0);
PointerType *OpType = cast<PointerType>(Operand->getType());
- unsigned OffsetBits = DL->getPointerTypeSizeInBits(GEP.getType());
+ unsigned OffsetBits = DL.getPointerTypeSizeInBits(GEP.getType());
APInt Offset(OffsetBits, 0);
if (!isa<BitCastInst>(Operand) &&
- GEP.accumulateConstantOffset(*DL, Offset)) {
+ GEP.accumulateConstantOffset(DL, Offset)) {
// If this GEP instruction doesn't move the pointer, just replace the GEP
// with a bitcast of the real input to the dest type.
@@ -1761,7 +1776,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
if (FindElementAtOffset(OpType, Offset.getSExtValue(), NewIndices)) {
Value *NGEP = GEP.isInBounds() ?
Builder->CreateInBoundsGEP(Operand, NewIndices) :
- Builder->CreateGEP(Operand, NewIndices);
+ Builder->CreateGEP(OpType->getElementType(), Operand, NewIndices);
if (NGEP->getType() == GEP.getType())
return ReplaceInstUsesWith(GEP, NGEP);
@@ -2012,6 +2027,15 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
return &BI;
}
+ // If the condition is irrelevant, remove the use so that other
+ // transforms on the condition become more effective.
+ if (BI.isConditional() &&
+ BI.getSuccessor(0) == BI.getSuccessor(1) &&
+ !isa<UndefValue>(BI.getCondition())) {
+ BI.setCondition(UndefValue::get(BI.getCondition()->getType()));
+ return &BI;
+ }
+
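In IR terms, the block just added rewrites br i1 %cond, label %bb, label %bb into br i1 undef, label %bb, label %bb, dropping the last use of %cond. The same rule as a standalone sketch (helper name ours):

#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Replace the condition of a branch whose successors are identical, so
// other transforms see one fewer use of the condition value.
static bool dropIrrelevantBranchCondition(BranchInst &BI) {
  if (!BI.isConditional() || BI.getSuccessor(0) != BI.getSuccessor(1) ||
      isa<UndefValue>(BI.getCondition()))
    return false;
  BI.setCondition(UndefValue::get(BI.getCondition()->getType()));
  return true;
}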
// Canonicalize fcmp_one -> fcmp_oeq
FCmpInst::Predicate FPred; Value *Y;
if (match(&BI, m_Br(m_FCmp(FPred, m_Value(X), m_Value(Y)),
@@ -2051,7 +2075,7 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
Value *Cond = SI.getCondition();
unsigned BitWidth = cast<IntegerType>(Cond->getType())->getBitWidth();
APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
- computeKnownBits(Cond, KnownZero, KnownOne);
+ computeKnownBits(Cond, KnownZero, KnownOne, 0, &SI);
unsigned LeadingKnownZeros = KnownZero.countLeadingOnes();
unsigned LeadingKnownOnes = KnownOne.countLeadingOnes();
@@ -2070,8 +2094,8 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
// x86 generates redundant zero-extension instructions if the operand is
// truncated to i8 or i16.
bool TruncCond = false;
- if (DL && BitWidth > NewWidth &&
- NewWidth >= DL->getLargestLegalIntTypeSize()) {
+ if (NewWidth > 0 && BitWidth > NewWidth &&
+ NewWidth >= DL.getLargestLegalIntTypeSize()) {
TruncCond = true;
IntegerType *Ty = IntegerType::get(SI.getContext(), NewWidth);
Builder->SetInsertPoint(&SI);
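A worked instance of the width logic above, with assumed known-bits results. The pass derives NewWidth as BitWidth minus the longer run of known leading bits, and the guard only fires when that width is still at least the target's largest legal integer size:

#include <algorithm>
#include <cassert>

int main() {
  // Say computeKnownBits proves the top 64 bits of an i128 switch
  // condition are zero.
  unsigned BitWidth = 128, LeadingKnownZeros = 64, LeadingKnownOnes = 0;
  unsigned NewWidth = BitWidth - std::max(LeadingKnownZeros, LeadingKnownOnes);
  // NewWidth == 64 passes the largest-legal-integer guard on a 64-bit
  // target, so the condition and case constants get truncated to i64.
  assert(NewWidth == 64 && BitWidth > NewWidth);
  return 0;
}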
@@ -2632,7 +2656,7 @@ bool InstCombiner::run() {
}
// Instruction isn't dead, see if we can constant propagate it.
- if (!I->use_empty() && isa<Constant>(I->getOperand(0)))
+ if (!I->use_empty() && isa<Constant>(I->getOperand(0))) {
if (Constant *C = ConstantFoldInstruction(I, DL, TLI)) {
DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *I << '\n');
@@ -2643,6 +2667,7 @@ bool InstCombiner::run() {
MadeIRChange = true;
continue;
}
+ }
// See if we can trivially sink this instruction to a successor basic block.
if (I->hasOneUse()) {
@@ -2756,10 +2781,9 @@ bool InstCombiner::run() {
/// many instructions are dead or constant). Additionally, if we find a branch
/// whose condition is a known constant, we only visit the reachable successors.
///
-static bool AddReachableCodeToWorklist(BasicBlock *BB,
- SmallPtrSetImpl<BasicBlock*> &Visited,
+static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
+ SmallPtrSetImpl<BasicBlock *> &Visited,
InstCombineWorklist &ICWorklist,
- const DataLayout *DL,
const TargetLibraryInfo *TLI) {
bool MadeIRChange = false;
SmallVector<BasicBlock*, 256> Worklist;
@@ -2797,23 +2821,22 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB,
continue;
}
- if (DL) {
- // See if we can constant fold its operands.
- for (User::op_iterator i = Inst->op_begin(), e = Inst->op_end();
- i != e; ++i) {
- ConstantExpr *CE = dyn_cast<ConstantExpr>(i);
- if (CE == nullptr) continue;
+ // See if we can constant fold its operands.
+ for (User::op_iterator i = Inst->op_begin(), e = Inst->op_end(); i != e;
+ ++i) {
+ ConstantExpr *CE = dyn_cast<ConstantExpr>(i);
+ if (CE == nullptr)
+ continue;
- Constant*& FoldRes = FoldedConstants[CE];
- if (!FoldRes)
- FoldRes = ConstantFoldConstantExpression(CE, DL, TLI);
- if (!FoldRes)
- FoldRes = CE;
+ Constant *&FoldRes = FoldedConstants[CE];
+ if (!FoldRes)
+ FoldRes = ConstantFoldConstantExpression(CE, DL, TLI);
+ if (!FoldRes)
+ FoldRes = CE;
- if (FoldRes != CE) {
- *i = FoldRes;
- MadeIRChange = true;
- }
+ if (FoldRes != CE) {
+ *i = FoldRes;
+ MadeIRChange = true;
}
}
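The FoldedConstants map above implements fold-once, reuse-everywhere caching, since the same ConstantExpr commonly appears as an operand of many instructions. The pattern in a generic, standalone form (a sketch, not the patch's code):

#include <unordered_map>

// Fold each distinct key exactly once; repeated lookups hit the cache.
template <typename K, typename V, typename F>
const V &foldOnce(std::unordered_map<K, V> &Cache, const K &Key, F Fold) {
  auto It = Cache.find(Key);
  if (It == Cache.end())
    It = Cache.emplace(Key, Fold(Key)).first;
  return It->second;
}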
@@ -2867,7 +2890,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB,
///
/// This also does basic constant propagation and other forward fixing to make
/// the combiner itself run much faster.
-static bool prepareICWorklistFromFunction(Function &F, const DataLayout *DL,
+static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,
TargetLibraryInfo *TLI,
InstCombineWorklist &ICWorklist) {
bool MadeIRChange = false;
@@ -2877,7 +2900,7 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout *DL,
// track of which blocks we visit.
SmallPtrSet<BasicBlock *, 64> Visited;
MadeIRChange |=
- AddReachableCodeToWorklist(F.begin(), Visited, ICWorklist, DL, TLI);
+ AddReachableCodeToWorklist(F.begin(), DL, Visited, ICWorklist, TLI);
// Do a quick scan over the function. If we find any blocks that are
// unreachable, remove any instructions inside of them. This prevents
@@ -2910,12 +2933,13 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout *DL,
return MadeIRChange;
}
-static bool combineInstructionsOverFunction(
- Function &F, InstCombineWorklist &Worklist, AssumptionCache &AC,
- TargetLibraryInfo &TLI, DominatorTree &DT, const DataLayout *DL = nullptr,
- LoopInfo *LI = nullptr) {
+static bool
+combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist,
+ AssumptionCache &AC, TargetLibraryInfo &TLI,
+ DominatorTree &DT, LoopInfo *LI = nullptr) {
// Minimizing size?
bool MinimizeSize = F.hasFnAttribute(Attribute::MinSize);
+ auto &DL = F.getParent()->getDataLayout();
/// Builder - This is an IRBuilder that automatically inserts new
/// instructions into the worklist when they are created.
@@ -2950,15 +2974,13 @@ static bool combineInstructionsOverFunction(
PreservedAnalyses InstCombinePass::run(Function &F,
AnalysisManager<Function> *AM) {
- auto *DL = F.getParent()->getDataLayout();
-
auto &AC = AM->getResult<AssumptionAnalysis>(F);
auto &DT = AM->getResult<DominatorTreeAnalysis>(F);
auto &TLI = AM->getResult<TargetLibraryAnalysis>(F);
auto *LI = AM->getCachedResult<LoopAnalysis>(F);
- if (!combineInstructionsOverFunction(F, Worklist, AC, TLI, DT, DL, LI))
+ if (!combineInstructionsOverFunction(F, Worklist, AC, TLI, DT, LI))
// No changes, all analyses are preserved.
return PreservedAnalyses::all();
@@ -3007,12 +3029,10 @@ bool InstructionCombiningPass::runOnFunction(Function &F) {
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
// Optional analyses.
- auto *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- auto *DL = DLP ? &DLP->getDataLayout() : nullptr;
auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
- return combineInstructionsOverFunction(F, Worklist, AC, TLI, DT, DL, LI);
+ return combineInstructionsOverFunction(F, Worklist, AC, TLI, DT, LI);
}
char InstructionCombiningPass::ID = 0;
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 882aab0..978c857 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -24,6 +24,9 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DataLayout.h"
@@ -43,12 +46,14 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/SwapByteOrder.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/ASanStackFrameLayout.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include <algorithm>
#include <string>
#include <system_error>
@@ -70,17 +75,15 @@ static const uint64_t kFreeBSD_ShadowOffset32 = 1ULL << 30;
static const uint64_t kFreeBSD_ShadowOffset64 = 1ULL << 46;
static const uint64_t kWindowsShadowOffset32 = 3ULL << 28;
-static const size_t kMinStackMallocSize = 1 << 6; // 64B
+static const size_t kMinStackMallocSize = 1 << 6; // 64B
static const size_t kMaxStackMallocSize = 1 << 16; // 64K
static const uintptr_t kCurrentStackFrameMagic = 0x41B58AB3;
static const uintptr_t kRetiredStackFrameMagic = 0x45E0360E;
static const char *const kAsanModuleCtorName = "asan.module_ctor";
static const char *const kAsanModuleDtorName = "asan.module_dtor";
-static const uint64_t kAsanCtorAndDtorPriority = 1;
+static const uint64_t kAsanCtorAndDtorPriority = 1;
static const char *const kAsanReportErrorTemplate = "__asan_report_";
-static const char *const kAsanReportLoadN = "__asan_report_load_n";
-static const char *const kAsanReportStoreN = "__asan_report_store_n";
static const char *const kAsanRegisterGlobalsName = "__asan_register_globals";
static const char *const kAsanUnregisterGlobalsName =
"__asan_unregister_globals";
@@ -90,7 +93,7 @@ static const char *const kAsanInitName = "__asan_init_v5";
static const char *const kAsanPtrCmp = "__sanitizer_ptr_cmp";
static const char *const kAsanPtrSub = "__sanitizer_ptr_sub";
static const char *const kAsanHandleNoReturnName = "__asan_handle_no_return";
-static const int kMaxAsanStackMallocSizeClass = 10;
+static const int kMaxAsanStackMallocSizeClass = 10;
static const char *const kAsanStackMallocNameTemplate = "__asan_stack_malloc_";
static const char *const kAsanStackFreeNameTemplate = "__asan_stack_free_";
static const char *const kAsanGenPrefix = "__asan_gen_";
@@ -103,10 +106,6 @@ static const char *const kAsanUnpoisonStackMemoryName =
static const char *const kAsanOptionDetectUAR =
"__asan_option_detect_stack_use_after_return";
-#ifndef NDEBUG
-static const int kAsanStackAfterReturnMagic = 0xf5;
-#endif
-
// Access sizes are powers of two: 1, 2, 4, 8, 16.
static const size_t kNumberOfAccessSizes = 5;
@@ -120,84 +119,110 @@ static const unsigned kAsanAllocaPartialVal2 = 0x000000cbU;
// This flag may need to be replaced with -f[no-]asan-reads.
static cl::opt<bool> ClInstrumentReads("asan-instrument-reads",
- cl::desc("instrument read instructions"), cl::Hidden, cl::init(true));
-static cl::opt<bool> ClInstrumentWrites("asan-instrument-writes",
- cl::desc("instrument write instructions"), cl::Hidden, cl::init(true));
-static cl::opt<bool> ClInstrumentAtomics("asan-instrument-atomics",
- cl::desc("instrument atomic instructions (rmw, cmpxchg)"),
- cl::Hidden, cl::init(true));
-static cl::opt<bool> ClAlwaysSlowPath("asan-always-slow-path",
- cl::desc("use instrumentation with slow path for all accesses"),
- cl::Hidden, cl::init(false));
+ cl::desc("instrument read instructions"),
+ cl::Hidden, cl::init(true));
+static cl::opt<bool> ClInstrumentWrites(
+ "asan-instrument-writes", cl::desc("instrument write instructions"),
+ cl::Hidden, cl::init(true));
+static cl::opt<bool> ClInstrumentAtomics(
+ "asan-instrument-atomics",
+ cl::desc("instrument atomic instructions (rmw, cmpxchg)"), cl::Hidden,
+ cl::init(true));
+static cl::opt<bool> ClAlwaysSlowPath(
+ "asan-always-slow-path",
+ cl::desc("use instrumentation with slow path for all accesses"), cl::Hidden,
+ cl::init(false));
// This flag limits the number of instructions to be instrumented
// in any given BB. Normally, this should be set to unlimited (INT_MAX),
// but due to http://llvm.org/bugs/show_bug.cgi?id=12652 we temporarily
// set it to 10000.
-static cl::opt<int> ClMaxInsnsToInstrumentPerBB("asan-max-ins-per-bb",
- cl::init(10000),
- cl::desc("maximal number of instructions to instrument in any given BB"),
- cl::Hidden);
+static cl::opt<int> ClMaxInsnsToInstrumentPerBB(
+ "asan-max-ins-per-bb", cl::init(10000),
+ cl::desc("maximal number of instructions to instrument in any given BB"),
+ cl::Hidden);
// This flag may need to be replaced with -f[no]asan-stack.
-static cl::opt<bool> ClStack("asan-stack",
- cl::desc("Handle stack memory"), cl::Hidden, cl::init(true));
+static cl::opt<bool> ClStack("asan-stack", cl::desc("Handle stack memory"),
+ cl::Hidden, cl::init(true));
static cl::opt<bool> ClUseAfterReturn("asan-use-after-return",
- cl::desc("Check return-after-free"), cl::Hidden, cl::init(true));
+ cl::desc("Check return-after-free"),
+ cl::Hidden, cl::init(true));
// This flag may need to be replaced with -f[no]asan-globals.
static cl::opt<bool> ClGlobals("asan-globals",
- cl::desc("Handle global objects"), cl::Hidden, cl::init(true));
+ cl::desc("Handle global objects"), cl::Hidden,
+ cl::init(true));
static cl::opt<bool> ClInitializers("asan-initialization-order",
- cl::desc("Handle C++ initializer order"), cl::Hidden, cl::init(true));
-static cl::opt<bool> ClInvalidPointerPairs("asan-detect-invalid-pointer-pair",
- cl::desc("Instrument <, <=, >, >=, - with pointer operands"),
- cl::Hidden, cl::init(false));
-static cl::opt<unsigned> ClRealignStack("asan-realign-stack",
- cl::desc("Realign stack to the value of this flag (power of two)"),
- cl::Hidden, cl::init(32));
+ cl::desc("Handle C++ initializer order"),
+ cl::Hidden, cl::init(true));
+static cl::opt<bool> ClInvalidPointerPairs(
+ "asan-detect-invalid-pointer-pair",
+ cl::desc("Instrument <, <=, >, >=, - with pointer operands"), cl::Hidden,
+ cl::init(false));
+static cl::opt<unsigned> ClRealignStack(
+ "asan-realign-stack",
+ cl::desc("Realign stack to the value of this flag (power of two)"),
+ cl::Hidden, cl::init(32));
static cl::opt<int> ClInstrumentationWithCallsThreshold(
"asan-instrumentation-with-call-threshold",
- cl::desc("If the function being instrumented contains more than "
- "this number of memory accesses, use callbacks instead of "
- "inline checks (-1 means never use callbacks)."),
- cl::Hidden, cl::init(7000));
+ cl::desc(
+ "If the function being instrumented contains more than "
+ "this number of memory accesses, use callbacks instead of "
+ "inline checks (-1 means never use callbacks)."),
+ cl::Hidden, cl::init(7000));
static cl::opt<std::string> ClMemoryAccessCallbackPrefix(
- "asan-memory-access-callback-prefix",
- cl::desc("Prefix for memory access callbacks"), cl::Hidden,
- cl::init("__asan_"));
+ "asan-memory-access-callback-prefix",
+ cl::desc("Prefix for memory access callbacks"), cl::Hidden,
+ cl::init("__asan_"));
static cl::opt<bool> ClInstrumentAllocas("asan-instrument-allocas",
- cl::desc("instrument dynamic allocas"), cl::Hidden, cl::init(false));
+ cl::desc("instrument dynamic allocas"),
+ cl::Hidden, cl::init(false));
+static cl::opt<bool> ClSkipPromotableAllocas(
+ "asan-skip-promotable-allocas",
+ cl::desc("Do not instrument promotable allocas"), cl::Hidden,
+ cl::init(true));
// These flags allow to change the shadow mapping.
// The shadow mapping looks like
// Shadow = (Mem >> scale) + (1 << offset_log)
static cl::opt<int> ClMappingScale("asan-mapping-scale",
- cl::desc("scale of asan shadow mapping"), cl::Hidden, cl::init(0));
+ cl::desc("scale of asan shadow mapping"),
+ cl::Hidden, cl::init(0));
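The mapping rule in the comment above, written out as code; one shadow byte describes 2^Scale bytes of application memory. A sketch for illustration, not code from this patch:

#include <cstdint>

// Shadow = (Mem >> Scale) + Offset, where Offset is (1 << offset_log).
uint64_t memToShadow(uint64_t Mem, unsigned Scale, uint64_t Offset) {
  return (Mem >> Scale) + Offset;
}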
// Optimization flags. Not user visible, used mostly for testing
// and benchmarking the tool.
-static cl::opt<bool> ClOpt("asan-opt",
- cl::desc("Optimize instrumentation"), cl::Hidden, cl::init(true));
-static cl::opt<bool> ClOptSameTemp("asan-opt-same-temp",
- cl::desc("Instrument the same temp just once"), cl::Hidden,
- cl::init(true));
+static cl::opt<bool> ClOpt("asan-opt", cl::desc("Optimize instrumentation"),
+ cl::Hidden, cl::init(true));
+static cl::opt<bool> ClOptSameTemp(
+ "asan-opt-same-temp", cl::desc("Instrument the same temp just once"),
+ cl::Hidden, cl::init(true));
static cl::opt<bool> ClOptGlobals("asan-opt-globals",
- cl::desc("Don't instrument scalar globals"), cl::Hidden, cl::init(true));
+ cl::desc("Don't instrument scalar globals"),
+ cl::Hidden, cl::init(true));
+static cl::opt<bool> ClOptStack(
+ "asan-opt-stack", cl::desc("Don't instrument scalar stack variables"),
+ cl::Hidden, cl::init(false));
-static cl::opt<bool> ClCheckLifetime("asan-check-lifetime",
- cl::desc("Use llvm.lifetime intrinsics to insert extra checks"),
- cl::Hidden, cl::init(false));
+static cl::opt<bool> ClCheckLifetime(
+ "asan-check-lifetime",
+ cl::desc("Use llvm.lifetime intrinsics to insert extra checks"), cl::Hidden,
+ cl::init(false));
static cl::opt<bool> ClDynamicAllocaStack(
"asan-stack-dynamic-alloca",
cl::desc("Use dynamic alloca to represent stack variables"), cl::Hidden,
cl::init(true));
+static cl::opt<uint32_t> ClForceExperiment(
+ "asan-force-experiment",
+ cl::desc("Force optimization experiment (for testing)"), cl::Hidden,
+ cl::init(0));
+
// Debug flags.
static cl::opt<int> ClDebug("asan-debug", cl::desc("debug"), cl::Hidden,
cl::init(0));
static cl::opt<int> ClDebugStack("asan-debug-stack", cl::desc("debug stack"),
cl::Hidden, cl::init(0));
-static cl::opt<std::string> ClDebugFunc("asan-debug-func",
- cl::Hidden, cl::desc("Debug func"));
+static cl::opt<std::string> ClDebugFunc("asan-debug-func", cl::Hidden,
+ cl::desc("Debug func"));
static cl::opt<int> ClDebugMin("asan-debug-min", cl::desc("Debug min inst"),
cl::Hidden, cl::init(-1));
static cl::opt<int> ClDebugMax("asan-debug-max", cl::desc("Debug max inst"),
@@ -207,10 +232,10 @@ STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
STATISTIC(NumInstrumentedDynamicAllocas,
"Number of instrumented dynamic allocas");
-STATISTIC(NumOptimizedAccessesToGlobalArray,
- "Number of optimized accesses to global arrays");
STATISTIC(NumOptimizedAccessesToGlobalVar,
"Number of optimized accesses to global vars");
+STATISTIC(NumOptimizedAccessesToStackVar,
+ "Number of optimized accesses to stack vars");
namespace {
/// Frontend-provided metadata for source location.
@@ -238,9 +263,7 @@ struct LocationMetadata {
class GlobalsMetadata {
public:
struct Entry {
- Entry()
- : SourceLoc(), Name(), IsDynInit(false),
- IsBlacklisted(false) {}
+ Entry() : SourceLoc(), Name(), IsDynInit(false), IsBlacklisted(false) {}
LocationMetadata SourceLoc;
StringRef Name;
bool IsDynInit;
@@ -249,19 +272,17 @@ class GlobalsMetadata {
GlobalsMetadata() : inited_(false) {}
- void init(Module& M) {
+ void init(Module &M) {
assert(!inited_);
inited_ = true;
NamedMDNode *Globals = M.getNamedMetadata("llvm.asan.globals");
- if (!Globals)
- return;
+ if (!Globals) return;
for (auto MDN : Globals->operands()) {
// Metadata node contains the global and the fields of "Entry".
assert(MDN->getNumOperands() == 5);
auto *GV = mdconst::extract_or_null<GlobalVariable>(MDN->getOperand(0));
// The optimizer may optimize away a global entirely.
- if (!GV)
- continue;
+ if (!GV) continue;
// We can already have an entry for GV if it was merged with another
// global.
Entry &E = Entries[GV];
@@ -286,7 +307,7 @@ class GlobalsMetadata {
private:
bool inited_;
- DenseMap<GlobalVariable*, Entry> Entries;
+ DenseMap<GlobalVariable *, Entry> Entries;
};
/// This struct defines the shadow mapping using the rule:
@@ -371,17 +392,36 @@ struct AddressSanitizer : public FunctionPass {
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
}
- void instrumentMop(Instruction *I, bool UseCalls);
+ uint64_t getAllocaSizeInBytes(AllocaInst *AI) const {
+ Type *Ty = AI->getAllocatedType();
+ uint64_t SizeInBytes =
+ AI->getModule()->getDataLayout().getTypeAllocSize(Ty);
+ return SizeInBytes;
+ }
+ /// Check if we want (and can) handle this alloca.
+ bool isInterestingAlloca(AllocaInst &AI) const;
+ /// If it is an interesting memory access, return the PointerOperand
+ /// and set IsWrite/Alignment. Otherwise return nullptr.
+ Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite,
+ uint64_t *TypeSize,
+ unsigned *Alignment) const;
+ void instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis, Instruction *I,
+ bool UseCalls, const DataLayout &DL);
void instrumentPointerComparisonOrSubtraction(Instruction *I);
void instrumentAddress(Instruction *OrigIns, Instruction *InsertBefore,
Value *Addr, uint32_t TypeSize, bool IsWrite,
- Value *SizeArgument, bool UseCalls);
+ Value *SizeArgument, bool UseCalls, uint32_t Exp);
+ void instrumentUnusualSizeOrAlignment(Instruction *I, Value *Addr,
+ uint32_t TypeSize, bool IsWrite,
+ Value *SizeArgument, bool UseCalls,
+ uint32_t Exp);
Value *createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong,
Value *ShadowValue, uint32_t TypeSize);
Instruction *generateCrashCode(Instruction *InsertBefore, Value *Addr,
bool IsWrite, size_t AccessSizeIndex,
- Value *SizeArgument);
+ Value *SizeArgument, uint32_t Exp);
void instrumentMemIntrinsic(MemIntrinsic *MI);
Value *memToShadow(Value *Shadow, IRBuilder<> &IRB);
bool runOnFunction(Function &F) override;
@@ -396,9 +436,10 @@ struct AddressSanitizer : public FunctionPass {
bool LooksLikeCodeInBug11395(Instruction *I);
bool GlobalIsLinkerInitialized(GlobalVariable *G);
+ bool isSafeAccess(ObjectSizeOffsetVisitor &ObjSizeVis, Value *Addr,
+ uint64_t TypeSize) const;
LLVMContext *C;
- const DataLayout *DL;
Triple TargetTriple;
int LongSize;
Type *IntptrTy;
@@ -408,12 +449,12 @@ struct AddressSanitizer : public FunctionPass {
Function *AsanInitFunction;
Function *AsanHandleNoReturnFunc;
Function *AsanPtrCmpFunction, *AsanPtrSubFunction;
- // This array is indexed by AccessIsWrite and log2(AccessSize).
- Function *AsanErrorCallback[2][kNumberOfAccessSizes];
- Function *AsanMemoryAccessCallback[2][kNumberOfAccessSizes];
- // This array is indexed by AccessIsWrite.
- Function *AsanErrorCallbackSized[2],
- *AsanMemoryAccessCallbackSized[2];
+ // This array is indexed by AccessIsWrite, Experiment and log2(AccessSize).
+ Function *AsanErrorCallback[2][2][kNumberOfAccessSizes];
+ Function *AsanMemoryAccessCallback[2][2][kNumberOfAccessSizes];
+ // This array is indexed by AccessIsWrite and Experiment.
+ Function *AsanErrorCallbackSized[2][2];
+ Function *AsanMemoryAccessCallbackSized[2][2];
Function *AsanMemmove, *AsanMemcpy, *AsanMemset;
InlineAsm *EmptyAsm;
GlobalsMetadata GlobalsMD;
@@ -426,9 +467,7 @@ class AddressSanitizerModule : public ModulePass {
AddressSanitizerModule() : ModulePass(ID) {}
bool runOnModule(Module &M) override;
static char ID; // Pass identification, replacement for typeid
- const char *getPassName() const override {
- return "AddressSanitizerModule";
- }
+ const char *getPassName() const override { return "AddressSanitizerModule"; }
private:
void initializeCallbacks(Module &M);
@@ -444,7 +483,6 @@ class AddressSanitizerModule : public ModulePass {
GlobalsMetadata GlobalsMD;
Type *IntptrTy;
LLVMContext *C;
- const DataLayout *DL;
Triple TargetTriple;
ShadowMapping Mapping;
Function *AsanPoisonGlobals;
@@ -471,12 +509,12 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
Type *IntptrPtrTy;
ShadowMapping Mapping;
- SmallVector<AllocaInst*, 16> AllocaVec;
- SmallVector<Instruction*, 8> RetVec;
+ SmallVector<AllocaInst *, 16> AllocaVec;
+ SmallVector<Instruction *, 8> RetVec;
unsigned StackAlignment;
Function *AsanStackMallocFunc[kMaxAsanStackMallocSizeClass + 1],
- *AsanStackFreeFunc[kMaxAsanStackMallocSizeClass + 1];
+ *AsanStackFreeFunc[kMaxAsanStackMallocSizeClass + 1];
Function *AsanPoisonStackMemoryFunc, *AsanUnpoisonStackMemoryFunc;
// Stores a place and arguments of poisoning/unpoisoning call for alloca.
@@ -497,33 +535,38 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
Value *LeftRzAddr;
Value *RightRzAddr;
bool Poison;
- explicit DynamicAllocaCall(AllocaInst *AI,
- Value *LeftRzAddr = nullptr,
- Value *RightRzAddr = nullptr)
- : AI(AI), LeftRzAddr(LeftRzAddr), RightRzAddr(RightRzAddr), Poison(true)
- {}
+ explicit DynamicAllocaCall(AllocaInst *AI, Value *LeftRzAddr = nullptr,
+ Value *RightRzAddr = nullptr)
+ : AI(AI),
+ LeftRzAddr(LeftRzAddr),
+ RightRzAddr(RightRzAddr),
+ Poison(true) {}
};
SmallVector<DynamicAllocaCall, 1> DynamicAllocaVec;
// Maps Value to an AllocaInst from which the Value is originated.
- typedef DenseMap<Value*, AllocaInst*> AllocaForValueMapTy;
+ typedef DenseMap<Value *, AllocaInst *> AllocaForValueMapTy;
AllocaForValueMapTy AllocaForValue;
bool HasNonEmptyInlineAsm;
std::unique_ptr<CallInst> EmptyInlineAsm;
FunctionStackPoisoner(Function &F, AddressSanitizer &ASan)
- : F(F), ASan(ASan), DIB(*F.getParent(), /*AllowUnresolved*/ false),
- C(ASan.C), IntptrTy(ASan.IntptrTy),
- IntptrPtrTy(PointerType::get(IntptrTy, 0)), Mapping(ASan.Mapping),
- StackAlignment(1 << Mapping.Scale), HasNonEmptyInlineAsm(false),
+ : F(F),
+ ASan(ASan),
+ DIB(*F.getParent(), /*AllowUnresolved*/ false),
+ C(ASan.C),
+ IntptrTy(ASan.IntptrTy),
+ IntptrPtrTy(PointerType::get(IntptrTy, 0)),
+ Mapping(ASan.Mapping),
+ StackAlignment(1 << Mapping.Scale),
+ HasNonEmptyInlineAsm(false),
EmptyInlineAsm(CallInst::Create(ASan.EmptyAsm)) {}
bool runOnFunction() {
if (!ClStack) return false;
// Collect alloca, ret, lifetime instructions etc.
- for (BasicBlock *BB : depth_first(&F.getEntryBlock()))
- visit(*BB);
+ for (BasicBlock *BB : depth_first(&F.getEntryBlock())) visit(*BB);
if (AllocaVec.empty() && DynamicAllocaVec.empty()) return false;
@@ -544,33 +587,31 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
// ----------------------- Visitors.
/// \brief Collect all Ret instructions.
- void visitReturnInst(ReturnInst &RI) {
- RetVec.push_back(&RI);
- }
+ void visitReturnInst(ReturnInst &RI) { RetVec.push_back(&RI); }
// Unpoison dynamic allocas redzones.
void unpoisonDynamicAlloca(DynamicAllocaCall &AllocaCall) {
- if (!AllocaCall.Poison)
- return;
+ if (!AllocaCall.Poison) return;
for (auto Ret : RetVec) {
IRBuilder<> IRBRet(Ret);
PointerType *Int32PtrTy = PointerType::getUnqual(IRBRet.getInt32Ty());
Value *Zero = Constant::getNullValue(IRBRet.getInt32Ty());
Value *PartialRzAddr = IRBRet.CreateSub(AllocaCall.RightRzAddr,
ConstantInt::get(IntptrTy, 4));
- IRBRet.CreateStore(Zero, IRBRet.CreateIntToPtr(AllocaCall.LeftRzAddr,
- Int32PtrTy));
- IRBRet.CreateStore(Zero, IRBRet.CreateIntToPtr(PartialRzAddr,
- Int32PtrTy));
- IRBRet.CreateStore(Zero, IRBRet.CreateIntToPtr(AllocaCall.RightRzAddr,
- Int32PtrTy));
+ IRBRet.CreateStore(
+ Zero, IRBRet.CreateIntToPtr(AllocaCall.LeftRzAddr, Int32PtrTy));
+ IRBRet.CreateStore(Zero,
+ IRBRet.CreateIntToPtr(PartialRzAddr, Int32PtrTy));
+ IRBRet.CreateStore(
+ Zero, IRBRet.CreateIntToPtr(AllocaCall.RightRzAddr, Int32PtrTy));
}
}
// Right shift for BigEndian and left shift for LittleEndian.
Value *shiftAllocaMagic(Value *Val, IRBuilder<> &IRB, Value *Shift) {
- return ASan.DL->isLittleEndian() ? IRB.CreateShl(Val, Shift)
- : IRB.CreateLShr(Val, Shift);
+ auto &DL = F.getParent()->getDataLayout();
+ return DL.isLittleEndian() ? IRB.CreateShl(Val, Shift)
+ : IRB.CreateLShr(Val, Shift);
}
// Compute PartialRzMagic for dynamic alloca call. Since we don't know the
@@ -599,7 +640,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
/// \brief Collect Alloca instructions we want (and can) handle.
void visitAllocaInst(AllocaInst &AI) {
- if (!isInterestingAlloca(AI)) return;
+ if (!ASan.isInterestingAlloca(AI)) return;
StackAlignment = std::max(StackAlignment, AI.getAlignment());
if (isDynamicAlloca(AI))
@@ -613,8 +654,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
void visitIntrinsicInst(IntrinsicInst &II) {
if (!ClCheckLifetime) return;
Intrinsic::ID ID = II.getIntrinsicID();
- if (ID != Intrinsic::lifetime_start &&
- ID != Intrinsic::lifetime_end)
+ if (ID != Intrinsic::lifetime_start && ID != Intrinsic::lifetime_end)
return;
// Found lifetime intrinsic, add ASan instrumentation if necessary.
ConstantInt *Size = dyn_cast<ConstantInt>(II.getArgOperand(0));
@@ -644,8 +684,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
bool doesDominateAllExits(const Instruction *I) const {
for (auto Ret : RetVec) {
- if (!ASan.getDominatorTree().dominates(I, Ret))
- return false;
+ if (!ASan.getDominatorTree().dominates(I, Ret)) return false;
}
return true;
}
@@ -653,19 +692,6 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
bool isDynamicAlloca(AllocaInst &AI) const {
return AI.isArrayAllocation() || !AI.isStaticAlloca();
}
-
- // Check if we want (and can) handle this alloca.
- bool isInterestingAlloca(AllocaInst &AI) const {
- return (AI.getAllocatedType()->isSized() &&
- // alloca() may be called with 0 size, ignore it.
- getAllocaSizeInBytes(&AI) > 0);
- }
-
- uint64_t getAllocaSizeInBytes(AllocaInst *AI) const {
- Type *Ty = AI->getAllocatedType();
- uint64_t SizeInBytes = ASan.DL->getTypeAllocSize(Ty);
- return SizeInBytes;
- }
/// Finds alloca where the value comes from.
AllocaInst *findAllocaForValue(Value *V);
void poisonRedZones(ArrayRef<uint8_t> ShadowBytes, IRBuilder<> &IRB,
@@ -683,21 +709,25 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
} // namespace
char AddressSanitizer::ID = 0;
-INITIALIZE_PASS_BEGIN(AddressSanitizer, "asan",
- "AddressSanitizer: detects use-after-free and out-of-bounds bugs.",
- false, false)
+INITIALIZE_PASS_BEGIN(
+ AddressSanitizer, "asan",
+ "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false,
+ false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(AddressSanitizer, "asan",
- "AddressSanitizer: detects use-after-free and out-of-bounds bugs.",
- false, false)
+INITIALIZE_PASS_END(
+ AddressSanitizer, "asan",
+ "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false,
+ false)
FunctionPass *llvm::createAddressSanitizerFunctionPass() {
return new AddressSanitizer();
}
char AddressSanitizerModule::ID = 0;
-INITIALIZE_PASS(AddressSanitizerModule, "asan-module",
+INITIALIZE_PASS(
+ AddressSanitizerModule, "asan-module",
"AddressSanitizer: detects use-after-free and out-of-bounds bugs."
- "ModulePass", false, false)
+ "ModulePass",
+ false, false)
ModulePass *llvm::createAddressSanitizerModulePass() {
return new AddressSanitizerModule();
}
@@ -709,16 +739,15 @@ static size_t TypeSizeToSizeIndex(uint32_t TypeSize) {
}
// \brief Create a constant for Str so that we can pass it to the run-time lib.
-static GlobalVariable *createPrivateGlobalForString(
- Module &M, StringRef Str, bool AllowMerging) {
+static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str,
+ bool AllowMerging) {
Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str);
// We use private linkage for module-local strings. If they can be merged
// with another one, we set the unnamed_addr attribute.
GlobalVariable *GV =
new GlobalVariable(M, StrConst->getType(), true,
GlobalValue::PrivateLinkage, StrConst, kAsanGenPrefix);
- if (AllowMerging)
- GV->setUnnamedAddr(true);
+ if (AllowMerging) GV->setUnnamedAddr(true);
GV->setAlignment(1); // Strings may not be merged w/o setting align 1.
return GV;
}
@@ -747,8 +776,7 @@ static bool GlobalWasGeneratedByAsan(GlobalVariable *G) {
Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) {
// Shadow >> scale
Shadow = IRB.CreateLShr(Shadow, Mapping.Scale);
- if (Mapping.Offset == 0)
- return Shadow;
+ if (Mapping.Offset == 0) return Shadow;
// (Shadow >> scale) | offset
if (Mapping.OrShadowOffset)
return IRB.CreateOr(Shadow, ConstantInt::get(IntptrTy, Mapping.Offset));
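Why CreateOr may replace the addition here: when the shifted address and the offset share no set bits, OR and ADD produce the same value, and OR is easier to fold on some targets. A self-contained check, with an address and offset assumed for illustration:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Mem = 0x12345, Scale = 3, Offset = 0x7fff8000;
  uint64_t Shifted = Mem >> Scale;                 // 0x2468
  assert((Shifted & Offset) == 0);                 // no overlapping bits
  assert((Shifted | Offset) == Shifted + Offset);  // so OR == ADD
  return 0;
}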
@@ -775,38 +803,61 @@ void AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) {
MI->eraseFromParent();
}
-// If I is an interesting memory access, return the PointerOperand
-// and set IsWrite/Alignment. Otherwise return nullptr.
-static Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite,
- unsigned *Alignment) {
+/// Check if we want (and can) handle this alloca.
+bool AddressSanitizer::isInterestingAlloca(AllocaInst &AI) const {
+ return (AI.getAllocatedType()->isSized() &&
+ // alloca() may be called with 0 size, ignore it.
+ getAllocaSizeInBytes(&AI) > 0 &&
+ // We are only interested in allocas not promotable to registers.
+ // Promotable allocas are common under -O0.
+ (!ClSkipPromotableAllocas || !isAllocaPromotable(&AI)));
+}
+
+/// If I is an interesting memory access, return the PointerOperand
+/// and set IsWrite/Alignment. Otherwise return nullptr.
+Value *AddressSanitizer::isInterestingMemoryAccess(Instruction *I,
+ bool *IsWrite,
+ uint64_t *TypeSize,
+ unsigned *Alignment) const {
// Skip memory accesses inserted by another instrumentation.
- if (I->getMetadata("nosanitize"))
- return nullptr;
+ if (I->getMetadata("nosanitize")) return nullptr;
+
+ Value *PtrOperand = nullptr;
+ const DataLayout &DL = I->getModule()->getDataLayout();
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
if (!ClInstrumentReads) return nullptr;
*IsWrite = false;
+ *TypeSize = DL.getTypeStoreSizeInBits(LI->getType());
*Alignment = LI->getAlignment();
- return LI->getPointerOperand();
- }
- if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ PtrOperand = LI->getPointerOperand();
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
if (!ClInstrumentWrites) return nullptr;
*IsWrite = true;
+ *TypeSize = DL.getTypeStoreSizeInBits(SI->getValueOperand()->getType());
*Alignment = SI->getAlignment();
- return SI->getPointerOperand();
- }
- if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
+ PtrOperand = SI->getPointerOperand();
+ } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
if (!ClInstrumentAtomics) return nullptr;
*IsWrite = true;
+ *TypeSize = DL.getTypeStoreSizeInBits(RMW->getValOperand()->getType());
*Alignment = 0;
- return RMW->getPointerOperand();
- }
- if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) {
+ PtrOperand = RMW->getPointerOperand();
+ } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) {
if (!ClInstrumentAtomics) return nullptr;
*IsWrite = true;
+ *TypeSize = DL.getTypeStoreSizeInBits(XCHG->getCompareOperand()->getType());
*Alignment = 0;
- return XCHG->getPointerOperand();
+ PtrOperand = XCHG->getPointerOperand();
}
- return nullptr;
+
+ // Treat memory accesses to promotable allocas as non-interesting since they
+ // will not cause memory violations. This greatly speeds up the instrumented
+ // executable at -O0.
+ if (ClSkipPromotableAllocas)
+ if (auto AI = dyn_cast_or_null<AllocaInst>(PtrOperand))
+ return isInterestingAlloca(*AI) ? AI : nullptr;
+
+ return PtrOperand;
}
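The promotable-alloca filter at the end of this function, condensed into a standalone predicate; isAllocaPromotable is the real utility from PromoteMemToReg.h, while the helper name is ours:

#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
using namespace llvm;

// Accesses through an alloca that mem2reg could promote will not cause
// memory violations, so skipping them is safe and removes most of the
// instrumentation traffic at -O0.
static bool accessIsToPromotableAlloca(Value *PtrOperand) {
  if (auto *AI = dyn_cast_or_null<AllocaInst>(PtrOperand))
    return isAllocaPromotable(AI);
  return false;
}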
static bool isPointerOperand(Value *V) {
@@ -818,17 +869,15 @@ static bool isPointerOperand(Value *V) {
// the frontend.
static bool isInterestingPointerComparisonOrSubtraction(Instruction *I) {
if (ICmpInst *Cmp = dyn_cast<ICmpInst>(I)) {
- if (!Cmp->isRelational())
- return false;
+ if (!Cmp->isRelational()) return false;
} else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
- if (BO->getOpcode() != Instruction::Sub)
- return false;
+ if (BO->getOpcode() != Instruction::Sub) return false;
} else {
return false;
}
if (!isPointerOperand(I->getOperand(0)) ||
!isPointerOperand(I->getOperand(1)))
- return false;
+ return false;
return true;
}
@@ -839,8 +888,8 @@ bool AddressSanitizer::GlobalIsLinkerInitialized(GlobalVariable *G) {
return G->hasInitializer() && !GlobalsMD.get(G).IsDynInit;
}
-void
-AddressSanitizer::instrumentPointerComparisonOrSubtraction(Instruction *I) {
+void AddressSanitizer::instrumentPointerComparisonOrSubtraction(
+ Instruction *I) {
IRBuilder<> IRB(I);
Function *F = isa<ICmpInst>(I) ? AsanPtrCmpFunction : AsanPtrSubFunction;
Value *Param[2] = {I->getOperand(0), I->getOperand(1)};
@@ -851,38 +900,47 @@ AddressSanitizer::instrumentPointerComparisonOrSubtraction(Instruction *I) {
IRB.CreateCall2(F, Param[0], Param[1]);
}
-void AddressSanitizer::instrumentMop(Instruction *I, bool UseCalls) {
+void AddressSanitizer::instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis,
+ Instruction *I, bool UseCalls,
+ const DataLayout &DL) {
bool IsWrite = false;
unsigned Alignment = 0;
- Value *Addr = isInterestingMemoryAccess(I, &IsWrite, &Alignment);
+ uint64_t TypeSize = 0;
+ Value *Addr = isInterestingMemoryAccess(I, &IsWrite, &TypeSize, &Alignment);
assert(Addr);
+
+ // Optimization experiments.
+ // The experiments can be used to evaluate potential optimizations that remove
+ // instrumentation (assess false negatives). Instead of completely removing
+ // some instrumentation, you set Exp to a non-zero value (mask of optimization
+ // experiments that want to remove instrumentation of this instruction).
+ // If Exp is non-zero, this pass will emit special calls into runtime
+ // (e.g. __asan_report_exp_load1 instead of __asan_report_load1). These calls
+ // make the runtime terminate the program in a special way (with a different
+ // exit status). Then you run the new compiler on a buggy corpus, collect
+ // the special terminations (ideally, you don't see them at all -- no false
+ // negatives) and make the decision on the optimization.
+ uint32_t Exp = ClForceExperiment;
+
if (ClOpt && ClOptGlobals) {
- if (GlobalVariable *G = dyn_cast<GlobalVariable>(Addr)) {
- // If initialization order checking is disabled, a simple access to a
- // dynamically initialized global is always valid.
- if (!ClInitializers || GlobalIsLinkerInitialized(G)) {
- NumOptimizedAccessesToGlobalVar++;
- return;
- }
- }
- ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr);
- if (CE && CE->isGEPWithNoNotionalOverIndexing()) {
- if (GlobalVariable *G = dyn_cast<GlobalVariable>(CE->getOperand(0))) {
- if (CE->getOperand(1)->isNullValue() && GlobalIsLinkerInitialized(G)) {
- NumOptimizedAccessesToGlobalArray++;
- return;
- }
- }
+ // If initialization order checking is disabled, a simple access to a
+ // dynamically initialized global is always valid.
+ GlobalVariable *G = dyn_cast<GlobalVariable>(GetUnderlyingObject(Addr, DL));
+ if (G != NULL && (!ClInitializers || GlobalIsLinkerInitialized(G)) &&
+ isSafeAccess(ObjSizeVis, Addr, TypeSize)) {
+ NumOptimizedAccessesToGlobalVar++;
+ return;
}
}
- Type *OrigPtrTy = Addr->getType();
- Type *OrigTy = cast<PointerType>(OrigPtrTy)->getElementType();
-
- assert(OrigTy->isSized());
- uint32_t TypeSize = DL->getTypeStoreSizeInBits(OrigTy);
-
- assert((TypeSize % 8) == 0);
+ if (ClOpt && ClOptStack) {
+ // A direct inbounds access to a stack variable is always valid.
+ if (isa<AllocaInst>(GetUnderlyingObject(Addr, DL)) &&
+ isSafeAccess(ObjSizeVis, Addr, TypeSize)) {
+ NumOptimizedAccessesToStackVar++;
+ return;
+ }
+ }
if (IsWrite)
NumInstrumentedWrites++;
@@ -895,23 +953,10 @@ void AddressSanitizer::instrumentMop(Instruction *I, bool UseCalls) {
if ((TypeSize == 8 || TypeSize == 16 || TypeSize == 32 || TypeSize == 64 ||
TypeSize == 128) &&
(Alignment >= Granularity || Alignment == 0 || Alignment >= TypeSize / 8))
- return instrumentAddress(I, I, Addr, TypeSize, IsWrite, nullptr, UseCalls);
- // Instrument unusual size or unusual alignment.
- // We can not do it with a single check, so we do 1-byte check for the first
- // and the last bytes. We call __asan_report_*_n(addr, real_size) to be able
- // to report the actual access size.
- IRBuilder<> IRB(I);
- Value *Size = ConstantInt::get(IntptrTy, TypeSize / 8);
- Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
- if (UseCalls) {
- IRB.CreateCall2(AsanMemoryAccessCallbackSized[IsWrite], AddrLong, Size);
- } else {
- Value *LastByte = IRB.CreateIntToPtr(
- IRB.CreateAdd(AddrLong, ConstantInt::get(IntptrTy, TypeSize / 8 - 1)),
- OrigPtrTy);
- instrumentAddress(I, I, Addr, 8, IsWrite, Size, false);
- instrumentAddress(I, I, LastByte, 8, IsWrite, Size, false);
- }
+ return instrumentAddress(I, I, Addr, TypeSize, IsWrite, nullptr, UseCalls,
+ Exp);
+ instrumentUnusualSizeOrAlignment(I, Addr, TypeSize, IsWrite, nullptr,
+ UseCalls, Exp);
}
// Validate the result of Module::getOrInsertFunction called for an interface
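The Exp value chosen above selects between two report-call shapes in generateCrashCode below: __asan_report_load1(addr) when Exp is zero, __asan_report_exp_load1(addr, exp) otherwise. Condensed into one helper (a sketch; the callback array matches the declaration earlier in this file):

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

static const size_t kNumberOfAccessSizes = 5;  // as defined above

static CallInst *emitReport(IRBuilder<> &IRB, Value *Addr, bool IsWrite,
                            size_t SizeIdx, uint32_t Exp,
                            Function *Cb[2][2][kNumberOfAccessSizes]) {
  if (Exp == 0)
    return IRB.CreateCall(Cb[IsWrite][0][SizeIdx], Addr);
  return IRB.CreateCall2(Cb[IsWrite][1][SizeIdx], Addr,
                         ConstantInt::get(IRB.getInt32Ty(), Exp));
}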
@@ -921,17 +966,34 @@ void AddressSanitizer::instrumentMop(Instruction *I, bool UseCalls) {
static Function *checkInterfaceFunction(Constant *FuncOrBitcast) {
if (isa<Function>(FuncOrBitcast)) return cast<Function>(FuncOrBitcast);
FuncOrBitcast->dump();
- report_fatal_error("trying to redefine an AddressSanitizer "
- "interface function");
+ report_fatal_error(
+ "trying to redefine an AddressSanitizer "
+ "interface function");
}
-Instruction *AddressSanitizer::generateCrashCode(
- Instruction *InsertBefore, Value *Addr,
- bool IsWrite, size_t AccessSizeIndex, Value *SizeArgument) {
+Instruction *AddressSanitizer::generateCrashCode(Instruction *InsertBefore,
+ Value *Addr, bool IsWrite,
+ size_t AccessSizeIndex,
+ Value *SizeArgument,
+ uint32_t Exp) {
IRBuilder<> IRB(InsertBefore);
- CallInst *Call = SizeArgument
- ? IRB.CreateCall2(AsanErrorCallbackSized[IsWrite], Addr, SizeArgument)
- : IRB.CreateCall(AsanErrorCallback[IsWrite][AccessSizeIndex], Addr);
+ Value *ExpVal = Exp == 0 ? nullptr : ConstantInt::get(IRB.getInt32Ty(), Exp);
+ CallInst *Call = nullptr;
+ if (SizeArgument) {
+ if (Exp == 0)
+ Call = IRB.CreateCall2(AsanErrorCallbackSized[IsWrite][0], Addr,
+ SizeArgument);
+ else
+ Call = IRB.CreateCall3(AsanErrorCallbackSized[IsWrite][1], Addr,
+ SizeArgument, ExpVal);
+ } else {
+ if (Exp == 0)
+ Call =
+ IRB.CreateCall(AsanErrorCallback[IsWrite][0][AccessSizeIndex], Addr);
+ else
+ Call = IRB.CreateCall2(AsanErrorCallback[IsWrite][1][AccessSizeIndex],
+ Addr, ExpVal);
+ }
  // We don't do Call->setDoesNotReturn() because the BB already has an
  // UnreachableInst at the end.
@@ -941,19 +1003,19 @@ Instruction *AddressSanitizer::generateCrashCode(
}
Value *AddressSanitizer::createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong,
- Value *ShadowValue,
- uint32_t TypeSize) {
+ Value *ShadowValue,
+ uint32_t TypeSize) {
size_t Granularity = 1 << Mapping.Scale;
// Addr & (Granularity - 1)
- Value *LastAccessedByte = IRB.CreateAnd(
- AddrLong, ConstantInt::get(IntptrTy, Granularity - 1));
+ Value *LastAccessedByte =
+ IRB.CreateAnd(AddrLong, ConstantInt::get(IntptrTy, Granularity - 1));
// (Addr & (Granularity - 1)) + size - 1
if (TypeSize / 8 > 1)
LastAccessedByte = IRB.CreateAdd(
LastAccessedByte, ConstantInt::get(IntptrTy, TypeSize / 8 - 1));
// (uint8_t) ((Addr & (Granularity-1)) + size - 1)
- LastAccessedByte = IRB.CreateIntCast(
- LastAccessedByte, ShadowValue->getType(), false);
+ LastAccessedByte =
+ IRB.CreateIntCast(LastAccessedByte, ShadowValue->getType(), false);
// ((uint8_t) ((Addr & (Granularity-1)) + size - 1)) >= ShadowValue
return IRB.CreateICmpSGE(LastAccessedByte, ShadowValue);
}
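To make the comparison built above concrete, here is a standalone model of the slow-path check (a sketch, not part of the patch; it assumes the default shadow granularity of 8 bytes, i.e. Mapping.Scale == 3):

    #include <cstdint>
    #include <cstdio>

    // Shadow byte semantics: 0 means the whole 8-byte granule is addressable;
    // k in 1..7 means only the first k bytes are. The access is reported when
    // the in-granule offset of its last byte reaches k.
    static bool SlowPathReports(uint64_t Addr, uint32_t AccessSizeInBytes,
                                int8_t ShadowValue) {
      const uint64_t Granularity = 8; // assumed: Mapping.Scale == 3
      int8_t LastAccessedByte =
          (int8_t)((Addr & (Granularity - 1)) + AccessSizeInBytes - 1);
      return LastAccessedByte >= ShadowValue; // signed, like CreateICmpSGE
    }

    int main() {
      // Granule with 5 addressable bytes; the access starts at offset 3.
      std::printf("%d\n", SlowPathReports(0x1003, 2, 5)); // 0: bytes 3..4 ok
      std::printf("%d\n", SlowPathReports(0x1003, 3, 5)); // 1: byte 5 poisoned
    }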
@@ -961,24 +1023,29 @@ Value *AddressSanitizer::createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong,
void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
Instruction *InsertBefore, Value *Addr,
uint32_t TypeSize, bool IsWrite,
- Value *SizeArgument, bool UseCalls) {
+ Value *SizeArgument, bool UseCalls,
+ uint32_t Exp) {
IRBuilder<> IRB(InsertBefore);
Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize);
if (UseCalls) {
- IRB.CreateCall(AsanMemoryAccessCallback[IsWrite][AccessSizeIndex],
- AddrLong);
+ if (Exp == 0)
+ IRB.CreateCall(AsanMemoryAccessCallback[IsWrite][0][AccessSizeIndex],
+ AddrLong);
+ else
+ IRB.CreateCall2(AsanMemoryAccessCallback[IsWrite][1][AccessSizeIndex],
+ AddrLong, ConstantInt::get(IRB.getInt32Ty(), Exp));
return;
}
- Type *ShadowTy = IntegerType::get(
- *C, std::max(8U, TypeSize >> Mapping.Scale));
+ Type *ShadowTy =
+ IntegerType::get(*C, std::max(8U, TypeSize >> Mapping.Scale));
Type *ShadowPtrTy = PointerType::get(ShadowTy, 0);
Value *ShadowPtr = memToShadow(AddrLong, IRB);
Value *CmpVal = Constant::getNullValue(ShadowTy);
- Value *ShadowValue = IRB.CreateLoad(
- IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy));
+ Value *ShadowValue =
+ IRB.CreateLoad(IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy));
Value *Cmp = IRB.CreateICmpNE(ShadowValue, CmpVal);
size_t Granularity = 1 << Mapping.Scale;
@@ -987,9 +1054,8 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
if (ClAlwaysSlowPath || (TypeSize < 8 * Granularity)) {
// We use branch weights for the slow path check, to indicate that the slow
// path is rarely taken. This seems to be the case for SPEC benchmarks.
- TerminatorInst *CheckTerm =
- SplitBlockAndInsertIfThen(Cmp, InsertBefore, false,
- MDBuilder(*C).createBranchWeights(1, 100000));
+ TerminatorInst *CheckTerm = SplitBlockAndInsertIfThen(
+ Cmp, InsertBefore, false, MDBuilder(*C).createBranchWeights(1, 100000));
assert(dyn_cast<BranchInst>(CheckTerm)->isUnconditional());
BasicBlock *NextBB = CheckTerm->getSuccessor(0);
IRB.SetInsertPoint(CheckTerm);
@@ -1003,11 +1069,37 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
CrashTerm = SplitBlockAndInsertIfThen(Cmp, InsertBefore, true);
}
- Instruction *Crash = generateCrashCode(
- CrashTerm, AddrLong, IsWrite, AccessSizeIndex, SizeArgument);
+ Instruction *Crash = generateCrashCode(CrashTerm, AddrLong, IsWrite,
+ AccessSizeIndex, SizeArgument, Exp);
Crash->setDebugLoc(OrigIns->getDebugLoc());
}
+// Instrument unusual size or unusual alignment.
+// We cannot do it with a single check, so we instead do a 1-byte check for
+// the first and the last bytes of the access. We call
+// __asan_report_*_n(addr, real_size) to be able to report the actual access
+// size.
+void AddressSanitizer::instrumentUnusualSizeOrAlignment(
+ Instruction *I, Value *Addr, uint32_t TypeSize, bool IsWrite,
+ Value *SizeArgument, bool UseCalls, uint32_t Exp) {
+ IRBuilder<> IRB(I);
+ Value *Size = ConstantInt::get(IntptrTy, TypeSize / 8);
+ Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
+ if (UseCalls) {
+ if (Exp == 0)
+ IRB.CreateCall2(AsanMemoryAccessCallbackSized[IsWrite][0], AddrLong,
+ Size);
+ else
+ IRB.CreateCall3(AsanMemoryAccessCallbackSized[IsWrite][1], AddrLong, Size,
+ ConstantInt::get(IRB.getInt32Ty(), Exp));
+ } else {
+ Value *LastByte = IRB.CreateIntToPtr(
+ IRB.CreateAdd(AddrLong, ConstantInt::get(IntptrTy, TypeSize / 8 - 1)),
+ Addr->getType());
+ instrumentAddress(I, I, Addr, 8, IsWrite, Size, false, Exp);
+ instrumentAddress(I, I, LastByte, 8, IsWrite, Size, false, Exp);
+ }
+}
+
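For example, on the non-calls path of the new helper (a sketch with made-up values), a 12-byte access yields two 1-byte checks plus the real size for the report:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // 96 bits (12 bytes) is not one of the 8/16/32/64/128-bit fast cases,
      // so only the first and the last byte are checked, and the true size is
      // forwarded so __asan_report_*_n can print it.
      uint64_t Addr = 0x2000;
      uint32_t TypeSize = 96; // in bits
      uint64_t FirstByte = Addr;
      uint64_t LastByte = Addr + TypeSize / 8 - 1;
      std::printf("1-byte checks at %#llx and %#llx, reported size %u\n",
                  (unsigned long long)FirstByte, (unsigned long long)LastByte,
                  TypeSize / 8);
    }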
void AddressSanitizerModule::poisonOneInitializer(Function &GlobalInit,
GlobalValue *ModuleName) {
// Set up the arguments to our poison/unpoison functions.
@@ -1029,12 +1121,11 @@ void AddressSanitizerModule::createInitializerPoisonCalls(
ConstantArray *CA = cast<ConstantArray>(GV->getInitializer());
for (Use &OP : CA->operands()) {
- if (isa<ConstantAggregateZero>(OP))
- continue;
+ if (isa<ConstantAggregateZero>(OP)) continue;
ConstantStruct *CS = cast<ConstantStruct>(OP);
// Must have a function or null ptr.
- if (Function* F = dyn_cast<Function>(CS->getOperand(1))) {
+ if (Function *F = dyn_cast<Function>(CS->getOperand(1))) {
if (F->getName() == kAsanModuleCtorName) continue;
ConstantInt *Priority = dyn_cast<ConstantInt>(CS->getOperand(0));
// Don't instrument CTORs that will run before asan.module_ctor.
@@ -1059,13 +1150,11 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) {
G->getLinkage() != GlobalVariable::PrivateLinkage &&
G->getLinkage() != GlobalVariable::InternalLinkage)
return false;
- if (G->hasComdat())
- return false;
+ if (G->hasComdat()) return false;
// Two problems with thread-locals:
// - The address of the main thread's copy can't be computed at link-time.
// - Need to poison all copies, not just the main thread's one.
- if (G->isThreadLocal())
- return false;
+ if (G->isThreadLocal()) return false;
// For now, just ignore this Global if the alignment is large.
if (G->getAlignment() > MinRedzoneSizeForGlobal()) return false;
@@ -1076,10 +1165,8 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) {
StringRef ParsedSegment, ParsedSection;
unsigned TAA = 0, StubSize = 0;
bool TAAParsed;
- std::string ErrorCode =
- MCSectionMachO::ParseSectionSpecifier(Section, ParsedSegment,
- ParsedSection, TAA, TAAParsed,
- StubSize);
+ std::string ErrorCode = MCSectionMachO::ParseSectionSpecifier(
+ Section, ParsedSegment, ParsedSection, TAA, TAAParsed, StubSize);
if (!ErrorCode.empty()) {
report_fatal_error("Invalid section specifier '" + ParsedSection +
"': " + ErrorCode + ".");
@@ -1140,12 +1227,11 @@ void AddressSanitizerModule::initializeCallbacks(Module &M) {
AsanUnpoisonGlobals->setLinkage(Function::ExternalLinkage);
// Declare functions that register/unregister globals.
AsanRegisterGlobals = checkInterfaceFunction(M.getOrInsertFunction(
- kAsanRegisterGlobalsName, IRB.getVoidTy(),
- IntptrTy, IntptrTy, nullptr));
+ kAsanRegisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
AsanRegisterGlobals->setLinkage(Function::ExternalLinkage);
- AsanUnregisterGlobals = checkInterfaceFunction(M.getOrInsertFunction(
- kAsanUnregisterGlobalsName,
- IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
+ AsanUnregisterGlobals = checkInterfaceFunction(
+ M.getOrInsertFunction(kAsanUnregisterGlobalsName, IRB.getVoidTy(),
+ IntptrTy, IntptrTy, nullptr));
AsanUnregisterGlobals->setLinkage(Function::ExternalLinkage);
}
@@ -1158,8 +1244,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) {
SmallVector<GlobalVariable *, 16> GlobalsToChange;
for (auto &G : M.globals()) {
- if (ShouldInstrumentGlobal(&G))
- GlobalsToChange.push_back(&G);
+ if (ShouldInstrumentGlobal(&G)) GlobalsToChange.push_back(&G);
}
size_t n = GlobalsToChange.size();
@@ -1184,8 +1269,9 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) {
  // We shouldn't merge identical module names, as this string serves as a
  // unique module ID at runtime.
GlobalVariable *ModuleName = createPrivateGlobalForString(
- M, M.getModuleIdentifier(), /*AllowMerging*/false);
+ M, M.getModuleIdentifier(), /*AllowMerging*/ false);
+ auto &DL = M.getDataLayout();
for (size_t i = 0; i < n; i++) {
static const uint64_t kMaxGlobalRedzone = 1 << 18;
GlobalVariable *G = GlobalsToChange[i];
@@ -1199,32 +1285,30 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) {
PointerType *PtrTy = cast<PointerType>(G->getType());
Type *Ty = PtrTy->getElementType();
- uint64_t SizeInBytes = DL->getTypeAllocSize(Ty);
+ uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
uint64_t MinRZ = MinRedzoneSizeForGlobal();
// MinRZ <= RZ <= kMaxGlobalRedzone
    // while trying to make RZ ~ 1/4 of SizeInBytes.
- uint64_t RZ = std::max(MinRZ,
- std::min(kMaxGlobalRedzone,
- (SizeInBytes / MinRZ / 4) * MinRZ));
+ uint64_t RZ = std::max(
+ MinRZ, std::min(kMaxGlobalRedzone, (SizeInBytes / MinRZ / 4) * MinRZ));
uint64_t RightRedzoneSize = RZ;
// Round up to MinRZ
- if (SizeInBytes % MinRZ)
- RightRedzoneSize += MinRZ - (SizeInBytes % MinRZ);
+ if (SizeInBytes % MinRZ) RightRedzoneSize += MinRZ - (SizeInBytes % MinRZ);
assert(((RightRedzoneSize + SizeInBytes) % MinRZ) == 0);
Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize);
StructType *NewTy = StructType::get(Ty, RightRedZoneTy, nullptr);
- Constant *NewInitializer = ConstantStruct::get(
- NewTy, G->getInitializer(),
- Constant::getNullValue(RightRedZoneTy), nullptr);
+ Constant *NewInitializer =
+ ConstantStruct::get(NewTy, G->getInitializer(),
+ Constant::getNullValue(RightRedZoneTy), nullptr);
// Create a new global variable with enough space for a redzone.
GlobalValue::LinkageTypes Linkage = G->getLinkage();
if (G->isConstant() && Linkage == GlobalValue::PrivateLinkage)
Linkage = GlobalValue::InternalLinkage;
- GlobalVariable *NewGlobal = new GlobalVariable(
- M, NewTy, G->isConstant(), Linkage,
- NewInitializer, "", G, G->getThreadLocalMode());
+ GlobalVariable *NewGlobal =
+ new GlobalVariable(M, NewTy, G->isConstant(), Linkage, NewInitializer,
+ "", G, G->getThreadLocalMode());
NewGlobal->copyAttributesFrom(G);
NewGlobal->setAlignment(MinRZ);
@@ -1253,8 +1337,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) {
ConstantExpr::getPointerCast(ModuleName, IntptrTy),
ConstantInt::get(IntptrTy, MD.IsDynInit), SourceLoc, nullptr);
- if (ClInitializers && MD.IsDynInit)
- HasDynamicallyInitializedGlobals = true;
+ if (ClInitializers && MD.IsDynInit) HasDynamicallyInitializedGlobals = true;
DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n");
}
@@ -1273,9 +1356,9 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) {
// We also need to unregister globals at the end, e.g. when a shared library
// gets closed.
- Function *AsanDtorFunction = Function::Create(
- FunctionType::get(Type::getVoidTy(*C), false),
- GlobalValue::InternalLinkage, kAsanModuleDtorName, &M);
+ Function *AsanDtorFunction =
+ Function::Create(FunctionType::get(Type::getVoidTy(*C), false),
+ GlobalValue::InternalLinkage, kAsanModuleDtorName, &M);
BasicBlock *AsanDtorBB = BasicBlock::Create(*C, "", AsanDtorFunction);
IRBuilder<> IRB_Dtor(ReturnInst::Create(*C, AsanDtorBB));
IRB_Dtor.CreateCall2(AsanUnregisterGlobals,
@@ -1288,12 +1371,8 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) {
}
bool AddressSanitizerModule::runOnModule(Module &M) {
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- if (!DLP)
- return false;
- DL = &DLP->getDataLayout();
C = &(M.getContext());
- int LongSize = DL->getPointerSizeInBits();
+ int LongSize = M.getDataLayout().getPointerSizeInBits();
IntptrTy = Type::getIntNTy(*C, LongSize);
TargetTriple = Triple(M.getTargetTriple());
Mapping = getShadowMapping(TargetTriple, LongSize);
@@ -1305,8 +1384,7 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
assert(CtorFunc);
IRBuilder<> IRB(CtorFunc->getEntryBlock().getTerminator());
- if (ClGlobals)
- Changed |= InstrumentGlobals(IRB, M);
+ if (ClGlobals) Changed |= InstrumentGlobals(IRB, M);
return Changed;
}
@@ -1314,33 +1392,34 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
void AddressSanitizer::initializeCallbacks(Module &M) {
IRBuilder<> IRB(*C);
// Create __asan_report* callbacks.
- for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) {
- for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
- AccessSizeIndex++) {
- // IsWrite and TypeSize are encoded in the function name.
- std::string Suffix =
- (AccessIsWrite ? "store" : "load") + itostr(1 << AccessSizeIndex);
- AsanErrorCallback[AccessIsWrite][AccessSizeIndex] =
- checkInterfaceFunction(
- M.getOrInsertFunction(kAsanReportErrorTemplate + Suffix,
- IRB.getVoidTy(), IntptrTy, nullptr));
- AsanMemoryAccessCallback[AccessIsWrite][AccessSizeIndex] =
- checkInterfaceFunction(
- M.getOrInsertFunction(ClMemoryAccessCallbackPrefix + Suffix,
- IRB.getVoidTy(), IntptrTy, nullptr));
+ // IsWrite, TypeSize and Exp are encoded in the function name.
+ for (int Exp = 0; Exp < 2; Exp++) {
+ for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) {
+ const std::string TypeStr = AccessIsWrite ? "store" : "load";
+ const std::string ExpStr = Exp ? "exp_" : "";
+ const Type *ExpType = Exp ? Type::getInt32Ty(*C) : nullptr;
+ AsanErrorCallbackSized[AccessIsWrite][Exp] =
+ checkInterfaceFunction(M.getOrInsertFunction(
+ kAsanReportErrorTemplate + ExpStr + TypeStr + "_n",
+ IRB.getVoidTy(), IntptrTy, IntptrTy, ExpType, nullptr));
+ AsanMemoryAccessCallbackSized[AccessIsWrite][Exp] =
+ checkInterfaceFunction(M.getOrInsertFunction(
+ ClMemoryAccessCallbackPrefix + ExpStr + TypeStr + "N",
+ IRB.getVoidTy(), IntptrTy, IntptrTy, ExpType, nullptr));
+ for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
+ AccessSizeIndex++) {
+ const std::string Suffix = TypeStr + itostr(1 << AccessSizeIndex);
+ AsanErrorCallback[AccessIsWrite][Exp][AccessSizeIndex] =
+ checkInterfaceFunction(M.getOrInsertFunction(
+ kAsanReportErrorTemplate + ExpStr + Suffix, IRB.getVoidTy(),
+ IntptrTy, ExpType, nullptr));
+ AsanMemoryAccessCallback[AccessIsWrite][Exp][AccessSizeIndex] =
+ checkInterfaceFunction(M.getOrInsertFunction(
+ ClMemoryAccessCallbackPrefix + ExpStr + Suffix, IRB.getVoidTy(),
+ IntptrTy, ExpType, nullptr));
+ }
}
}
- AsanErrorCallbackSized[0] = checkInterfaceFunction(M.getOrInsertFunction(
- kAsanReportLoadN, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
- AsanErrorCallbackSized[1] = checkInterfaceFunction(M.getOrInsertFunction(
- kAsanReportStoreN, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
-
- AsanMemoryAccessCallbackSized[0] = checkInterfaceFunction(
- M.getOrInsertFunction(ClMemoryAccessCallbackPrefix + "loadN",
- IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
- AsanMemoryAccessCallbackSized[1] = checkInterfaceFunction(
- M.getOrInsertFunction(ClMemoryAccessCallbackPrefix + "storeN",
- IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
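For reference, a sketch of the callback names the nested loops above generate, assuming the usual defaults kAsanReportErrorTemplate == "__asan_report_" and ClMemoryAccessCallbackPrefix == "__asan_" (both values are assumptions; neither appears in this hunk):

    // Illustrative prototypes only, not the runtime's real headers.
    #include <cstdint>
    typedef std::uintptr_t uptr; // stand-in for the runtime's uptr
    typedef std::uint32_t u32;

    extern "C" {
    void __asan_report_load4(uptr addr);              // Exp == 0, fixed size
    void __asan_report_exp_load4(uptr addr, u32 exp); // Exp == 1, fixed size
    void __asan_report_load_n(uptr addr, uptr size);  // Exp == 0, sized ("_n")
    void __asan_report_exp_load_n(uptr addr, uptr size, u32 exp);
    void __asan_load4(uptr addr);                     // access cb, Exp == 0
    void __asan_exp_load4(uptr addr, u32 exp);        // access cb, Exp == 1
    void __asan_loadN(uptr addr, uptr size);          // access cb, sized ("N")
    }

Note that the sized variants keep the old suffix asymmetry: "_n" for the report callbacks and "N" for the access callbacks, matching the explicit registrations deleted above.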
AsanMemmove = checkInterfaceFunction(M.getOrInsertFunction(
ClMemoryAccessCallbackPrefix + "memmove", IRB.getInt8PtrTy(),
@@ -1368,21 +1447,17 @@ void AddressSanitizer::initializeCallbacks(Module &M) {
// virtual
bool AddressSanitizer::doInitialization(Module &M) {
// Initialize the private fields. No one has accessed them before.
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- if (!DLP)
- report_fatal_error("data layout missing");
- DL = &DLP->getDataLayout();
GlobalsMD.init(M);
C = &(M.getContext());
- LongSize = DL->getPointerSizeInBits();
+ LongSize = M.getDataLayout().getPointerSizeInBits();
IntptrTy = Type::getIntNTy(*C, LongSize);
TargetTriple = Triple(M.getTargetTriple());
- AsanCtorFunction = Function::Create(
- FunctionType::get(Type::getVoidTy(*C), false),
- GlobalValue::InternalLinkage, kAsanModuleCtorName, &M);
+ AsanCtorFunction =
+ Function::Create(FunctionType::get(Type::getVoidTy(*C), false),
+ GlobalValue::InternalLinkage, kAsanModuleCtorName, &M);
BasicBlock *AsanCtorBB = BasicBlock::Create(*C, "", AsanCtorFunction);
// call __asan_init in the module ctor.
IRBuilder<> IRB(ReturnInst::Create(*C, AsanCtorBB));
@@ -1424,22 +1499,21 @@ bool AddressSanitizer::runOnFunction(Function &F) {
// If needed, insert __asan_init before checking for SanitizeAddress attr.
maybeInsertAsanInitAtFunctionEntry(F);
- if (!F.hasFnAttribute(Attribute::SanitizeAddress))
- return false;
+ if (!F.hasFnAttribute(Attribute::SanitizeAddress)) return false;
- if (!ClDebugFunc.empty() && ClDebugFunc != F.getName())
- return false;
+ if (!ClDebugFunc.empty() && ClDebugFunc != F.getName()) return false;
// We want to instrument every address only once per basic block (unless there
// are calls between uses).
- SmallSet<Value*, 16> TempsToInstrument;
- SmallVector<Instruction*, 16> ToInstrument;
- SmallVector<Instruction*, 8> NoReturnCalls;
- SmallVector<BasicBlock*, 16> AllBlocks;
- SmallVector<Instruction*, 16> PointerComparisonsOrSubtracts;
+ SmallSet<Value *, 16> TempsToInstrument;
+ SmallVector<Instruction *, 16> ToInstrument;
+ SmallVector<Instruction *, 8> NoReturnCalls;
+ SmallVector<BasicBlock *, 16> AllBlocks;
+ SmallVector<Instruction *, 16> PointerComparisonsOrSubtracts;
int NumAllocas = 0;
bool IsWrite;
unsigned Alignment;
+ uint64_t TypeSize;
// Fill the set of memory operations to instrument.
for (auto &BB : F) {
@@ -1448,8 +1522,8 @@ bool AddressSanitizer::runOnFunction(Function &F) {
int NumInsnsPerBB = 0;
for (auto &Inst : BB) {
if (LooksLikeCodeInBug11395(&Inst)) return false;
- if (Value *Addr =
- isInterestingMemoryAccess(&Inst, &IsWrite, &Alignment)) {
+ if (Value *Addr = isInterestingMemoryAccess(&Inst, &IsWrite, &TypeSize,
+ &Alignment)) {
if (ClOpt && ClOptSameTemp) {
if (!TempsToInstrument.insert(Addr).second)
continue; // We've seen this temp in the current BB.
@@ -1461,21 +1535,18 @@ bool AddressSanitizer::runOnFunction(Function &F) {
} else if (isa<MemIntrinsic>(Inst)) {
// ok, take it.
} else {
- if (isa<AllocaInst>(Inst))
- NumAllocas++;
+ if (isa<AllocaInst>(Inst)) NumAllocas++;
CallSite CS(&Inst);
if (CS) {
// A call inside BB.
TempsToInstrument.clear();
- if (CS.doesNotReturn())
- NoReturnCalls.push_back(CS.getInstruction());
+ if (CS.doesNotReturn()) NoReturnCalls.push_back(CS.getInstruction());
}
continue;
}
ToInstrument.push_back(&Inst);
NumInsnsPerBB++;
- if (NumInsnsPerBB >= ClMaxInsnsToInstrumentPerBB)
- break;
+ if (NumInsnsPerBB >= ClMaxInsnsToInstrumentPerBB) break;
}
}
@@ -1484,13 +1555,20 @@ bool AddressSanitizer::runOnFunction(Function &F) {
ToInstrument.size() > (unsigned)ClInstrumentationWithCallsThreshold)
UseCalls = true;
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ ObjectSizeOffsetVisitor ObjSizeVis(DL, TLI, F.getContext(),
+ /*RoundToAlign=*/true);
+
// Instrument.
int NumInstrumented = 0;
for (auto Inst : ToInstrument) {
if (ClDebugMin < 0 || ClDebugMax < 0 ||
(NumInstrumented >= ClDebugMin && NumInstrumented <= ClDebugMax)) {
- if (isInterestingMemoryAccess(Inst, &IsWrite, &Alignment))
- instrumentMop(Inst, UseCalls);
+ if (isInterestingMemoryAccess(Inst, &IsWrite, &TypeSize, &Alignment))
+ instrumentMop(ObjSizeVis, Inst, UseCalls,
+ F.getParent()->getDataLayout());
else
instrumentMemIntrinsic(cast<MemIntrinsic>(Inst));
}
@@ -1549,10 +1627,9 @@ void FunctionStackPoisoner::initializeCallbacks(Module &M) {
IntptrTy, IntptrTy, nullptr));
}
-void
-FunctionStackPoisoner::poisonRedZones(ArrayRef<uint8_t> ShadowBytes,
- IRBuilder<> &IRB, Value *ShadowBase,
- bool DoPoison) {
+void FunctionStackPoisoner::poisonRedZones(ArrayRef<uint8_t> ShadowBytes,
+ IRBuilder<> &IRB, Value *ShadowBase,
+ bool DoPoison) {
size_t n = ShadowBytes.size();
size_t i = 0;
// We need to (un)poison n bytes of stack shadow. Poison as many as we can
@@ -1563,7 +1640,7 @@ FunctionStackPoisoner::poisonRedZones(ArrayRef<uint8_t> ShadowBytes,
for (; i + LargeStoreSizeInBytes - 1 < n; i += LargeStoreSizeInBytes) {
uint64_t Val = 0;
for (size_t j = 0; j < LargeStoreSizeInBytes; j++) {
- if (ASan.DL->isLittleEndian())
+ if (F.getParent()->getDataLayout().isLittleEndian())
Val |= (uint64_t)ShadowBytes[i + j] << (8 * j);
else
Val = (Val << 8) | ShadowBytes[i + j];
@@ -1582,9 +1659,8 @@ FunctionStackPoisoner::poisonRedZones(ArrayRef<uint8_t> ShadowBytes,
static int StackMallocSizeClass(uint64_t LocalStackSize) {
assert(LocalStackSize <= kMaxStackMallocSize);
uint64_t MaxSize = kMinStackMallocSize;
- for (int i = 0; ; i++, MaxSize *= 2)
- if (LocalStackSize <= MaxSize)
- return i;
+ for (int i = 0;; i++, MaxSize *= 2)
+ if (LocalStackSize <= MaxSize) return i;
llvm_unreachable("impossible LocalStackSize");
}
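To see the mapping this computes, here is a standalone model (the two constants are assumptions based on typical ASan defaults; their definitions lie outside this hunk):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    static const uint64_t kMinStackMallocSize = 1ULL << 6;  // assumed: 64 B
    static const uint64_t kMaxStackMallocSize = 1ULL << 16; // assumed: 64 KiB

    static int SizeClass(uint64_t LocalStackSize) {
      assert(LocalStackSize <= kMaxStackMallocSize);
      uint64_t MaxSize = kMinStackMallocSize;
      for (int i = 0;; i++, MaxSize *= 2)
        if (LocalStackSize <= MaxSize) return i; // class i covers <= 64 * 2^i
    }

    int main() {
      std::printf("%d %d %d\n", SizeClass(64),   // 0
                  SizeClass(65),                 // 1 (fits in 128)
                  SizeClass(1000));              // 4 (fits in 1024)
    }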
@@ -1596,18 +1672,21 @@ static int StackMallocSizeClass(uint64_t LocalStackSize) {
void FunctionStackPoisoner::SetShadowToStackAfterReturnInlined(
IRBuilder<> &IRB, Value *ShadowBase, int Size) {
assert(!(Size % 8));
- assert(kAsanStackAfterReturnMagic == 0xf5);
+
+ // kAsanStackAfterReturnMagic is 0xf5.
+ const uint64_t kAsanStackAfterReturnMagic64 = 0xf5f5f5f5f5f5f5f5ULL;
+
for (int i = 0; i < Size; i += 8) {
Value *p = IRB.CreateAdd(ShadowBase, ConstantInt::get(IntptrTy, i));
- IRB.CreateStore(ConstantInt::get(IRB.getInt64Ty(), 0xf5f5f5f5f5f5f5f5ULL),
- IRB.CreateIntToPtr(p, IRB.getInt64Ty()->getPointerTo()));
+ IRB.CreateStore(
+ ConstantInt::get(IRB.getInt64Ty(), kAsanStackAfterReturnMagic64),
+ IRB.CreateIntToPtr(p, IRB.getInt64Ty()->getPointerTo()));
}
}
static DebugLoc getFunctionEntryDebugLocation(Function &F) {
for (const auto &Inst : F.getEntryBlock())
- if (!isa<AllocaInst>(Inst))
- return Inst.getDebugLoc();
+ if (!isa<AllocaInst>(Inst)) return Inst.getDebugLoc();
return DebugLoc();
}
@@ -1664,9 +1743,9 @@ void FunctionStackPoisoner::poisonStack() {
SmallVector<ASanStackVariableDescription, 16> SVD;
SVD.reserve(AllocaVec.size());
for (AllocaInst *AI : AllocaVec) {
- ASanStackVariableDescription D = { AI->getName().data(),
- getAllocaSizeInBytes(AI),
- AI->getAlignment(), AI, 0};
+ ASanStackVariableDescription D = {AI->getName().data(),
+ ASan.getAllocaSizeInBytes(AI),
+ AI->getAlignment(), AI, 0};
SVD.push_back(D);
}
// Minimal header size (left redzone) is 4 pointers,
@@ -1757,19 +1836,19 @@ void FunctionStackPoisoner::poisonStack() {
BasePlus0);
// Write the frame description constant to redzone[1].
Value *BasePlus1 = IRB.CreateIntToPtr(
- IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, ASan.LongSize/8)),
- IntptrPtrTy);
+ IRB.CreateAdd(LocalStackBase,
+ ConstantInt::get(IntptrTy, ASan.LongSize / 8)),
+ IntptrPtrTy);
GlobalVariable *StackDescriptionGlobal =
createPrivateGlobalForString(*F.getParent(), L.DescriptionString,
- /*AllowMerging*/true);
- Value *Description = IRB.CreatePointerCast(StackDescriptionGlobal,
- IntptrTy);
+ /*AllowMerging*/ true);
+ Value *Description = IRB.CreatePointerCast(StackDescriptionGlobal, IntptrTy);
IRB.CreateStore(Description, BasePlus1);
// Write the PC to redzone[2].
Value *BasePlus2 = IRB.CreateIntToPtr(
- IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy,
- 2 * ASan.LongSize/8)),
- IntptrPtrTy);
+ IRB.CreateAdd(LocalStackBase,
+ ConstantInt::get(IntptrTy, 2 * ASan.LongSize / 8)),
+ IntptrPtrTy);
IRB.CreateStore(IRB.CreatePointerCast(&F, IntptrTy), BasePlus2);
// Poison the stack redzones at the entry.
@@ -1830,8 +1909,7 @@ void FunctionStackPoisoner::poisonStack() {
}
// We are done. Remove the old unused alloca instructions.
- for (auto AI : AllocaVec)
- AI->eraseFromParent();
+ for (auto AI : AllocaVec) AI->eraseFromParent();
}
void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size,
@@ -1839,9 +1917,9 @@ void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size,
// For now just insert the call to ASan runtime.
Value *AddrArg = IRB.CreatePointerCast(V, IntptrTy);
Value *SizeArg = ConstantInt::get(IntptrTy, Size);
- IRB.CreateCall2(DoPoison ? AsanPoisonStackMemoryFunc
- : AsanUnpoisonStackMemoryFunc,
- AddrArg, SizeArg);
+ IRB.CreateCall2(
+ DoPoison ? AsanPoisonStackMemoryFunc : AsanUnpoisonStackMemoryFunc,
+ AddrArg, SizeArg);
}
// Handling llvm.lifetime intrinsics for a given %alloca:
@@ -1856,12 +1934,11 @@ void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size,
AllocaInst *FunctionStackPoisoner::findAllocaForValue(Value *V) {
if (AllocaInst *AI = dyn_cast<AllocaInst>(V))
    // We're interested only in allocas we can handle.
- return isInterestingAlloca(*AI) ? AI : nullptr;
+ return ASan.isInterestingAlloca(*AI) ? AI : nullptr;
// See if we've already calculated (or started to calculate) alloca for a
// given value.
AllocaForValueMapTy::iterator I = AllocaForValue.find(V);
- if (I != AllocaForValue.end())
- return I->second;
+ if (I != AllocaForValue.end()) return I->second;
// Store 0 while we're calculating alloca for value V to avoid
// infinite recursion if the value references itself.
AllocaForValue[V] = nullptr;
@@ -1880,8 +1957,7 @@ AllocaInst *FunctionStackPoisoner::findAllocaForValue(Value *V) {
Res = IncValueAI;
}
}
- if (Res)
- AllocaForValue[V] = Res;
+ if (Res) AllocaForValue[V] = Res;
return Res;
}
@@ -1912,14 +1988,14 @@ Value *FunctionStackPoisoner::computePartialRzMagic(Value *PartialSize,
Value *Shift = IRB.CreateAnd(PartialSize, IRB.getInt32(~7));
unsigned Val1Int = kAsanAllocaPartialVal1;
unsigned Val2Int = kAsanAllocaPartialVal2;
- if (!ASan.DL->isLittleEndian()) {
+ if (!F.getParent()->getDataLayout().isLittleEndian()) {
Val1Int = sys::getSwappedBytes(Val1Int);
Val2Int = sys::getSwappedBytes(Val2Int);
}
Value *Val1 = shiftAllocaMagic(IRB.getInt32(Val1Int), IRB, Shift);
Value *PartialBits = IRB.CreateAnd(PartialSize, IRB.getInt32(7));
// For BigEndian get 0x000000YZ -> 0xYZ000000.
- if (ASan.DL->isBigEndian())
+ if (F.getParent()->getDataLayout().isBigEndian())
PartialBits = IRB.CreateShl(PartialBits, IRB.getInt32(24));
Value *Val2 = IRB.getInt32(Val2Int);
Value *Cond =
@@ -1953,7 +2029,8 @@ void FunctionStackPoisoner::handleDynamicAllocaCall(
// redzones, and OldSize is number of allocated blocks with
// ElementSize size, get allocated memory size in bytes by
// OldSize * ElementSize.
- unsigned ElementSize = ASan.DL->getTypeAllocSize(AI->getAllocatedType());
+ unsigned ElementSize =
+ F.getParent()->getDataLayout().getTypeAllocSize(AI->getAllocatedType());
Value *OldSize = IRB.CreateMul(AI->getArraySize(),
ConstantInt::get(IntptrTy, ElementSize));
@@ -2021,3 +2098,20 @@ void FunctionStackPoisoner::handleDynamicAllocaCall(
AI->eraseFromParent();
NumInstrumentedDynamicAllocas++;
}
+
+// isSafeAccess returns true if Addr is always inbounds with respect to its
+// base object. For example, it is a field access or an array access with
+// constant inbounds index.
+bool AddressSanitizer::isSafeAccess(ObjectSizeOffsetVisitor &ObjSizeVis,
+ Value *Addr, uint64_t TypeSize) const {
+ SizeOffsetType SizeOffset = ObjSizeVis.compute(Addr);
+ if (!ObjSizeVis.bothKnown(SizeOffset)) return false;
+ uint64_t Size = SizeOffset.first.getZExtValue();
+ int64_t Offset = SizeOffset.second.getSExtValue();
+ // Three checks are required to ensure safety:
+ // . Offset >= 0 (since the offset is given from the base ptr)
+ // . Size >= Offset (unsigned)
+  // . Size - Offset >= NeededSize  (unsigned), where NeededSize is
+  //   TypeSize / 8
+ return Offset >= 0 && Size >= uint64_t(Offset) &&
+ Size - uint64_t(Offset) >= TypeSize / 8;
+}
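A minimal standalone sketch of why the three comparisons must be done in this order (illustrative values only):

    #include <cstdint>
    #include <cstdio>

    static bool IsSafe(int64_t Offset, uint64_t Size, uint64_t NeededSize) {
      // Checking Size - Offset >= NeededSize only after Offset >= 0 and
      // Size >= Offset guarantees the unsigned subtraction cannot wrap.
      return Offset >= 0 && Size >= uint64_t(Offset) &&
             Size - uint64_t(Offset) >= NeededSize;
    }

    int main() {
      std::printf("%d\n", IsSafe(8, 16, 8));  // 1: bytes 8..15 fit
      std::printf("%d\n", IsSafe(12, 16, 8)); // 0: only 4 bytes left
      std::printf("%d\n", IsSafe(-4, 16, 4)); // 0: access before the object
    }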
diff --git a/lib/Transforms/Instrumentation/BoundsChecking.cpp b/lib/Transforms/Instrumentation/BoundsChecking.cpp
index 2b5f39c..8113834 100644
--- a/lib/Transforms/Instrumentation/BoundsChecking.cpp
+++ b/lib/Transforms/Instrumentation/BoundsChecking.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/TargetFolder.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
@@ -24,7 +25,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
using namespace llvm;
#define DEBUG_TYPE "bounds-checking"
@@ -49,12 +49,10 @@ namespace {
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<DataLayoutPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
}
private:
- const DataLayout *DL;
const TargetLibraryInfo *TLI;
ObjectSizeOffsetEvaluator *ObjSizeEval;
BuilderTy *Builder;
@@ -63,7 +61,7 @@ namespace {
BasicBlock *getTrapBB();
void emitBranchToTrap(Value *Cmp = nullptr);
- bool instrument(Value *Ptr, Value *Val);
+ bool instrument(Value *Ptr, Value *Val, const DataLayout &DL);
};
}
@@ -125,8 +123,9 @@ void BoundsChecking::emitBranchToTrap(Value *Cmp) {
/// result from the load or the value being stored. It is used to determine the
/// size of memory block that is touched.
/// Returns true if any change was made to the IR, false otherwise.
-bool BoundsChecking::instrument(Value *Ptr, Value *InstVal) {
- uint64_t NeededSize = DL->getTypeStoreSize(InstVal->getType());
+bool BoundsChecking::instrument(Value *Ptr, Value *InstVal,
+ const DataLayout &DL) {
+ uint64_t NeededSize = DL.getTypeStoreSize(InstVal->getType());
DEBUG(dbgs() << "Instrument " << *Ptr << " for " << Twine(NeededSize)
<< " bytes\n");
@@ -141,7 +140,7 @@ bool BoundsChecking::instrument(Value *Ptr, Value *InstVal) {
Value *Offset = SizeOffset.second;
ConstantInt *SizeCI = dyn_cast<ConstantInt>(Size);
- Type *IntTy = DL->getIntPtrType(Ptr->getType());
+ Type *IntTy = DL.getIntPtrType(Ptr->getType());
Value *NeededSizeVal = ConstantInt::get(IntTy, NeededSize);
// three checks are required to ensure safety:
@@ -165,7 +164,7 @@ bool BoundsChecking::instrument(Value *Ptr, Value *InstVal) {
}
bool BoundsChecking::runOnFunction(Function &F) {
- DL = &getAnalysis<DataLayoutPass>().getDataLayout();
+ const DataLayout &DL = F.getParent()->getDataLayout();
TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
TrapBB = nullptr;
@@ -192,13 +191,16 @@ bool BoundsChecking::runOnFunction(Function &F) {
Builder->SetInsertPoint(Inst);
if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
- MadeChange |= instrument(LI->getPointerOperand(), LI);
+ MadeChange |= instrument(LI->getPointerOperand(), LI, DL);
} else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
- MadeChange |= instrument(SI->getPointerOperand(), SI->getValueOperand());
+ MadeChange |=
+ instrument(SI->getPointerOperand(), SI->getValueOperand(), DL);
} else if (AtomicCmpXchgInst *AI = dyn_cast<AtomicCmpXchgInst>(Inst)) {
- MadeChange |= instrument(AI->getPointerOperand(),AI->getCompareOperand());
+ MadeChange |=
+ instrument(AI->getPointerOperand(), AI->getCompareOperand(), DL);
} else if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst)) {
- MadeChange |= instrument(AI->getPointerOperand(), AI->getValOperand());
+ MadeChange |=
+ instrument(AI->getPointerOperand(), AI->getValOperand(), DL);
} else {
llvm_unreachable("unknown Instruction type");
}
diff --git a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index 6adf0d2..b3925ee 100644
--- a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -217,7 +217,6 @@ class DataFlowSanitizer : public ModulePass {
WK_Custom
};
- const DataLayout *DL;
Module *Mod;
LLVMContext *Ctx;
IntegerType *ShadowTy;
@@ -422,16 +421,13 @@ bool DataFlowSanitizer::doInitialization(Module &M) {
bool IsMIPS64 = TargetTriple.getArch() == llvm::Triple::mips64 ||
TargetTriple.getArch() == llvm::Triple::mips64el;
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- if (!DLP)
- report_fatal_error("data layout missing");
- DL = &DLP->getDataLayout();
+ const DataLayout &DL = M.getDataLayout();
Mod = &M;
Ctx = &M.getContext();
ShadowTy = IntegerType::get(*Ctx, ShadowWidth);
ShadowPtrTy = PointerType::getUnqual(ShadowTy);
- IntptrTy = DL->getIntPtrType(*Ctx);
+ IntptrTy = DL.getIntPtrType(*Ctx);
ZeroShadow = ConstantInt::getSigned(ShadowTy, 0);
ShadowPtrMul = ConstantInt::getSigned(IntptrTy, ShadowWidth / 8);
if (IsX86_64)
@@ -593,9 +589,6 @@ Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT,
}
bool DataFlowSanitizer::runOnModule(Module &M) {
- if (!DL)
- return false;
-
if (ABIList.isIn(M, "skip"))
return false;
@@ -1056,7 +1049,7 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align,
uint64_t ShadowAlign = Align * DFS.ShadowWidth / 8;
SmallVector<Value *, 2> Objs;
- GetUnderlyingObjects(Addr, Objs, DFS.DL);
+ GetUnderlyingObjects(Addr, Objs, Pos->getModule()->getDataLayout());
bool AllConstants = true;
for (SmallVector<Value *, 2>::iterator i = Objs.begin(), e = Objs.end();
i != e; ++i) {
@@ -1157,7 +1150,8 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align,
}
void DFSanVisitor::visitLoadInst(LoadInst &LI) {
- uint64_t Size = DFSF.DFS.DL->getTypeStoreSize(LI.getType());
+ auto &DL = LI.getModule()->getDataLayout();
+ uint64_t Size = DL.getTypeStoreSize(LI.getType());
if (Size == 0) {
DFSF.setShadow(&LI, DFSF.DFS.ZeroShadow);
return;
@@ -1167,7 +1161,7 @@ void DFSanVisitor::visitLoadInst(LoadInst &LI) {
if (ClPreserveAlignment) {
Align = LI.getAlignment();
if (Align == 0)
- Align = DFSF.DFS.DL->getABITypeAlignment(LI.getType());
+ Align = DL.getABITypeAlignment(LI.getType());
} else {
Align = 1;
}
@@ -1235,8 +1229,8 @@ void DFSanFunction::storeShadow(Value *Addr, uint64_t Size, uint64_t Align,
}
void DFSanVisitor::visitStoreInst(StoreInst &SI) {
- uint64_t Size =
- DFSF.DFS.DL->getTypeStoreSize(SI.getValueOperand()->getType());
+ auto &DL = SI.getModule()->getDataLayout();
+ uint64_t Size = DL.getTypeStoreSize(SI.getValueOperand()->getType());
if (Size == 0)
return;
@@ -1244,7 +1238,7 @@ void DFSanVisitor::visitStoreInst(StoreInst &SI) {
if (ClPreserveAlignment) {
Align = SI.getAlignment();
if (Align == 0)
- Align = DFSF.DFS.DL->getABITypeAlignment(SI.getValueOperand()->getType());
+ Align = DL.getABITypeAlignment(SI.getValueOperand()->getType());
} else {
Align = 1;
}
diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index cb965fb..a793e69 100644
--- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -47,6 +47,8 @@ using namespace llvm;
static cl::opt<std::string>
DefaultGCOVVersion("default-gcov-version", cl::init("402*"), cl::Hidden,
cl::ValueRequired);
+static cl::opt<bool> DefaultExitBlockBeforeBody("gcov-exit-block-before-body",
+ cl::init(false), cl::Hidden);
GCOVOptions GCOVOptions::getDefault() {
GCOVOptions Options;
@@ -55,6 +57,7 @@ GCOVOptions GCOVOptions::getDefault() {
Options.UseCfgChecksum = false;
Options.NoRedZone = false;
Options.FunctionNamesInData = true;
+ Options.ExitBlockBeforeBody = DefaultExitBlockBeforeBody;
if (DefaultGCOVVersion.size() != 4) {
llvm::report_fatal_error(std::string("Invalid -default-gcov-version: ") +
@@ -70,20 +73,10 @@ namespace {
class GCOVProfiler : public ModulePass {
public:
static char ID;
- GCOVProfiler() : ModulePass(ID), Options(GCOVOptions::getDefault()) {
- init();
- }
- GCOVProfiler(const GCOVOptions &Options) : ModulePass(ID), Options(Options){
+ GCOVProfiler() : GCOVProfiler(GCOVOptions::getDefault()) {}
+ GCOVProfiler(const GCOVOptions &Opts) : ModulePass(ID), Options(Opts) {
assert((Options.EmitNotes || Options.EmitData) &&
"GCOVProfiler asked to do nothing?");
- init();
- }
- const char *getPassName() const override {
- return "GCOV Profiler";
- }
-
- private:
- void init() {
ReversedVersion[0] = Options.Version[3];
ReversedVersion[1] = Options.Version[2];
ReversedVersion[2] = Options.Version[1];
@@ -91,6 +84,11 @@ namespace {
ReversedVersion[4] = '\0';
initializeGCOVProfilerPass(*PassRegistry::getPassRegistry());
}
+ const char *getPassName() const override {
+ return "GCOV Profiler";
+ }
+
+ private:
bool runOnModule(Module &M) override;
// Create the .gcno files for the Module based on DebugInfo.
@@ -312,7 +310,7 @@ namespace {
class GCOVFunction : public GCOVRecord {
public:
GCOVFunction(DISubprogram SP, raw_ostream *os, uint32_t Ident,
- bool UseCfgChecksum)
+ bool UseCfgChecksum, bool ExitBlockBeforeBody)
: SP(SP), Ident(Ident), UseCfgChecksum(UseCfgChecksum), CfgChecksum(0),
ReturnBlock(1, os) {
this->os = os;
@@ -322,11 +320,13 @@ namespace {
uint32_t i = 0;
for (auto &BB : *F) {
- // Skip index 1 (0, 2, 3, 4, ...) because that's assigned to the
- // ReturnBlock.
- bool first = i == 0;
- Blocks.insert(std::make_pair(&BB, GCOVBlock(i++ + !first, os)));
+ // Skip index 1 if it's assigned to the ReturnBlock.
+ if (i == 1 && ExitBlockBeforeBody)
+ ++i;
+ Blocks.insert(std::make_pair(&BB, GCOVBlock(i++, os)));
}
+ if (!ExitBlockBeforeBody)
+ ReturnBlock.Number = i;
std::string FunctionNameAndLine;
raw_string_ostream FNLOS(FunctionNameAndLine);
@@ -469,7 +469,7 @@ static bool functionHasLines(Function *F) {
if (Loc.isUnknown()) continue;
// Artificial lines such as calls to the global constructors.
- if (Loc.getLine() == 0) continue;
+ if (Loc.getLine() == 0) continue;
return true;
}
@@ -513,7 +513,8 @@ void GCOVProfiler::emitProfileNotes() {
EntryBlock.splitBasicBlock(It);
Funcs.push_back(make_unique<GCOVFunction>(SP, &out, FunctionIdent++,
- Options.UseCfgChecksum));
+ Options.UseCfgChecksum,
+ Options.ExitBlockBeforeBody));
GCOVFunction &Func = *Funcs.back();
for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 4152679..c2aa1e2 100644
--- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -274,7 +274,6 @@ class MemorySanitizer : public FunctionPass {
MemorySanitizer(int TrackOrigins = 0)
: FunctionPass(ID),
TrackOrigins(std::max(TrackOrigins, (int)ClTrackOrigins)),
- DL(nullptr),
WarningFn(nullptr) {}
const char *getPassName() const override { return "MemorySanitizer"; }
bool runOnFunction(Function &F) override;
@@ -287,7 +286,6 @@ class MemorySanitizer : public FunctionPass {
/// \brief Track origins (allocation points) of uninitialized values.
int TrackOrigins;
- const DataLayout *DL;
LLVMContext *C;
Type *IntptrTy;
Type *OriginTy;
@@ -449,10 +447,7 @@ void MemorySanitizer::initializeCallbacks(Module &M) {
///
/// inserts a call to __msan_init to the module's constructor list.
bool MemorySanitizer::doInitialization(Module &M) {
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- if (!DLP)
- report_fatal_error("data layout missing");
- DL = &DLP->getDataLayout();
+ auto &DL = M.getDataLayout();
Triple TargetTriple(M.getTargetTriple());
switch (TargetTriple.getOS()) {
@@ -604,7 +599,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
Value *originToIntptr(IRBuilder<> &IRB, Value *Origin) {
- unsigned IntptrSize = MS.DL->getTypeStoreSize(MS.IntptrTy);
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ unsigned IntptrSize = DL.getTypeStoreSize(MS.IntptrTy);
if (IntptrSize == kOriginSize) return Origin;
assert(IntptrSize == kOriginSize * 2);
Origin = IRB.CreateIntCast(Origin, MS.IntptrTy, /* isSigned */ false);
@@ -614,8 +610,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
/// \brief Fill memory range with the given origin value.
void paintOrigin(IRBuilder<> &IRB, Value *Origin, Value *OriginPtr,
unsigned Size, unsigned Alignment) {
- unsigned IntptrAlignment = MS.DL->getABITypeAlignment(MS.IntptrTy);
- unsigned IntptrSize = MS.DL->getTypeStoreSize(MS.IntptrTy);
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ unsigned IntptrAlignment = DL.getABITypeAlignment(MS.IntptrTy);
+ unsigned IntptrSize = DL.getTypeStoreSize(MS.IntptrTy);
assert(IntptrAlignment >= kMinOriginAlignment);
assert(IntptrSize >= kOriginSize);
@@ -643,8 +640,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
void storeOrigin(IRBuilder<> &IRB, Value *Addr, Value *Shadow, Value *Origin,
unsigned Alignment, bool AsCall) {
+ const DataLayout &DL = F.getParent()->getDataLayout();
unsigned OriginAlignment = std::max(kMinOriginAlignment, Alignment);
- unsigned StoreSize = MS.DL->getTypeStoreSize(Shadow->getType());
+ unsigned StoreSize = DL.getTypeStoreSize(Shadow->getType());
if (isa<StructType>(Shadow->getType())) {
paintOrigin(IRB, updateOrigin(Origin, IRB),
getOriginPtr(Addr, IRB, Alignment), StoreSize,
@@ -661,7 +659,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
unsigned TypeSizeInBits =
- MS.DL->getTypeSizeInBits(ConvertedShadow->getType());
+ DL.getTypeSizeInBits(ConvertedShadow->getType());
unsigned SizeIndex = TypeSizeToSizeIndex(TypeSizeInBits);
if (AsCall && SizeIndex < kNumberOfAccessSizes) {
Value *Fn = MS.MaybeStoreOriginFn[SizeIndex];
@@ -731,8 +729,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return;
}
- unsigned TypeSizeInBits =
- MS.DL->getTypeSizeInBits(ConvertedShadow->getType());
+ const DataLayout &DL = OrigIns->getModule()->getDataLayout();
+
+ unsigned TypeSizeInBits = DL.getTypeSizeInBits(ConvertedShadow->getType());
unsigned SizeIndex = TypeSizeToSizeIndex(TypeSizeInBits);
if (AsCall && SizeIndex < kNumberOfAccessSizes) {
Value *Fn = MS.MaybeWarningFn[SizeIndex];
@@ -772,7 +771,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
/// \brief Add MemorySanitizer instrumentation to a function.
bool runOnFunction() {
MS.initializeCallbacks(*F.getParent());
- if (!MS.DL) return false;
// In the presence of unreachable blocks, we may see Phi nodes with
// incoming nodes from such blocks. Since InstVisitor skips unreachable
@@ -828,8 +826,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// This may return weird-sized types like i1.
if (IntegerType *IT = dyn_cast<IntegerType>(OrigTy))
return IT;
+ const DataLayout &DL = F.getParent()->getDataLayout();
if (VectorType *VT = dyn_cast<VectorType>(OrigTy)) {
- uint32_t EltSize = MS.DL->getTypeSizeInBits(VT->getElementType());
+ uint32_t EltSize = DL.getTypeSizeInBits(VT->getElementType());
return VectorType::get(IntegerType::get(*MS.C, EltSize),
VT->getNumElements());
}
@@ -845,7 +844,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
DEBUG(dbgs() << "getShadowTy: " << *ST << " ===> " << *Res << "\n");
return Res;
}
- uint32_t TypeSize = MS.DL->getTypeSizeInBits(OrigTy);
+ uint32_t TypeSize = DL.getTypeSizeInBits(OrigTy);
return IntegerType::get(*MS.C, TypeSize);
}
@@ -1038,14 +1037,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Function *F = A->getParent();
IRBuilder<> EntryIRB(F->getEntryBlock().getFirstNonPHI());
unsigned ArgOffset = 0;
+ const DataLayout &DL = F->getParent()->getDataLayout();
for (auto &FArg : F->args()) {
if (!FArg.getType()->isSized()) {
DEBUG(dbgs() << "Arg is not sized\n");
continue;
}
- unsigned Size = FArg.hasByValAttr()
- ? MS.DL->getTypeAllocSize(FArg.getType()->getPointerElementType())
- : MS.DL->getTypeAllocSize(FArg.getType());
+ unsigned Size =
+ FArg.hasByValAttr()
+ ? DL.getTypeAllocSize(FArg.getType()->getPointerElementType())
+ : DL.getTypeAllocSize(FArg.getType());
if (A == &FArg) {
bool Overflow = ArgOffset + Size > kParamTLSSize;
Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset);
@@ -1056,7 +1057,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
unsigned ArgAlign = FArg.getParamAlignment();
if (ArgAlign == 0) {
Type *EltType = A->getType()->getPointerElementType();
- ArgAlign = MS.DL->getABITypeAlignment(EltType);
+ ArgAlign = DL.getABITypeAlignment(EltType);
}
if (Overflow) {
// ParamTLS overflow.
@@ -2427,10 +2428,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
DEBUG(dbgs() << " Arg#" << i << ": " << *A <<
" Shadow: " << *ArgShadow << "\n");
bool ArgIsInitialized = false;
+ const DataLayout &DL = F.getParent()->getDataLayout();
if (CS.paramHasAttr(i + 1, Attribute::ByVal)) {
assert(A->getType()->isPointerTy() &&
"ByVal argument is not a pointer!");
- Size = MS.DL->getTypeAllocSize(A->getType()->getPointerElementType());
+ Size = DL.getTypeAllocSize(A->getType()->getPointerElementType());
if (ArgOffset + Size > kParamTLSSize) break;
unsigned ParamAlignment = CS.getParamAlignment(i + 1);
unsigned Alignment = std::min(ParamAlignment, kShadowTLSAlignment);
@@ -2438,7 +2440,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
getShadowPtr(A, Type::getInt8Ty(*MS.C), IRB),
Size, Alignment);
} else {
- Size = MS.DL->getTypeAllocSize(A->getType());
+ Size = DL.getTypeAllocSize(A->getType());
if (ArgOffset + Size > kParamTLSSize) break;
Store = IRB.CreateAlignedStore(ArgShadow, ArgShadowBase,
kShadowTLSAlignment);
@@ -2531,7 +2533,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setShadow(&I, getCleanShadow(&I));
setOrigin(&I, getCleanOrigin());
IRBuilder<> IRB(I.getNextNode());
- uint64_t Size = MS.DL->getTypeAllocSize(I.getAllocatedType());
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ uint64_t Size = DL.getTypeAllocSize(I.getAllocatedType());
if (PoisonStack && ClPoisonStackWithCall) {
IRB.CreateCall2(MS.MsanPoisonStackFn,
IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()),
@@ -2723,6 +2726,7 @@ struct VarArgAMD64Helper : public VarArgHelper {
unsigned GpOffset = 0;
unsigned FpOffset = AMD64GpEndOffset;
unsigned OverflowOffset = AMD64FpEndOffset;
+ const DataLayout &DL = F.getParent()->getDataLayout();
for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end();
ArgIt != End; ++ArgIt) {
Value *A = *ArgIt;
@@ -2732,7 +2736,7 @@ struct VarArgAMD64Helper : public VarArgHelper {
// ByVal arguments always go to the overflow area.
assert(A->getType()->isPointerTy());
Type *RealTy = A->getType()->getPointerElementType();
- uint64_t ArgSize = MS.DL->getTypeAllocSize(RealTy);
+ uint64_t ArgSize = DL.getTypeAllocSize(RealTy);
Value *Base = getShadowPtrForVAArgument(RealTy, IRB, OverflowOffset);
OverflowOffset += RoundUpToAlignment(ArgSize, 8);
IRB.CreateMemCpy(Base, MSV.getShadowPtr(A, IRB.getInt8Ty(), IRB),
@@ -2754,7 +2758,7 @@ struct VarArgAMD64Helper : public VarArgHelper {
FpOffset += 16;
break;
case AK_Memory:
- uint64_t ArgSize = MS.DL->getTypeAllocSize(A->getType());
+ uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
Base = getShadowPtrForVAArgument(A->getType(), IRB, OverflowOffset);
OverflowOffset += RoundUpToAlignment(ArgSize, 8);
}
@@ -2862,11 +2866,12 @@ struct VarArgMIPS64Helper : public VarArgHelper {
void visitCallSite(CallSite &CS, IRBuilder<> &IRB) override {
unsigned VAArgOffset = 0;
+ const DataLayout &DL = F.getParent()->getDataLayout();
for (CallSite::arg_iterator ArgIt = CS.arg_begin() + 1, End = CS.arg_end();
ArgIt != End; ++ArgIt) {
Value *A = *ArgIt;
Value *Base;
- uint64_t ArgSize = MS.DL->getTypeAllocSize(A->getType());
+ uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
#if defined(__MIPSEB__) || defined(MIPSEB)
// Adjusting the shadow for argument with size < 8 to match the placement
// of bits in big endian system
diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index 8c56e87..289675e 100644
--- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -59,6 +59,7 @@ static const char *const kSanCovWithCheckName = "__sanitizer_cov_with_check";
static const char *const kSanCovIndirCallName = "__sanitizer_cov_indir_call16";
static const char *const kSanCovTraceEnter = "__sanitizer_cov_trace_func_enter";
static const char *const kSanCovTraceBB = "__sanitizer_cov_trace_basic_block";
+static const char *const kSanCovTraceCmp = "__sanitizer_cov_trace_cmp";
static const char *const kSanCovModuleCtorName = "sancov.module_ctor";
static const uint64_t kSanCtorAndDtorPriority = 2;
@@ -72,7 +73,7 @@ static cl::opt<unsigned> ClCoverageBlockThreshold(
"sanitizer-coverage-block-threshold",
cl::desc("Use a callback with a guard check inside it if there are"
" more than this number of blocks."),
- cl::Hidden, cl::init(1000));
+ cl::Hidden, cl::init(500));
static cl::opt<bool>
ClExperimentalTracing("sanitizer-coverage-experimental-tracing",
@@ -80,6 +81,22 @@ static cl::opt<bool>
"callbacks at every basic block"),
cl::Hidden, cl::init(false));
+static cl::opt<bool>
+ ClExperimentalCMPTracing("sanitizer-coverage-experimental-trace-compares",
+ cl::desc("Experimental tracing of CMP and similar "
+ "instructions"),
+ cl::Hidden, cl::init(false));
+
+// Experimental 8-bit counters used as an additional search heuristic during
+// coverage-guided fuzzing.
+// The counters are not thread-friendly:
+// - contention on these counters may cause significant slowdown;
+// - the counter updates are racy and the results may be inaccurate.
+// They are also inaccurate due to 8-bit integer overflow.
+static cl::opt<bool> ClUse8bitCounters("sanitizer-coverage-8bit-counters",
+ cl::desc("Experimental 8-bit counters"),
+ cl::Hidden, cl::init(false));
+
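As a sketch of what this mode emits per instrumented basic block (a C++ model of the IR added later in this patch; the helper name is invented for illustration, while __sancov_gen_cov_counter is the array created below):

    #include <cstdint>

    extern "C" uint8_t __sancov_gen_cov_counter[];

    // The pass emits this load/add/store sequence as IR and tags it with
    // "nosanitize" metadata so the sanitizers skip their own bookkeeping.
    static inline void BumpBlockCounter(unsigned BlockIndex) {
      // Racy by design: no atomics, and the 8-bit counter wraps at 256.
      __sancov_gen_cov_counter[BlockIndex] =
          (uint8_t)(__sancov_gen_cov_counter[BlockIndex] + 1);
    }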
namespace {
class SanitizerCoverageModule : public ModulePass {
@@ -94,26 +111,29 @@ class SanitizerCoverageModule : public ModulePass {
return "SanitizerCoverageModule";
}
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<DataLayoutPass>();
- }
-
private:
void InjectCoverageForIndirectCalls(Function &F,
ArrayRef<Instruction *> IndirCalls);
- bool InjectCoverage(Function &F, ArrayRef<BasicBlock *> AllBlocks,
- ArrayRef<Instruction *> IndirCalls);
+ void InjectTraceForCmp(Function &F, ArrayRef<Instruction *> CmpTraceTargets);
+ bool InjectCoverage(Function &F, ArrayRef<BasicBlock *> AllBlocks);
+  void SetNoSanitizeMetadata(Instruction *I);
void InjectCoverageAtBlock(Function &F, BasicBlock &BB, bool UseCalls);
+ unsigned NumberOfInstrumentedBlocks() {
+ return SanCovFunction->getNumUses() + SanCovWithCheckFunction->getNumUses();
+ }
Function *SanCovFunction;
Function *SanCovWithCheckFunction;
Function *SanCovIndirCallFunction;
Function *SanCovModuleInit;
Function *SanCovTraceEnter, *SanCovTraceBB;
+ Function *SanCovTraceCmpFunction;
InlineAsm *EmptyAsm;
- Type *IntptrTy;
+ Type *IntptrTy, *Int64Ty;
LLVMContext *C;
+ const DataLayout *DL;
GlobalVariable *GuardArray;
+ GlobalVariable *EightBitCounterArray;
int CoverageLevel;
};
@@ -133,12 +153,13 @@ static Function *checkInterfaceFunction(Constant *FuncOrBitcast) {
bool SanitizerCoverageModule::runOnModule(Module &M) {
if (!CoverageLevel) return false;
C = &(M.getContext());
- DataLayoutPass *DLP = &getAnalysis<DataLayoutPass>();
- IntptrTy = Type::getIntNTy(*C, DLP->getDataLayout().getPointerSizeInBits());
+ DL = &M.getDataLayout();
+ IntptrTy = Type::getIntNTy(*C, DL->getPointerSizeInBits());
Type *VoidTy = Type::getVoidTy(*C);
IRBuilder<> IRB(*C);
Type *Int8PtrTy = PointerType::getUnqual(IRB.getInt8Ty());
Type *Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty());
+ Int64Ty = IRB.getInt64Ty();
Function *CtorFunc =
Function::Create(FunctionType::get(VoidTy, false),
@@ -152,9 +173,12 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
M.getOrInsertFunction(kSanCovWithCheckName, VoidTy, Int32PtrTy, nullptr));
SanCovIndirCallFunction = checkInterfaceFunction(M.getOrInsertFunction(
kSanCovIndirCallName, VoidTy, IntptrTy, IntptrTy, nullptr));
- SanCovModuleInit = checkInterfaceFunction(
- M.getOrInsertFunction(kSanCovModuleInitName, Type::getVoidTy(*C),
- Int32PtrTy, IntptrTy, Int8PtrTy, nullptr));
+ SanCovTraceCmpFunction = checkInterfaceFunction(M.getOrInsertFunction(
+ kSanCovTraceCmp, VoidTy, Int64Ty, Int64Ty, Int64Ty, nullptr));
+
+ SanCovModuleInit = checkInterfaceFunction(M.getOrInsertFunction(
+ kSanCovModuleInitName, Type::getVoidTy(*C), Int32PtrTy, IntptrTy,
+ Int8PtrTy, Int8PtrTy, nullptr));
SanCovModuleInit->setLinkage(Function::ExternalLinkage);
// We insert an empty inline asm after cov callbacks to avoid callback merge.
EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
@@ -171,26 +195,49 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
// At this point we create a dummy array of guards because we don't
// know how many elements we will need.
Type *Int32Ty = IRB.getInt32Ty();
+ Type *Int8Ty = IRB.getInt8Ty();
+
GuardArray =
new GlobalVariable(M, Int32Ty, false, GlobalValue::ExternalLinkage,
nullptr, "__sancov_gen_cov_tmp");
+ if (ClUse8bitCounters)
+ EightBitCounterArray =
+ new GlobalVariable(M, Int8Ty, false, GlobalVariable::ExternalLinkage,
+ nullptr, "__sancov_gen_cov_tmp");
for (auto &F : M)
runOnFunction(F);
+ auto N = NumberOfInstrumentedBlocks();
+
// Now we know how many elements we need. Create an array of guards
// with one extra element at the beginning for the size.
- Type *Int32ArrayNTy =
- ArrayType::get(Int32Ty, SanCovFunction->getNumUses() + 1);
+ Type *Int32ArrayNTy = ArrayType::get(Int32Ty, N + 1);
GlobalVariable *RealGuardArray = new GlobalVariable(
M, Int32ArrayNTy, false, GlobalValue::PrivateLinkage,
Constant::getNullValue(Int32ArrayNTy), "__sancov_gen_cov");
+
// Replace the dummy array with the real one.
GuardArray->replaceAllUsesWith(
IRB.CreatePointerCast(RealGuardArray, Int32PtrTy));
GuardArray->eraseFromParent();
+ GlobalVariable *RealEightBitCounterArray;
+ if (ClUse8bitCounters) {
+    // Make sure the array is 16-byte-aligned.
+ static const int kCounterAlignment = 16;
+ Type *Int8ArrayNTy =
+ ArrayType::get(Int8Ty, RoundUpToAlignment(N, kCounterAlignment));
+ RealEightBitCounterArray = new GlobalVariable(
+ M, Int8ArrayNTy, false, GlobalValue::PrivateLinkage,
+ Constant::getNullValue(Int8ArrayNTy), "__sancov_gen_cov_counter");
+ RealEightBitCounterArray->setAlignment(kCounterAlignment);
+ EightBitCounterArray->replaceAllUsesWith(
+ IRB.CreatePointerCast(RealEightBitCounterArray, Int8PtrTy));
+ EightBitCounterArray->eraseFromParent();
+ }
+
  // Create a variable for the module (compilation unit) name.
Constant *ModNameStrConst =
ConstantDataArray::getString(M.getContext(), M.getName(), true);
@@ -200,10 +247,13 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
// Call __sanitizer_cov_module_init
IRB.SetInsertPoint(CtorFunc->getEntryBlock().getTerminator());
- IRB.CreateCall3(SanCovModuleInit,
- IRB.CreatePointerCast(RealGuardArray, Int32PtrTy),
- ConstantInt::get(IntptrTy, SanCovFunction->getNumUses()),
- IRB.CreatePointerCast(ModuleName, Int8PtrTy));
+ IRB.CreateCall4(
+ SanCovModuleInit, IRB.CreatePointerCast(RealGuardArray, Int32PtrTy),
+ ConstantInt::get(IntptrTy, N),
+ ClUse8bitCounters
+ ? IRB.CreatePointerCast(RealEightBitCounterArray, Int8PtrTy)
+ : Constant::getNullValue(Int8PtrTy),
+ IRB.CreatePointerCast(ModuleName, Int8PtrTy));
return true;
}
@@ -215,23 +265,28 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) {
SplitAllCriticalEdges(F);
SmallVector<Instruction*, 8> IndirCalls;
SmallVector<BasicBlock*, 16> AllBlocks;
+ SmallVector<Instruction*, 8> CmpTraceTargets;
for (auto &BB : F) {
AllBlocks.push_back(&BB);
- if (CoverageLevel >= 4)
- for (auto &Inst : BB) {
+ for (auto &Inst : BB) {
+ if (CoverageLevel >= 4) {
CallSite CS(&Inst);
if (CS && !CS.getCalledFunction())
IndirCalls.push_back(&Inst);
}
+ if (ClExperimentalCMPTracing)
+ if (isa<ICmpInst>(&Inst))
+ CmpTraceTargets.push_back(&Inst);
+ }
}
- InjectCoverage(F, AllBlocks, IndirCalls);
+ InjectCoverage(F, AllBlocks);
+ InjectCoverageForIndirectCalls(F, IndirCalls);
+ InjectTraceForCmp(F, CmpTraceTargets);
return true;
}
-bool
-SanitizerCoverageModule::InjectCoverage(Function &F,
- ArrayRef<BasicBlock *> AllBlocks,
- ArrayRef<Instruction *> IndirCalls) {
+bool SanitizerCoverageModule::InjectCoverage(Function &F,
+ ArrayRef<BasicBlock *> AllBlocks) {
if (!CoverageLevel) return false;
if (CoverageLevel == 1) {
@@ -241,7 +296,6 @@ SanitizerCoverageModule::InjectCoverage(Function &F,
InjectCoverageAtBlock(F, *BB,
ClCoverageBlockThreshold < AllBlocks.size());
}
- InjectCoverageForIndirectCalls(F, IndirCalls);
return true;
}
@@ -273,6 +327,32 @@ void SanitizerCoverageModule::InjectCoverageForIndirectCalls(
}
}
+void SanitizerCoverageModule::InjectTraceForCmp(
+ Function &F, ArrayRef<Instruction *> CmpTraceTargets) {
+ if (!ClExperimentalCMPTracing) return;
+ for (auto I : CmpTraceTargets) {
+ if (ICmpInst *ICMP = dyn_cast<ICmpInst>(I)) {
+ IRBuilder<> IRB(ICMP);
+ Value *A0 = ICMP->getOperand(0);
+ Value *A1 = ICMP->getOperand(1);
+ if (!A0->getType()->isIntegerTy()) continue;
+ uint64_t TypeSize = DL->getTypeStoreSizeInBits(A0->getType());
+ // __sanitizer_cov_trace_cmp((type_size << 32) | predicate, A0, A1);
+ IRB.CreateCall3(
+ SanCovTraceCmpFunction,
+ ConstantInt::get(Int64Ty, (TypeSize << 32) | ICMP->getPredicate()),
+ IRB.CreateIntCast(A0, Int64Ty, true),
+ IRB.CreateIntCast(A1, Int64Ty, true));
+ }
+ }
+}
+
+void SanitizerCoverageModule::SetNoSanitizeMetada(Instruction *I) {
+ I->setMetadata(
+ I->getParent()->getParent()->getParent()->getMDKindID("nosanitize"),
+ MDNode::get(*C, None));
+}
+
void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
bool UseCalls) {
BasicBlock::iterator IP = BB.getFirstInsertionPt(), BE = BB.end();
@@ -286,14 +366,15 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
}
bool IsEntryBB = &BB == &F.getEntryBlock();
- DebugLoc EntryLoc =
- IsEntryBB ? IP->getDebugLoc().getFnDebugLoc(*C) : IP->getDebugLoc();
+ DebugLoc EntryLoc = IsEntryBB && !IP->getDebugLoc().isUnknown()
+ ? IP->getDebugLoc().getFnDebugLoc(*C)
+ : IP->getDebugLoc();
IRBuilder<> IRB(IP);
IRB.SetCurrentDebugLocation(EntryLoc);
SmallVector<Value *, 1> Indices;
Value *GuardP = IRB.CreateAdd(
IRB.CreatePointerCast(GuardArray, IntptrTy),
- ConstantInt::get(IntptrTy, (1 + SanCovFunction->getNumUses()) * 4));
+ ConstantInt::get(IntptrTy, (1 + NumberOfInstrumentedBlocks()) * 4));
Type *Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty());
GuardP = IRB.CreateIntToPtr(GuardP, Int32PtrTy);
if (UseCalls) {
@@ -302,8 +383,7 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
LoadInst *Load = IRB.CreateLoad(GuardP);
Load->setAtomic(Monotonic);
Load->setAlignment(4);
- Load->setMetadata(F.getParent()->getMDKindID("nosanitize"),
- MDNode::get(*C, None));
+ SetNoSanitizeMetada(Load);
Value *Cmp = IRB.CreateICmpSGE(Constant::getNullValue(Load->getType()), Load);
Instruction *Ins = SplitBlockAndInsertIfThen(
Cmp, IP, false, MDBuilder(*C).createBranchWeights(1, 100000));
@@ -314,6 +394,19 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
IRB.CreateCall(EmptyAsm); // Avoids callback merge.
}
+ if (ClUse8bitCounters) {
+ IRB.SetInsertPoint(IP);
+ Value *P = IRB.CreateAdd(
+ IRB.CreatePointerCast(EightBitCounterArray, IntptrTy),
+ ConstantInt::get(IntptrTy, NumberOfInstrumentedBlocks() - 1));
+ P = IRB.CreateIntToPtr(P, IRB.getInt8PtrTy());
+ LoadInst *LI = IRB.CreateLoad(P);
+ Value *Inc = IRB.CreateAdd(LI, ConstantInt::get(IRB.getInt8Ty(), 1));
+ StoreInst *SI = IRB.CreateStore(Inc, P);
+ SetNoSanitizeMetada(LI);
+ SetNoSanitizeMetada(SI);
+ }
+
if (ClExperimentalTracing) {
// Experimental support for tracing.
// Insert a callback with the same guard variable as used for coverage.
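Before moving on to the ThreadSanitizer changes: the cmp-tracing hunk above packs the operand width and the ICmp predicate into a single i64. Below is a sketch of how a consumer could unpack that argument, assuming the three-i64 void signature the pass declares for __sanitizer_cov_trace_cmp; the body here is hypothetical, not the compiler-rt implementation.

#include <cstdint>
#include <cstdio>

// Hypothetical runtime-side decoder for the (type_size << 32) | predicate
// encoding built in InjectTraceForCmp above.
extern "C" void __sanitizer_cov_trace_cmp(uint64_t SizeAndPred, uint64_t Arg1,
                                          uint64_t Arg2) {
  uint64_t TypeSizeBits = SizeAndPred >> 32;        // e.g. 32 for an i32 cmp
  uint64_t Predicate = SizeAndPred & 0xffffffffULL; // llvm::CmpInst value
  std::printf("cmp%llu pred=%llu: %llu vs %llu\n",
              (unsigned long long)TypeSizeBits,
              (unsigned long long)Predicate,
              (unsigned long long)Arg1, (unsigned long long)Arg2);
}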
diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index e4a4911..c3ba722 100644
--- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -19,14 +19,14 @@
// The rest is handled by the run-time library.
//===----------------------------------------------------------------------===//
-#include "llvm/Analysis/CaptureTracking.h"
-#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
@@ -76,7 +76,7 @@ namespace {
/// ThreadSanitizer: instrument the code in module to find races.
struct ThreadSanitizer : public FunctionPass {
- ThreadSanitizer() : FunctionPass(ID), DL(nullptr) {}
+ ThreadSanitizer() : FunctionPass(ID) {}
const char *getPassName() const override;
bool runOnFunction(Function &F) override;
bool doInitialization(Module &M) override;
@@ -84,15 +84,15 @@ struct ThreadSanitizer : public FunctionPass {
private:
void initializeCallbacks(Module &M);
- bool instrumentLoadOrStore(Instruction *I);
- bool instrumentAtomic(Instruction *I);
+ bool instrumentLoadOrStore(Instruction *I, const DataLayout &DL);
+ bool instrumentAtomic(Instruction *I, const DataLayout &DL);
bool instrumentMemIntrinsic(Instruction *I);
- void chooseInstructionsToInstrument(SmallVectorImpl<Instruction*> &Local,
- SmallVectorImpl<Instruction*> &All);
+ void chooseInstructionsToInstrument(SmallVectorImpl<Instruction *> &Local,
+ SmallVectorImpl<Instruction *> &All,
+ const DataLayout &DL);
bool addrPointsToConstantData(Value *Addr);
- int getMemoryAccessFuncIndex(Value *Addr);
+ int getMemoryAccessFuncIndex(Value *Addr, const DataLayout &DL);
- const DataLayout *DL;
Type *IntptrTy;
IntegerType *OrdTy;
// Callbacks to run-time library are computed in doInitialization.
@@ -230,10 +230,7 @@ void ThreadSanitizer::initializeCallbacks(Module &M) {
}
bool ThreadSanitizer::doInitialization(Module &M) {
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- if (!DLP)
- report_fatal_error("data layout missing");
- DL = &DLP->getDataLayout();
+ const DataLayout &DL = M.getDataLayout();
// Always insert a call to __tsan_init into the module's CTORs.
IRBuilder<> IRB(M.getContext());
@@ -285,8 +282,8 @@ bool ThreadSanitizer::addrPointsToConstantData(Value *Addr) {
// 'Local' is a vector of insns within the same BB (no calls between).
// 'All' is a vector of insns that will be instrumented.
void ThreadSanitizer::chooseInstructionsToInstrument(
- SmallVectorImpl<Instruction*> &Local,
- SmallVectorImpl<Instruction*> &All) {
+ SmallVectorImpl<Instruction *> &Local, SmallVectorImpl<Instruction *> &All,
+ const DataLayout &DL) {
SmallSet<Value*, 8> WriteTargets;
// Iterate from the end.
for (SmallVectorImpl<Instruction*>::reverse_iterator It = Local.rbegin(),
@@ -310,7 +307,7 @@ void ThreadSanitizer::chooseInstructionsToInstrument(
Value *Addr = isa<StoreInst>(*I)
? cast<StoreInst>(I)->getPointerOperand()
: cast<LoadInst>(I)->getPointerOperand();
- if (isa<AllocaInst>(GetUnderlyingObject(Addr, nullptr)) &&
+ if (isa<AllocaInst>(GetUnderlyingObject(Addr, DL)) &&
!PointerMayBeCaptured(Addr, true, true)) {
// The variable is addressable but not captured, so it cannot be
// referenced from a different thread and participate in a data race
@@ -338,7 +335,6 @@ static bool isAtomic(Instruction *I) {
}
bool ThreadSanitizer::runOnFunction(Function &F) {
- if (!DL) return false;
initializeCallbacks(*F.getParent());
SmallVector<Instruction*, 8> RetVec;
SmallVector<Instruction*, 8> AllLoadsAndStores;
@@ -348,6 +344,7 @@ bool ThreadSanitizer::runOnFunction(Function &F) {
bool Res = false;
bool HasCalls = false;
bool SanitizeFunction = F.hasFnAttribute(Attribute::SanitizeThread);
+ const DataLayout &DL = F.getParent()->getDataLayout();
// Traverse all instructions, collect loads/stores/returns, check for calls.
for (auto &BB : F) {
@@ -362,10 +359,11 @@ bool ThreadSanitizer::runOnFunction(Function &F) {
if (isa<MemIntrinsic>(Inst))
MemIntrinCalls.push_back(&Inst);
HasCalls = true;
- chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores);
+ chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores,
+ DL);
}
}
- chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores);
+ chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores, DL);
}
// We have collected all loads and stores.
@@ -375,14 +373,14 @@ bool ThreadSanitizer::runOnFunction(Function &F) {
// Instrument memory accesses only if we want to report bugs in the function.
if (ClInstrumentMemoryAccesses && SanitizeFunction)
for (auto Inst : AllLoadsAndStores) {
- Res |= instrumentLoadOrStore(Inst);
+ Res |= instrumentLoadOrStore(Inst, DL);
}
// Instrument atomic memory accesses in any case (they can be used to
// implement synchronization).
if (ClInstrumentAtomics)
for (auto Inst : AtomicAccesses) {
- Res |= instrumentAtomic(Inst);
+ Res |= instrumentAtomic(Inst, DL);
}
if (ClInstrumentMemIntrinsics && SanitizeFunction)
@@ -406,13 +404,14 @@ bool ThreadSanitizer::runOnFunction(Function &F) {
return Res;
}
-bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I) {
+bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I,
+ const DataLayout &DL) {
IRBuilder<> IRB(I);
bool IsWrite = isa<StoreInst>(*I);
Value *Addr = IsWrite
? cast<StoreInst>(I)->getPointerOperand()
: cast<LoadInst>(I)->getPointerOperand();
- int Idx = getMemoryAccessFuncIndex(Addr);
+ int Idx = getMemoryAccessFuncIndex(Addr, DL);
if (Idx < 0)
return false;
if (IsWrite && isVtableAccess(I)) {
@@ -443,7 +442,7 @@ bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I) {
? cast<StoreInst>(I)->getAlignment()
: cast<LoadInst>(I)->getAlignment();
Type *OrigTy = cast<PointerType>(Addr->getType())->getElementType();
- const uint32_t TypeSize = DL->getTypeStoreSizeInBits(OrigTy);
+ const uint32_t TypeSize = DL.getTypeStoreSizeInBits(OrigTy);
Value *OnAccessFunc = nullptr;
if (Alignment == 0 || Alignment >= 8 || (Alignment % (TypeSize / 8)) == 0)
OnAccessFunc = IsWrite ? TsanWrite[Idx] : TsanRead[Idx];
@@ -504,11 +503,11 @@ bool ThreadSanitizer::instrumentMemIntrinsic(Instruction *I) {
// The following page contains more background information:
// http://www.hpl.hp.com/personal/Hans_Boehm/c++mm/
-bool ThreadSanitizer::instrumentAtomic(Instruction *I) {
+bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) {
IRBuilder<> IRB(I);
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
Value *Addr = LI->getPointerOperand();
- int Idx = getMemoryAccessFuncIndex(Addr);
+ int Idx = getMemoryAccessFuncIndex(Addr, DL);
if (Idx < 0)
return false;
const size_t ByteSize = 1 << Idx;
@@ -522,7 +521,7 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I) {
} else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
Value *Addr = SI->getPointerOperand();
- int Idx = getMemoryAccessFuncIndex(Addr);
+ int Idx = getMemoryAccessFuncIndex(Addr, DL);
if (Idx < 0)
return false;
const size_t ByteSize = 1 << Idx;
@@ -536,7 +535,7 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I) {
ReplaceInstWithInst(I, C);
} else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I)) {
Value *Addr = RMWI->getPointerOperand();
- int Idx = getMemoryAccessFuncIndex(Addr);
+ int Idx = getMemoryAccessFuncIndex(Addr, DL);
if (Idx < 0)
return false;
Function *F = TsanAtomicRMW[RMWI->getOperation()][Idx];
@@ -553,7 +552,7 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I) {
ReplaceInstWithInst(I, C);
} else if (AtomicCmpXchgInst *CASI = dyn_cast<AtomicCmpXchgInst>(I)) {
Value *Addr = CASI->getPointerOperand();
- int Idx = getMemoryAccessFuncIndex(Addr);
+ int Idx = getMemoryAccessFuncIndex(Addr, DL);
if (Idx < 0)
return false;
const size_t ByteSize = 1 << Idx;
@@ -583,11 +582,12 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I) {
return true;
}
-int ThreadSanitizer::getMemoryAccessFuncIndex(Value *Addr) {
+int ThreadSanitizer::getMemoryAccessFuncIndex(Value *Addr,
+ const DataLayout &DL) {
Type *OrigPtrTy = Addr->getType();
Type *OrigTy = cast<PointerType>(OrigPtrTy)->getElementType();
assert(OrigTy->isSized());
- uint32_t TypeSize = DL->getTypeStoreSizeInBits(OrigTy);
+ uint32_t TypeSize = DL.getTypeStoreSizeInBits(OrigTy);
if (TypeSize != 8 && TypeSize != 16 &&
TypeSize != 32 && TypeSize != 64 && TypeSize != 128) {
NumAccessesWithBadSize++;
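The recurring getMemoryAccessFuncIndex(Addr, DL) calls above map an access's store size to an index into the Tsan callback tables. A standalone model of that mapping follows; it is an illustration under stated assumptions, not the pass's exact implementation.

#include <cstdint>

// Model of the size-to-index mapping: only power-of-two sizes from 1 to 16
// bytes are instrumented, and Idx is log2 of the byte size, which is why the
// callers above can recover ByteSize as 1 << Idx.
static int memoryAccessFuncIndex(uint32_t TypeSizeInBits) {
  switch (TypeSizeInBits) {
  case 8:   return 0;  // 1-byte access -> TsanRead[0]/TsanWrite[0]
  case 16:  return 1;  // 2-byte access
  case 32:  return 2;  // 4-byte access
  case 64:  return 3;  // 8-byte access
  case 128: return 4;  // 16-byte access
  default:  return -1; // unsupported size; caller skips instrumentation
  }
}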
diff --git a/lib/Transforms/ObjCARC/ARCInstKind.cpp b/lib/Transforms/ObjCARC/ARCInstKind.cpp
index f1e9dce..72df9ab 100644
--- a/lib/Transforms/ObjCARC/ARCInstKind.cpp
+++ b/lib/Transforms/ObjCARC/ARCInstKind.cpp
@@ -168,6 +168,60 @@ ARCInstKind llvm::objcarc::GetFunctionClass(const Function *F) {
return ARCInstKind::CallOrUser;
}
+// A whitelist of intrinsics that we know do not use objc pointers or decrement
+// ref counts.
+static bool isInertIntrinsic(unsigned ID) {
+ // TODO: Make this into a covered switch.
+ switch (ID) {
+ case Intrinsic::returnaddress:
+ case Intrinsic::frameaddress:
+ case Intrinsic::stacksave:
+ case Intrinsic::stackrestore:
+ case Intrinsic::vastart:
+ case Intrinsic::vacopy:
+ case Intrinsic::vaend:
+ case Intrinsic::objectsize:
+ case Intrinsic::prefetch:
+ case Intrinsic::stackprotector:
+ case Intrinsic::eh_return_i32:
+ case Intrinsic::eh_return_i64:
+ case Intrinsic::eh_typeid_for:
+ case Intrinsic::eh_dwarf_cfa:
+ case Intrinsic::eh_sjlj_lsda:
+ case Intrinsic::eh_sjlj_functioncontext:
+ case Intrinsic::init_trampoline:
+ case Intrinsic::adjust_trampoline:
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ case Intrinsic::invariant_start:
+ case Intrinsic::invariant_end:
+ // Don't let dbg info affect our results.
+ case Intrinsic::dbg_declare:
+ case Intrinsic::dbg_value:
+ // Short cut: Some intrinsics obviously don't use ObjC pointers.
+ return true;
+ default:
+ return false;
+ }
+}
+
+// A whitelist of intrinsics that we know may use objc pointers but never
+// decrement ref counts.
+static bool isUseOnlyIntrinsic(unsigned ID) {
+ // Even though these intrinsics are unlikely to touch reference counts, we
+ // conservatively whitelist them for safety.
+ //
+ // TODO: Expand this into a covered switch. There is a lot more here.
+ switch (ID) {
+ case Intrinsic::memcpy:
+ case Intrinsic::memmove:
+ case Intrinsic::memset:
+ return true;
+ default:
+ return false;
+ }
+}
+
/// \brief Determine what kind of construct V is.
ARCInstKind llvm::objcarc::GetARCInstKind(const Value *V) {
if (const Instruction *I = dyn_cast<Instruction>(V)) {
@@ -180,49 +234,23 @@ ARCInstKind llvm::objcarc::GetARCInstKind(const Value *V) {
switch (I->getOpcode()) {
case Instruction::Call: {
const CallInst *CI = cast<CallInst>(I);
- // Check for calls to special functions.
+ // See if we have a function that we know something about.
if (const Function *F = CI->getCalledFunction()) {
ARCInstKind Class = GetFunctionClass(F);
if (Class != ARCInstKind::CallOrUser)
return Class;
-
- // None of the intrinsic functions do objc_release. For intrinsics, the
- // only question is whether or not they may be users.
- switch (F->getIntrinsicID()) {
- case Intrinsic::returnaddress:
- case Intrinsic::frameaddress:
- case Intrinsic::stacksave:
- case Intrinsic::stackrestore:
- case Intrinsic::vastart:
- case Intrinsic::vacopy:
- case Intrinsic::vaend:
- case Intrinsic::objectsize:
- case Intrinsic::prefetch:
- case Intrinsic::stackprotector:
- case Intrinsic::eh_return_i32:
- case Intrinsic::eh_return_i64:
- case Intrinsic::eh_typeid_for:
- case Intrinsic::eh_dwarf_cfa:
- case Intrinsic::eh_sjlj_lsda:
- case Intrinsic::eh_sjlj_functioncontext:
- case Intrinsic::init_trampoline:
- case Intrinsic::adjust_trampoline:
- case Intrinsic::lifetime_start:
- case Intrinsic::lifetime_end:
- case Intrinsic::invariant_start:
- case Intrinsic::invariant_end:
- // Don't let dbg info affect our results.
- case Intrinsic::dbg_declare:
- case Intrinsic::dbg_value:
- // Short cut: Some intrinsics obviously don't use ObjC pointers.
+ unsigned ID = F->getIntrinsicID();
+ if (isInertIntrinsic(ID))
return ARCInstKind::None;
- default:
- break;
- }
+ if (isUseOnlyIntrinsic(ID))
+ return ARCInstKind::User;
}
+
+ // Otherwise, be conservative.
return GetCallSiteClass(CI);
}
case Instruction::Invoke:
+ // Otherwise, be conservative.
return GetCallSiteClass(cast<InvokeInst>(I));
case Instruction::BitCast:
case Instruction::GetElementPtr:
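A toy illustration (plain C++, not LLVM API) of why isUseOnlyIntrinsic is kept separate from isInertIntrinsic: memcpy and friends can read and propagate an ObjC pointer value, so they must be classified as users, but they never decrement a reference count.

#include <cstring>

struct ObjCSlot { void *Obj; };

// Copying the slot propagates the pointer value (a "use" in ARC terms) but
// performs no objc_release, so ref counts are untouched.
void copySlot(ObjCSlot &Dst, const ObjCSlot &Src) {
  std::memcpy(&Dst, &Src, sizeof(ObjCSlot));
}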
diff --git a/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h b/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
index e286dbc..87de33b 100644
--- a/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
+++ b/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
@@ -27,22 +27,22 @@
namespace llvm {
namespace objcarc {
+enum class ARCRuntimeEntryPointKind {
+ AutoreleaseRV,
+ Release,
+ Retain,
+ RetainBlock,
+ Autorelease,
+ StoreStrong,
+ RetainRV,
+ RetainAutorelease,
+ RetainAutoreleaseRV,
+};
+
/// Declarations for ObjC runtime functions and constants. These are initialized
/// lazily to avoid cluttering up the Module with unused declarations.
class ARCRuntimeEntryPoints {
public:
- enum EntryPointType {
- EPT_AutoreleaseRV,
- EPT_Release,
- EPT_Retain,
- EPT_RetainBlock,
- EPT_Autorelease,
- EPT_StoreStrong,
- EPT_RetainRV,
- EPT_RetainAutorelease,
- EPT_RetainAutoreleaseRV
- };
-
ARCRuntimeEntryPoints() : TheModule(nullptr),
AutoreleaseRV(nullptr),
Release(nullptr),
@@ -56,7 +56,7 @@ public:
~ARCRuntimeEntryPoints() { }
- void Initialize(Module *M) {
+ void init(Module *M) {
TheModule = M;
AutoreleaseRV = nullptr;
Release = nullptr;
@@ -69,30 +69,30 @@ public:
RetainAutoreleaseRV = nullptr;
}
- Constant *get(const EntryPointType entry) {
+ Constant *get(ARCRuntimeEntryPointKind kind) {
assert(TheModule != nullptr && "Not initialized.");
- switch (entry) {
- case EPT_AutoreleaseRV:
+ switch (kind) {
+ case ARCRuntimeEntryPointKind::AutoreleaseRV:
return getI8XRetI8XEntryPoint(AutoreleaseRV,
"objc_autoreleaseReturnValue", true);
- case EPT_Release:
+ case ARCRuntimeEntryPointKind::Release:
return getVoidRetI8XEntryPoint(Release, "objc_release");
- case EPT_Retain:
+ case ARCRuntimeEntryPointKind::Retain:
return getI8XRetI8XEntryPoint(Retain, "objc_retain", true);
- case EPT_RetainBlock:
+ case ARCRuntimeEntryPointKind::RetainBlock:
return getI8XRetI8XEntryPoint(RetainBlock, "objc_retainBlock", false);
- case EPT_Autorelease:
+ case ARCRuntimeEntryPointKind::Autorelease:
return getI8XRetI8XEntryPoint(Autorelease, "objc_autorelease", true);
- case EPT_StoreStrong:
+ case ARCRuntimeEntryPointKind::StoreStrong:
return getI8XRetI8XXI8XEntryPoint(StoreStrong, "objc_storeStrong");
- case EPT_RetainRV:
+ case ARCRuntimeEntryPointKind::RetainRV:
return getI8XRetI8XEntryPoint(RetainRV,
"objc_retainAutoreleasedReturnValue", true);
- case EPT_RetainAutorelease:
+ case ARCRuntimeEntryPointKind::RetainAutorelease:
return getI8XRetI8XEntryPoint(RetainAutorelease, "objc_retainAutorelease",
true);
- case EPT_RetainAutoreleaseRV:
+ case ARCRuntimeEntryPointKind::RetainAutoreleaseRV:
return getI8XRetI8XEntryPoint(RetainAutoreleaseRV,
"objc_retainAutoreleaseReturnValue", true);
}
diff --git a/lib/Transforms/ObjCARC/Android.mk b/lib/Transforms/ObjCARC/Android.mk
index 97c5a9d..e120fbe 100644
--- a/lib/Transforms/ObjCARC/Android.mk
+++ b/lib/Transforms/ObjCARC/Android.mk
@@ -9,6 +9,7 @@ transforms_objcarc_SRC_FILES := \
ObjCARC.cpp \
ObjCARCExpand.cpp \
ObjCARCOpts.cpp \
+ PtrState.cpp \
ProvenanceAnalysis.cpp \
ProvenanceAnalysisEvaluator.cpp
diff --git a/lib/Transforms/ObjCARC/BlotMapVector.h b/lib/Transforms/ObjCARC/BlotMapVector.h
new file mode 100644
index 0000000..d6439b6
--- /dev/null
+++ b/lib/Transforms/ObjCARC/BlotMapVector.h
@@ -0,0 +1,108 @@
+//===- BlotMapVector.h - A MapVector with the blot operation -*- C++ -*----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include <algorithm>
+#include <vector>
+
+namespace llvm {
+/// \brief An associative container with fast insertion-order (deterministic)
+/// iteration over its elements. Plus the special blot operation.
+template <class KeyT, class ValueT> class BlotMapVector {
+ /// Map keys to indices in Vector.
+ typedef DenseMap<KeyT, size_t> MapTy;
+ MapTy Map;
+
+ typedef std::vector<std::pair<KeyT, ValueT>> VectorTy;
+ /// Keys and values.
+ VectorTy Vector;
+
+public:
+ typedef typename VectorTy::iterator iterator;
+ typedef typename VectorTy::const_iterator const_iterator;
+ iterator begin() { return Vector.begin(); }
+ iterator end() { return Vector.end(); }
+ const_iterator begin() const { return Vector.begin(); }
+ const_iterator end() const { return Vector.end(); }
+
+#ifdef XDEBUG
+ ~BlotMapVector() {
+ assert(Vector.size() >= Map.size()); // May differ due to blotting.
+ for (typename MapTy::const_iterator I = Map.begin(), E = Map.end(); I != E;
+ ++I) {
+ assert(I->second < Vector.size());
+ assert(Vector[I->second].first == I->first);
+ }
+ for (typename VectorTy::const_iterator I = Vector.begin(), E = Vector.end();
+ I != E; ++I)
+ assert(!I->first || (Map.count(I->first) &&
+ Map[I->first] == size_t(I - Vector.begin())));
+ }
+#endif
+
+ ValueT &operator[](const KeyT &Arg) {
+ std::pair<typename MapTy::iterator, bool> Pair =
+ Map.insert(std::make_pair(Arg, size_t(0)));
+ if (Pair.second) {
+ size_t Num = Vector.size();
+ Pair.first->second = Num;
+ Vector.push_back(std::make_pair(Arg, ValueT()));
+ return Vector[Num].second;
+ }
+ return Vector[Pair.first->second].second;
+ }
+
+ std::pair<iterator, bool> insert(const std::pair<KeyT, ValueT> &InsertPair) {
+ std::pair<typename MapTy::iterator, bool> Pair =
+ Map.insert(std::make_pair(InsertPair.first, size_t(0)));
+ if (Pair.second) {
+ size_t Num = Vector.size();
+ Pair.first->second = Num;
+ Vector.push_back(InsertPair);
+ return std::make_pair(Vector.begin() + Num, true);
+ }
+ return std::make_pair(Vector.begin() + Pair.first->second, false);
+ }
+
+ iterator find(const KeyT &Key) {
+ typename MapTy::iterator It = Map.find(Key);
+ if (It == Map.end())
+ return Vector.end();
+ return Vector.begin() + It->second;
+ }
+
+ const_iterator find(const KeyT &Key) const {
+ typename MapTy::const_iterator It = Map.find(Key);
+ if (It == Map.end())
+ return Vector.end();
+ return Vector.begin() + It->second;
+ }
+
+ /// This is similar to erase, but instead of removing the element from the
+ /// vector, it just zeros out the key in the vector. This leaves iterators
+ /// intact, but clients must be prepared for zeroed-out keys when iterating.
+ void blot(const KeyT &Key) {
+ typename MapTy::iterator It = Map.find(Key);
+ if (It == Map.end())
+ return;
+ Vector[It->second].first = KeyT();
+ Map.erase(It);
+ }
+
+ void clear() {
+ Map.clear();
+ Vector.clear();
+ }
+
+ bool empty() const {
+ assert(Map.empty() == Vector.empty());
+ return Map.empty();
+ }
+};
+} // end namespace llvm
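A short usage sketch of the blot semantics (assuming BlotMapVector.h is reachable on the include path): blot() zeroes the key in place instead of erasing the vector element, so iterators stay valid and traversals must skip null keys.

#include "BlotMapVector.h"
#include <cassert>

void blotDemo() {
  llvm::BlotMapVector<int *, int> MV;
  int A = 0, B = 0;
  MV[&A] = 1;
  MV[&B] = 2;
  MV.blot(&A);                     // slot stays in the vector, key goes null
  assert(MV.find(&A) == MV.end()); // but map lookups no longer see it
  for (auto &Entry : MV) {
    if (!Entry.first)              // skip blotted entries while iterating
      continue;
    assert(Entry.first == &B && Entry.second == 2);
  }
}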
diff --git a/lib/Transforms/ObjCARC/CMakeLists.txt b/lib/Transforms/ObjCARC/CMakeLists.txt
index 2adea88..fbcae29 100644
--- a/lib/Transforms/ObjCARC/CMakeLists.txt
+++ b/lib/Transforms/ObjCARC/CMakeLists.txt
@@ -9,6 +9,7 @@ add_llvm_library(LLVMObjCARCOpts
DependencyAnalysis.cpp
ProvenanceAnalysis.cpp
ProvenanceAnalysisEvaluator.cpp
+ PtrState.cpp
ADDITIONAL_HEADER_DIRS
${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms
diff --git a/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
index 4985d0e..b197c97 100644
--- a/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
+++ b/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
@@ -53,10 +53,12 @@ bool llvm::objcarc::CanAlterRefCount(const Instruction *Inst, const Value *Ptr,
if (AliasAnalysis::onlyReadsMemory(MRB))
return false;
if (AliasAnalysis::onlyAccessesArgPointees(MRB)) {
+ const DataLayout &DL = Inst->getModule()->getDataLayout();
for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
I != E; ++I) {
const Value *Op = *I;
- if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Ptr, Op))
+ if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) &&
+ PA.related(Ptr, Op, DL))
return true;
}
return false;
@@ -87,6 +89,8 @@ bool llvm::objcarc::CanUse(const Instruction *Inst, const Value *Ptr,
if (Class == ARCInstKind::Call)
return false;
+ const DataLayout &DL = Inst->getModule()->getDataLayout();
+
// Consider various instructions which may have pointer arguments which are
// not "uses".
if (const ICmpInst *ICI = dyn_cast<ICmpInst>(Inst)) {
@@ -100,24 +104,26 @@ bool llvm::objcarc::CanUse(const Instruction *Inst, const Value *Ptr,
for (ImmutableCallSite::arg_iterator OI = CS.arg_begin(),
OE = CS.arg_end(); OI != OE; ++OI) {
const Value *Op = *OI;
- if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Ptr, Op))
+ if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) &&
+ PA.related(Ptr, Op, DL))
return true;
}
return false;
} else if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
// Special-case stores, because we don't care about the stored value, just
// the store address.
- const Value *Op = GetUnderlyingObjCPtr(SI->getPointerOperand());
+ const Value *Op = GetUnderlyingObjCPtr(SI->getPointerOperand(), DL);
// If we can't tell what the underlying object was, assume there is a
// dependence.
- return IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Op, Ptr);
+ return IsPotentialRetainableObjPtr(Op, *PA.getAA()) &&
+ PA.related(Op, Ptr, DL);
}
// Check each operand for a match.
for (User::const_op_iterator OI = Inst->op_begin(), OE = Inst->op_end();
OI != OE; ++OI) {
const Value *Op = *OI;
- if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Ptr, Op))
+ if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Ptr, Op, DL))
return true;
}
return false;
diff --git a/lib/Transforms/ObjCARC/ObjCARC.h b/lib/Transforms/ObjCARC/ObjCARC.h
index df29f05..7595e2d 100644
--- a/lib/Transforms/ObjCARC/ObjCARC.h
+++ b/lib/Transforms/ObjCARC/ObjCARC.h
@@ -24,6 +24,7 @@
#define LLVM_LIB_TRANSFORMS_OBJCARC_OBJCARC_H
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -72,9 +73,10 @@ static inline bool ModuleHasARC(const Module &M) {
/// \brief This is a wrapper around getUnderlyingObject which also knows how to
/// look through objc_retain and objc_autorelease calls, which we know to return
/// their argument verbatim.
-static inline const Value *GetUnderlyingObjCPtr(const Value *V) {
+static inline const Value *GetUnderlyingObjCPtr(const Value *V,
+ const DataLayout &DL) {
for (;;) {
- V = GetUnderlyingObject(V);
+ V = GetUnderlyingObject(V, DL);
if (!IsForwarding(GetBasicARCInstKind(V)))
break;
V = cast<CallInst>(V)->getArgOperand(0);
@@ -257,6 +259,55 @@ static inline bool IsObjCIdentifiedObject(const Value *V) {
return false;
}
+enum class ARCMDKindID {
+ ImpreciseRelease,
+ CopyOnEscape,
+ NoObjCARCExceptions,
+};
+
+/// A cache of MDKinds used by various ARC optimizations.
+class ARCMDKindCache {
+ Module *M;
+
+ /// The Metadata Kind for clang.imprecise_release metadata.
+ llvm::Optional<unsigned> ImpreciseReleaseMDKind;
+
+ /// The Metadata Kind for clang.arc.copy_on_escape metadata.
+ llvm::Optional<unsigned> CopyOnEscapeMDKind;
+
+ /// The Metadata Kind for clang.arc.no_objc_arc_exceptions metadata.
+ llvm::Optional<unsigned> NoObjCARCExceptionsMDKind;
+
+public:
+ void init(Module *Mod) {
+ M = Mod;
+ ImpreciseReleaseMDKind = NoneType::None;
+ CopyOnEscapeMDKind = NoneType::None;
+ NoObjCARCExceptionsMDKind = NoneType::None;
+ }
+
+ unsigned get(ARCMDKindID ID) {
+ switch (ID) {
+ case ARCMDKindID::ImpreciseRelease:
+ if (!ImpreciseReleaseMDKind)
+ ImpreciseReleaseMDKind =
+ M->getContext().getMDKindID("clang.imprecise_release");
+ return *ImpreciseReleaseMDKind;
+ case ARCMDKindID::CopyOnEscape:
+ if (!CopyOnEscapeMDKind)
+ CopyOnEscapeMDKind =
+ M->getContext().getMDKindID("clang.arc.copy_on_escape");
+ return *CopyOnEscapeMDKind;
+ case ARCMDKindID::NoObjCARCExceptions:
+ if (!NoObjCARCExceptionsMDKind)
+ NoObjCARCExceptionsMDKind =
+ M->getContext().getMDKindID("clang.arc.no_objc_arc_exceptions");
+ return *NoObjCARCExceptionsMDKind;
+ }
+ llvm_unreachable("Covered switch isn't covered?!");
+ }
+};
+
} // end namespace objcarc
} // end namespace llvm
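A hypothetical caller of the new ARCMDKindCache, assuming llvm::objcarc scope with a Module M and an Instruction *Inst in hand (both are placeholders here): each kind string is resolved at most once after init() and memoized in the Optional fields.

ARCMDKindCache Cache;
Cache.init(&M);
unsigned ReleaseKind = Cache.get(ARCMDKindID::ImpreciseRelease);
if (MDNode *ReleaseMD = Inst->getMetadata(ReleaseKind)) {
  // Inst carries clang.imprecise_release metadata.
  (void)ReleaseMD;
}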
diff --git a/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp b/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp
index be291a0..b1515e3 100644
--- a/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp
@@ -46,6 +46,11 @@ ImmutablePass *llvm::createObjCARCAliasAnalysisPass() {
return new ObjCARCAliasAnalysis();
}
+bool ObjCARCAliasAnalysis::doInitialization(Module &M) {
+ InitializeAliasAnalysis(this, &M.getDataLayout());
+ return true;
+}
+
void
ObjCARCAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
@@ -69,8 +74,8 @@ ObjCARCAliasAnalysis::alias(const Location &LocA, const Location &LocB) {
// If that failed, climb to the underlying object, including climbing through
// ObjC-specific no-ops, and try making an imprecise alias query.
- const Value *UA = GetUnderlyingObjCPtr(SA);
- const Value *UB = GetUnderlyingObjCPtr(SB);
+ const Value *UA = GetUnderlyingObjCPtr(SA, *DL);
+ const Value *UB = GetUnderlyingObjCPtr(SB, *DL);
if (UA != SA || UB != SB) {
Result = AliasAnalysis::alias(Location(UA), Location(UB));
// We can't use MustAlias or PartialAlias results here because
@@ -99,7 +104,7 @@ ObjCARCAliasAnalysis::pointsToConstantMemory(const Location &Loc,
// If that failed, climb to the underlying object, including climbing through
// ObjC-specific no-ops, and try making an imprecise alias query.
- const Value *U = GetUnderlyingObjCPtr(S);
+ const Value *U = GetUnderlyingObjCPtr(S, *DL);
if (U != S)
return AliasAnalysis::pointsToConstantMemory(Location(U), OrLocal);
diff --git a/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h b/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h
index 3fcea4e..3c5a021 100644
--- a/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h
+++ b/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h
@@ -44,9 +44,7 @@ namespace objcarc {
}
private:
- void initializePass() override {
- InitializeAliasAnalysis(this);
- }
+ bool doInitialization(Module &M) override;
/// This method is used when a pass implements an analysis interface through
/// multiple inheritance. If needed, it should override this to adjust the
diff --git a/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/lib/Transforms/ObjCARC/ObjCARCContract.cpp
index 6473d3a..2a3139f 100644
--- a/lib/Transforms/ObjCARC/ObjCARCContract.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCContract.cpp
@@ -35,6 +35,7 @@
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace llvm::objcarc;
@@ -134,7 +135,7 @@ bool ObjCARCContract::optimizeRetainCall(Function &F, Instruction *Retain) {
// We do not have to worry about tail calls/does not throw since
// retain/retainRV have the same properties.
- Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_RetainRV);
+ Constant *Decl = EP.get(ARCRuntimeEntryPointKind::RetainRV);
cast<CallInst>(Retain)->setCalledFunction(Decl);
DEBUG(dbgs() << "New: " << *Retain << "\n");
@@ -181,8 +182,8 @@ bool ObjCARCContract::contractAutorelease(
" Retain: " << *Retain << "\n");
Constant *Decl = EP.get(Class == ARCInstKind::AutoreleaseRV
- ? ARCRuntimeEntryPoints::EPT_RetainAutoreleaseRV
- : ARCRuntimeEntryPoints::EPT_RetainAutorelease);
+ ? ARCRuntimeEntryPointKind::RetainAutoreleaseRV
+ : ARCRuntimeEntryPointKind::RetainAutorelease);
Retain->setCalledFunction(Decl);
DEBUG(dbgs() << " New RetainAutorelease: " << *Retain << "\n");
@@ -380,7 +381,7 @@ void ObjCARCContract::tryToContractReleaseIntoStoreStrong(Instruction *Release,
Args[0] = new BitCastInst(Args[0], I8XX, "", Store);
if (Args[1]->getType() != I8X)
Args[1] = new BitCastInst(Args[1], I8X, "", Store);
- Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_StoreStrong);
+ Constant *Decl = EP.get(ARCRuntimeEntryPointKind::StoreStrong);
CallInst *StoreStrong = CallInst::Create(Decl, Args, "", Store);
StoreStrong->setDoesNotThrow();
StoreStrong->setDebugLoc(Store->getDebugLoc());
@@ -647,7 +648,7 @@ bool ObjCARCContract::doInitialization(Module &M) {
if (!Run)
return false;
- EP.Initialize(&M);
+ EP.init(&M);
// Initialize RetainRVMarker.
RetainRVMarker = nullptr;
diff --git a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
index f55b77f..4d75658 100644
--- a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
@@ -26,9 +26,11 @@
#include "ObjCARC.h"
#include "ARCRuntimeEntryPoints.h"
+#include "BlotMapVector.h"
#include "DependencyAnalysis.h"
#include "ObjCARCAliasAnalysis.h"
#include "ProvenanceAnalysis.h"
+#include "PtrState.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
@@ -45,102 +47,6 @@ using namespace llvm::objcarc;
#define DEBUG_TYPE "objc-arc-opts"
-/// \defgroup MiscUtils Miscellaneous utilities that are not ARC specific.
-/// @{
-
-namespace {
- /// \brief An associative container with fast insertion-order (deterministic)
- /// iteration over its elements. Plus the special blot operation.
- template<class KeyT, class ValueT>
- class MapVector {
- /// Map keys to indices in Vector.
- typedef DenseMap<KeyT, size_t> MapTy;
- MapTy Map;
-
- typedef std::vector<std::pair<KeyT, ValueT> > VectorTy;
- /// Keys and values.
- VectorTy Vector;
-
- public:
- typedef typename VectorTy::iterator iterator;
- typedef typename VectorTy::const_iterator const_iterator;
- iterator begin() { return Vector.begin(); }
- iterator end() { return Vector.end(); }
- const_iterator begin() const { return Vector.begin(); }
- const_iterator end() const { return Vector.end(); }
-
-#ifdef XDEBUG
- ~MapVector() {
- assert(Vector.size() >= Map.size()); // May differ due to blotting.
- for (typename MapTy::const_iterator I = Map.begin(), E = Map.end();
- I != E; ++I) {
- assert(I->second < Vector.size());
- assert(Vector[I->second].first == I->first);
- }
- for (typename VectorTy::const_iterator I = Vector.begin(),
- E = Vector.end(); I != E; ++I)
- assert(!I->first ||
- (Map.count(I->first) &&
- Map[I->first] == size_t(I - Vector.begin())));
- }
-#endif
-
- ValueT &operator[](const KeyT &Arg) {
- std::pair<typename MapTy::iterator, bool> Pair =
- Map.insert(std::make_pair(Arg, size_t(0)));
- if (Pair.second) {
- size_t Num = Vector.size();
- Pair.first->second = Num;
- Vector.push_back(std::make_pair(Arg, ValueT()));
- return Vector[Num].second;
- }
- return Vector[Pair.first->second].second;
- }
-
- std::pair<iterator, bool>
- insert(const std::pair<KeyT, ValueT> &InsertPair) {
- std::pair<typename MapTy::iterator, bool> Pair =
- Map.insert(std::make_pair(InsertPair.first, size_t(0)));
- if (Pair.second) {
- size_t Num = Vector.size();
- Pair.first->second = Num;
- Vector.push_back(InsertPair);
- return std::make_pair(Vector.begin() + Num, true);
- }
- return std::make_pair(Vector.begin() + Pair.first->second, false);
- }
-
- iterator find(const KeyT &Key) {
- typename MapTy::iterator It = Map.find(Key);
- if (It == Map.end()) return Vector.end();
- return Vector.begin() + It->second;
- }
-
- const_iterator find(const KeyT &Key) const {
- typename MapTy::const_iterator It = Map.find(Key);
- if (It == Map.end()) return Vector.end();
- return Vector.begin() + It->second;
- }
-
- /// This is similar to erase, but instead of removing the element from the
- /// vector, it just zeros out the key in the vector. This leaves iterators
- /// intact, but clients must be prepared for zeroed-out keys when iterating.
- void blot(const KeyT &Key) {
- typename MapTy::iterator It = Map.find(Key);
- if (It == Map.end()) return;
- Vector[It->second].first = KeyT();
- Map.erase(It);
- }
-
- void clear() {
- Map.clear();
- Vector.clear();
- }
- };
-}
-
-/// @}
-///
/// \defgroup ARCUtilities Utility declarations/definitions specific to ARC.
/// @{
@@ -177,13 +83,14 @@ static const Value *FindSingleUseIdentifiedObject(const Value *Arg) {
/// This is a wrapper around getUnderlyingObjCPtr along the lines of
/// GetUnderlyingObjects except that it returns early when it sees the first
/// alloca.
-static inline bool AreAnyUnderlyingObjectsAnAlloca(const Value *V) {
+static inline bool AreAnyUnderlyingObjectsAnAlloca(const Value *V,
+ const DataLayout &DL) {
SmallPtrSet<const Value *, 4> Visited;
SmallVector<const Value *, 4> Worklist;
Worklist.push_back(V);
do {
const Value *P = Worklist.pop_back_val();
- P = GetUnderlyingObjCPtr(P);
+ P = GetUnderlyingObjCPtr(P, DL);
if (isa<AllocaInst>(P))
return true;
@@ -270,293 +177,6 @@ STATISTIC(NumReleasesAfterOpt,
#endif
namespace {
- /// \enum Sequence
- ///
- /// \brief A sequence of states that a pointer may go through in which an
- /// objc_retain and objc_release are actually needed.
- enum Sequence {
- S_None,
- S_Retain, ///< objc_retain(x).
- S_CanRelease, ///< foo(x) -- x could possibly see a ref count decrement.
- S_Use, ///< any use of x.
- S_Stop, ///< like S_Release, but code motion is stopped.
- S_Release, ///< objc_release(x).
- S_MovableRelease ///< objc_release(x), !clang.imprecise_release.
- };
-
- raw_ostream &operator<<(raw_ostream &OS, const Sequence S)
- LLVM_ATTRIBUTE_UNUSED;
- raw_ostream &operator<<(raw_ostream &OS, const Sequence S) {
- switch (S) {
- case S_None:
- return OS << "S_None";
- case S_Retain:
- return OS << "S_Retain";
- case S_CanRelease:
- return OS << "S_CanRelease";
- case S_Use:
- return OS << "S_Use";
- case S_Release:
- return OS << "S_Release";
- case S_MovableRelease:
- return OS << "S_MovableRelease";
- case S_Stop:
- return OS << "S_Stop";
- }
- llvm_unreachable("Unknown sequence type.");
- }
-}
-
-static Sequence MergeSeqs(Sequence A, Sequence B, bool TopDown) {
- // The easy cases.
- if (A == B)
- return A;
- if (A == S_None || B == S_None)
- return S_None;
-
- if (A > B) std::swap(A, B);
- if (TopDown) {
- // Choose the side which is further along in the sequence.
- if ((A == S_Retain || A == S_CanRelease) &&
- (B == S_CanRelease || B == S_Use))
- return B;
- } else {
- // Choose the side which is further along in the sequence.
- if ((A == S_Use || A == S_CanRelease) &&
- (B == S_Use || B == S_Release || B == S_Stop || B == S_MovableRelease))
- return A;
- // If both sides are releases, choose the more conservative one.
- if (A == S_Stop && (B == S_Release || B == S_MovableRelease))
- return A;
- if (A == S_Release && B == S_MovableRelease)
- return A;
- }
-
- return S_None;
-}
-
-namespace {
- /// \brief Unidirectional information about either a
- /// retain-decrement-use-release sequence or release-use-decrement-retain
- /// reverse sequence.
- struct RRInfo {
- /// After an objc_retain, the reference count of the referenced
- /// object is known to be positive. Similarly, before an objc_release, the
- /// reference count of the referenced object is known to be positive. If
- /// there are retain-release pairs in code regions where the retain count
- /// is known to be positive, they can be eliminated, regardless of any side
- /// effects between them.
- ///
- /// Also, a retain+release pair nested within another retain+release
- /// pair all on the known same pointer value can be eliminated, regardless
- /// of any intervening side effects.
- ///
- /// KnownSafe is true when either of these conditions is satisfied.
- bool KnownSafe;
-
- /// True of the objc_release calls are all marked with the "tail" keyword.
- bool IsTailCallRelease;
-
- /// If the Calls are objc_release calls and they all have a
- /// clang.imprecise_release tag, this is the metadata tag.
- MDNode *ReleaseMetadata;
-
- /// For a top-down sequence, the set of objc_retains or
- /// objc_retainBlocks. For bottom-up, the set of objc_releases.
- SmallPtrSet<Instruction *, 2> Calls;
-
- /// The set of optimal insert positions for moving calls in the opposite
- /// sequence.
- SmallPtrSet<Instruction *, 2> ReverseInsertPts;
-
- /// If this is true, we cannot perform code motion but can still remove
- /// retain/release pairs.
- bool CFGHazardAfflicted;
-
- RRInfo() :
- KnownSafe(false), IsTailCallRelease(false), ReleaseMetadata(nullptr),
- CFGHazardAfflicted(false) {}
-
- void clear();
-
- /// Conservatively merge the two RRInfo. Returns true if a partial merge has
- /// occurred, false otherwise.
- bool Merge(const RRInfo &Other);
-
- };
-}
-
-void RRInfo::clear() {
- KnownSafe = false;
- IsTailCallRelease = false;
- ReleaseMetadata = nullptr;
- Calls.clear();
- ReverseInsertPts.clear();
- CFGHazardAfflicted = false;
-}
-
-bool RRInfo::Merge(const RRInfo &Other) {
- // Conservatively merge the ReleaseMetadata information.
- if (ReleaseMetadata != Other.ReleaseMetadata)
- ReleaseMetadata = nullptr;
-
- // Conservatively merge the boolean state.
- KnownSafe &= Other.KnownSafe;
- IsTailCallRelease &= Other.IsTailCallRelease;
- CFGHazardAfflicted |= Other.CFGHazardAfflicted;
-
- // Merge the call sets.
- Calls.insert(Other.Calls.begin(), Other.Calls.end());
-
- // Merge the insert point sets. If there are any differences,
- // that makes this a partial merge.
- bool Partial = ReverseInsertPts.size() != Other.ReverseInsertPts.size();
- for (Instruction *Inst : Other.ReverseInsertPts)
- Partial |= ReverseInsertPts.insert(Inst).second;
- return Partial;
-}
-
-namespace {
- /// \brief This class summarizes several per-pointer runtime properties which
- /// are propogated through the flow graph.
- class PtrState {
- /// True if the reference count is known to be incremented.
- bool KnownPositiveRefCount;
-
- /// True if we've seen an opportunity for partial RR elimination, such as
- /// pushing calls into a CFG triangle or into one side of a CFG diamond.
- bool Partial;
-
- /// The current position in the sequence.
- unsigned char Seq : 8;
-
- /// Unidirectional information about the current sequence.
- RRInfo RRI;
-
- public:
- PtrState() : KnownPositiveRefCount(false), Partial(false),
- Seq(S_None) {}
-
-
- bool IsKnownSafe() const {
- return RRI.KnownSafe;
- }
-
- void SetKnownSafe(const bool NewValue) {
- RRI.KnownSafe = NewValue;
- }
-
- bool IsTailCallRelease() const {
- return RRI.IsTailCallRelease;
- }
-
- void SetTailCallRelease(const bool NewValue) {
- RRI.IsTailCallRelease = NewValue;
- }
-
- bool IsTrackingImpreciseReleases() const {
- return RRI.ReleaseMetadata != nullptr;
- }
-
- const MDNode *GetReleaseMetadata() const {
- return RRI.ReleaseMetadata;
- }
-
- void SetReleaseMetadata(MDNode *NewValue) {
- RRI.ReleaseMetadata = NewValue;
- }
-
- bool IsCFGHazardAfflicted() const {
- return RRI.CFGHazardAfflicted;
- }
-
- void SetCFGHazardAfflicted(const bool NewValue) {
- RRI.CFGHazardAfflicted = NewValue;
- }
-
- void SetKnownPositiveRefCount() {
- DEBUG(dbgs() << "Setting Known Positive.\n");
- KnownPositiveRefCount = true;
- }
-
- void ClearKnownPositiveRefCount() {
- DEBUG(dbgs() << "Clearing Known Positive.\n");
- KnownPositiveRefCount = false;
- }
-
- bool HasKnownPositiveRefCount() const {
- return KnownPositiveRefCount;
- }
-
- void SetSeq(Sequence NewSeq) {
- DEBUG(dbgs() << "Old: " << Seq << "; New: " << NewSeq << "\n");
- Seq = NewSeq;
- }
-
- Sequence GetSeq() const {
- return static_cast<Sequence>(Seq);
- }
-
- void ClearSequenceProgress() {
- ResetSequenceProgress(S_None);
- }
-
- void ResetSequenceProgress(Sequence NewSeq) {
- DEBUG(dbgs() << "Resetting sequence progress.\n");
- SetSeq(NewSeq);
- Partial = false;
- RRI.clear();
- }
-
- void Merge(const PtrState &Other, bool TopDown);
-
- void InsertCall(Instruction *I) {
- RRI.Calls.insert(I);
- }
-
- void InsertReverseInsertPt(Instruction *I) {
- RRI.ReverseInsertPts.insert(I);
- }
-
- void ClearReverseInsertPts() {
- RRI.ReverseInsertPts.clear();
- }
-
- bool HasReverseInsertPts() const {
- return !RRI.ReverseInsertPts.empty();
- }
-
- const RRInfo &GetRRInfo() const {
- return RRI;
- }
- };
-}
-
-void
-PtrState::Merge(const PtrState &Other, bool TopDown) {
- Seq = MergeSeqs(GetSeq(), Other.GetSeq(), TopDown);
- KnownPositiveRefCount &= Other.KnownPositiveRefCount;
-
- // If we're not in a sequence (anymore), drop all associated state.
- if (Seq == S_None) {
- Partial = false;
- RRI.clear();
- } else if (Partial || Other.Partial) {
- // If we're doing a merge on a path that's previously seen a partial
- // merge, conservatively drop the sequence, to avoid doing partial
- // RR elimination. If the branch predicates for the two merge differ,
- // mixing them is unsafe.
- ClearSequenceProgress();
- } else {
- // Otherwise merge the other PtrState's RRInfo into our RRInfo. At this
- // point, we know that currently we are not partial. Stash whether or not
- // the merge operation caused us to undergo a partial merging of reverse
- // insertion points.
- Partial = RRI.Merge(Other.RRI);
- }
-}
-
-namespace {
/// \brief Per-BasicBlock state.
class BBState {
/// The number of unique control paths from the entry which can reach this
@@ -566,20 +186,18 @@ namespace {
/// The number of unique control paths to exits from this block.
unsigned BottomUpPathCount;
- /// A type for PerPtrTopDown and PerPtrBottomUp.
- typedef MapVector<const Value *, PtrState> MapTy;
-
/// The top-down traversal uses this to record information known about a
/// pointer at the bottom of each block.
- MapTy PerPtrTopDown;
+ BlotMapVector<const Value *, TopDownPtrState> PerPtrTopDown;
/// The bottom-up traversal uses this to record information known about a
/// pointer at the top of each block.
- MapTy PerPtrBottomUp;
+ BlotMapVector<const Value *, BottomUpPtrState> PerPtrBottomUp;
/// Effective predecessors of the current block ignoring ignorable edges and
/// ignored backedges.
SmallVector<BasicBlock *, 2> Preds;
+
/// Effective successors of the current block ignoring ignorable edges and
/// ignored backedges.
SmallVector<BasicBlock *, 2> Succs;
@@ -589,26 +207,38 @@ namespace {
BBState() : TopDownPathCount(0), BottomUpPathCount(0) { }
- typedef MapTy::iterator ptr_iterator;
- typedef MapTy::const_iterator ptr_const_iterator;
+ typedef decltype(PerPtrTopDown)::iterator top_down_ptr_iterator;
+ typedef decltype(PerPtrTopDown)::const_iterator const_top_down_ptr_iterator;
- ptr_iterator top_down_ptr_begin() { return PerPtrTopDown.begin(); }
- ptr_iterator top_down_ptr_end() { return PerPtrTopDown.end(); }
- ptr_const_iterator top_down_ptr_begin() const {
+ top_down_ptr_iterator top_down_ptr_begin() { return PerPtrTopDown.begin(); }
+ top_down_ptr_iterator top_down_ptr_end() { return PerPtrTopDown.end(); }
+ const_top_down_ptr_iterator top_down_ptr_begin() const {
return PerPtrTopDown.begin();
}
- ptr_const_iterator top_down_ptr_end() const {
+ const_top_down_ptr_iterator top_down_ptr_end() const {
return PerPtrTopDown.end();
}
+ bool hasTopDownPtrs() const {
+ return !PerPtrTopDown.empty();
+ }
+
+ typedef decltype(PerPtrBottomUp)::iterator bottom_up_ptr_iterator;
+ typedef decltype(
+ PerPtrBottomUp)::const_iterator const_bottom_up_ptr_iterator;
- ptr_iterator bottom_up_ptr_begin() { return PerPtrBottomUp.begin(); }
- ptr_iterator bottom_up_ptr_end() { return PerPtrBottomUp.end(); }
- ptr_const_iterator bottom_up_ptr_begin() const {
+ bottom_up_ptr_iterator bottom_up_ptr_begin() {
return PerPtrBottomUp.begin();
}
- ptr_const_iterator bottom_up_ptr_end() const {
+ bottom_up_ptr_iterator bottom_up_ptr_end() { return PerPtrBottomUp.end(); }
+ const_bottom_up_ptr_iterator bottom_up_ptr_begin() const {
+ return PerPtrBottomUp.begin();
+ }
+ const_bottom_up_ptr_iterator bottom_up_ptr_end() const {
return PerPtrBottomUp.end();
}
+ bool hasBottomUpPtrs() const {
+ return !PerPtrBottomUp.empty();
+ }
/// Mark this block as being an entry block, which has one path from the
/// entry by definition.
@@ -621,20 +251,20 @@ namespace {
/// Attempt to find the PtrState object describing the top down state for
/// pointer Arg. Return a new initialized PtrState describing the top down
/// state for Arg if we do not find one.
- PtrState &getPtrTopDownState(const Value *Arg) {
+ TopDownPtrState &getPtrTopDownState(const Value *Arg) {
return PerPtrTopDown[Arg];
}
/// Attempt to find the PtrState object describing the bottom up state for
/// pointer Arg. Return a new initialized PtrState describing the bottom up
/// state for Arg if we do not find one.
- PtrState &getPtrBottomUpState(const Value *Arg) {
+ BottomUpPtrState &getPtrBottomUpState(const Value *Arg) {
return PerPtrBottomUp[Arg];
}
/// Attempt to find the PtrState object describing the bottom up state for
/// pointer Arg.
- ptr_iterator findPtrBottomUpState(const Value *Arg) {
+ bottom_up_ptr_iterator findPtrBottomUpState(const Value *Arg) {
return PerPtrBottomUp.find(Arg);
}
@@ -685,6 +315,11 @@ namespace {
const unsigned BBState::OverflowOccurredValue = 0xffffffff;
}
+namespace llvm {
+raw_ostream &operator<<(raw_ostream &OS,
+ BBState &BBState) LLVM_ATTRIBUTE_UNUSED;
+}
+
void BBState::InitFromPred(const BBState &Other) {
PerPtrTopDown = Other.PerPtrTopDown;
TopDownPathCount = Other.TopDownPathCount;
@@ -724,19 +359,18 @@ void BBState::MergePred(const BBState &Other) {
// For each entry in the other set, if our set has an entry with the same key,
// merge the entries. Otherwise, copy the entry and merge it with an empty
// entry.
- for (ptr_const_iterator MI = Other.top_down_ptr_begin(),
- ME = Other.top_down_ptr_end(); MI != ME; ++MI) {
- std::pair<ptr_iterator, bool> Pair = PerPtrTopDown.insert(*MI);
- Pair.first->second.Merge(Pair.second ? PtrState() : MI->second,
+ for (auto MI = Other.top_down_ptr_begin(), ME = Other.top_down_ptr_end();
+ MI != ME; ++MI) {
+ auto Pair = PerPtrTopDown.insert(*MI);
+ Pair.first->second.Merge(Pair.second ? TopDownPtrState() : MI->second,
/*TopDown=*/true);
}
// For each entry in our set, if the other set doesn't have an entry with the
// same key, force it to merge with an empty entry.
- for (ptr_iterator MI = top_down_ptr_begin(),
- ME = top_down_ptr_end(); MI != ME; ++MI)
+ for (auto MI = top_down_ptr_begin(), ME = top_down_ptr_end(); MI != ME; ++MI)
if (Other.PerPtrTopDown.find(MI->first) == Other.PerPtrTopDown.end())
- MI->second.Merge(PtrState(), /*TopDown=*/true);
+ MI->second.Merge(TopDownPtrState(), /*TopDown=*/true);
}
/// The bottom-up traversal uses this to merge information about successors to
@@ -768,304 +402,80 @@ void BBState::MergeSucc(const BBState &Other) {
// For each entry in the other set, if our set has an entry with the
// same key, merge the entries. Otherwise, copy the entry and merge
// it with an empty entry.
- for (ptr_const_iterator MI = Other.bottom_up_ptr_begin(),
- ME = Other.bottom_up_ptr_end(); MI != ME; ++MI) {
- std::pair<ptr_iterator, bool> Pair = PerPtrBottomUp.insert(*MI);
- Pair.first->second.Merge(Pair.second ? PtrState() : MI->second,
+ for (auto MI = Other.bottom_up_ptr_begin(), ME = Other.bottom_up_ptr_end();
+ MI != ME; ++MI) {
+ auto Pair = PerPtrBottomUp.insert(*MI);
+ Pair.first->second.Merge(Pair.second ? BottomUpPtrState() : MI->second,
/*TopDown=*/false);
}
// For each entry in our set, if the other set doesn't have an entry
// with the same key, force it to merge with an empty entry.
- for (ptr_iterator MI = bottom_up_ptr_begin(),
- ME = bottom_up_ptr_end(); MI != ME; ++MI)
+ for (auto MI = bottom_up_ptr_begin(), ME = bottom_up_ptr_end(); MI != ME;
+ ++MI)
if (Other.PerPtrBottomUp.find(MI->first) == Other.PerPtrBottomUp.end())
- MI->second.Merge(PtrState(), /*TopDown=*/false);
+ MI->second.Merge(BottomUpPtrState(), /*TopDown=*/false);
}
-// Only enable ARC Annotations if we are building a debug version of
-// libObjCARCOpts.
-#ifndef NDEBUG
-#define ARC_ANNOTATIONS
-#endif
-
-// Define some macros along the lines of DEBUG and some helper functions to make
-// it cleaner to create annotations in the source code and to no-op when not
-// building in debug mode.
-#ifdef ARC_ANNOTATIONS
-
-#include "llvm/Support/CommandLine.h"
-
-/// Enable/disable ARC sequence annotations.
-static cl::opt<bool>
-EnableARCAnnotations("enable-objc-arc-annotations", cl::init(false),
- cl::desc("Enable emission of arc data flow analysis "
- "annotations"));
-static cl::opt<bool>
-DisableCheckForCFGHazards("disable-objc-arc-checkforcfghazards", cl::init(false),
- cl::desc("Disable check for cfg hazards when "
- "annotating"));
-static cl::opt<std::string>
-ARCAnnotationTargetIdentifier("objc-arc-annotation-target-identifier",
- cl::init(""),
- cl::desc("filter out all data flow annotations "
- "but those that apply to the given "
- "target llvm identifier."));
-
-/// This function appends a unique ARCAnnotationProvenanceSourceMDKind id to an
-/// instruction so that we can track backwards when post processing via the llvm
-/// arc annotation processor tool. If the function is an
-static MDString *AppendMDNodeToSourcePtr(unsigned NodeId,
- Value *Ptr) {
- MDString *Hash = nullptr;
-
- // If pointer is a result of an instruction and it does not have a source
- // MDNode it, attach a new MDNode onto it. If pointer is a result of
- // an instruction and does have a source MDNode attached to it, return a
- // reference to said Node. Otherwise just return 0.
- if (Instruction *Inst = dyn_cast<Instruction>(Ptr)) {
- MDNode *Node;
- if (!(Node = Inst->getMetadata(NodeId))) {
- // We do not have any node. Generate and attatch the hash MDString to the
- // instruction.
-
- // We just use an MDString to ensure that this metadata gets written out
- // of line at the module level and to provide a very simple format
- // encoding the information herein. Both of these makes it simpler to
- // parse the annotations by a simple external program.
- std::string Str;
- raw_string_ostream os(Str);
- os << "(" << Inst->getParent()->getParent()->getName() << ",%"
- << Inst->getName() << ")";
-
- Hash = MDString::get(Inst->getContext(), os.str());
- Inst->setMetadata(NodeId, MDNode::get(Inst->getContext(),Hash));
- } else {
- // We have a node. Grab its hash and return it.
- assert(Node->getNumOperands() == 1 &&
- "An ARCAnnotationProvenanceSourceMDKind can only have 1 operand.");
- Hash = cast<MDString>(Node->getOperand(0));
+raw_ostream &llvm::operator<<(raw_ostream &OS, BBState &BBInfo) {
+ // Dump the pointers we are tracking.
+ OS << " TopDown State:\n";
+ if (!BBInfo.hasTopDownPtrs()) {
+ DEBUG(llvm::dbgs() << " NONE!\n");
+ } else {
+ for (auto I = BBInfo.top_down_ptr_begin(), E = BBInfo.top_down_ptr_end();
+ I != E; ++I) {
+ const PtrState &P = I->second;
+ OS << " Ptr: " << *I->first
+ << "\n KnownSafe: " << (P.IsKnownSafe()?"true":"false")
+ << "\n ImpreciseRelease: "
+ << (P.IsTrackingImpreciseReleases()?"true":"false") << "\n"
+ << " HasCFGHazards: "
+ << (P.IsCFGHazardAfflicted()?"true":"false") << "\n"
+ << " KnownPositive: "
+ << (P.HasKnownPositiveRefCount()?"true":"false") << "\n"
+ << " Seq: "
+ << P.GetSeq() << "\n";
}
- } else if (Argument *Arg = dyn_cast<Argument>(Ptr)) {
- std::string str;
- raw_string_ostream os(str);
- os << "(" << Arg->getParent()->getName() << ",%" << Arg->getName()
- << ")";
- Hash = MDString::get(Arg->getContext(), os.str());
- }
-
- return Hash;
-}
-
-static std::string SequenceToString(Sequence A) {
- std::string str;
- raw_string_ostream os(str);
- os << A;
- return os.str();
-}
-
-/// Helper function to change a Sequence into a String object using our overload
-/// for raw_ostream so we only have printing code in one location.
-static MDString *SequenceToMDString(LLVMContext &Context,
- Sequence A) {
- return MDString::get(Context, SequenceToString(A));
-}
-
-/// A simple function to generate a MDNode which describes the change in state
-/// for Value *Ptr caused by Instruction *Inst.
-static void AppendMDNodeToInstForPtr(unsigned NodeId,
- Instruction *Inst,
- Value *Ptr,
- MDString *PtrSourceMDNodeID,
- Sequence OldSeq,
- Sequence NewSeq) {
- MDNode *Node = nullptr;
- Metadata *tmp[3] = {PtrSourceMDNodeID,
- SequenceToMDString(Inst->getContext(), OldSeq),
- SequenceToMDString(Inst->getContext(), NewSeq)};
- Node = MDNode::get(Inst->getContext(), tmp);
-
- Inst->setMetadata(NodeId, Node);
-}
-
-/// Add to the beginning of the basic block llvm.ptr.annotations which show the
-/// state of a pointer at the entrance to a basic block.
-static void GenerateARCBBEntranceAnnotation(const char *Name, BasicBlock *BB,
- Value *Ptr, Sequence Seq) {
- // If we have a target identifier, make sure that we match it before
- // continuing.
- if(!ARCAnnotationTargetIdentifier.empty() &&
- !Ptr->getName().equals(ARCAnnotationTargetIdentifier))
- return;
-
- Module *M = BB->getParent()->getParent();
- LLVMContext &C = M->getContext();
- Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
- Type *I8XX = PointerType::getUnqual(I8X);
- Type *Params[] = {I8XX, I8XX};
- FunctionType *FTy = FunctionType::get(Type::getVoidTy(C), Params,
- /*isVarArg=*/false);
- Constant *Callee = M->getOrInsertFunction(Name, FTy);
-
- IRBuilder<> Builder(BB, BB->getFirstInsertionPt());
-
- Value *PtrName;
- StringRef Tmp = Ptr->getName();
- if (nullptr == (PtrName = M->getGlobalVariable(Tmp, true))) {
- Value *ActualPtrName = Builder.CreateGlobalStringPtr(Tmp,
- Tmp + "_STR");
- PtrName = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage,
- cast<Constant>(ActualPtrName), Tmp);
- }
-
- Value *S;
- std::string SeqStr = SequenceToString(Seq);
- if (nullptr == (S = M->getGlobalVariable(SeqStr, true))) {
- Value *ActualPtrName = Builder.CreateGlobalStringPtr(SeqStr,
- SeqStr + "_STR");
- S = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage,
- cast<Constant>(ActualPtrName), SeqStr);
- }
-
- Builder.CreateCall2(Callee, PtrName, S);
-}
-
-/// Add to the end of the basic block llvm.ptr.annotations which show the state
-/// of the pointer at the bottom of the basic block.
-static void GenerateARCBBTerminatorAnnotation(const char *Name, BasicBlock *BB,
- Value *Ptr, Sequence Seq) {
- // If we have a target identifier, make sure that we match it before emitting
- // an annotation.
- if(!ARCAnnotationTargetIdentifier.empty() &&
- !Ptr->getName().equals(ARCAnnotationTargetIdentifier))
- return;
-
- Module *M = BB->getParent()->getParent();
- LLVMContext &C = M->getContext();
- Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
- Type *I8XX = PointerType::getUnqual(I8X);
- Type *Params[] = {I8XX, I8XX};
- FunctionType *FTy = FunctionType::get(Type::getVoidTy(C), Params,
- /*isVarArg=*/false);
- Constant *Callee = M->getOrInsertFunction(Name, FTy);
-
- IRBuilder<> Builder(BB, std::prev(BB->end()));
-
- Value *PtrName;
- StringRef Tmp = Ptr->getName();
- if (nullptr == (PtrName = M->getGlobalVariable(Tmp, true))) {
- Value *ActualPtrName = Builder.CreateGlobalStringPtr(Tmp,
- Tmp + "_STR");
- PtrName = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage,
- cast<Constant>(ActualPtrName), Tmp);
}
- Value *S;
- std::string SeqStr = SequenceToString(Seq);
- if (nullptr == (S = M->getGlobalVariable(SeqStr, true))) {
- Value *ActualPtrName = Builder.CreateGlobalStringPtr(SeqStr,
- SeqStr + "_STR");
- S = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage,
- cast<Constant>(ActualPtrName), SeqStr);
+ OS << " BottomUp State:\n";
+ if (!BBInfo.hasBottomUpPtrs()) {
+ DEBUG(llvm::dbgs() << " NONE!\n");
+ } else {
+ for (auto I = BBInfo.bottom_up_ptr_begin(), E = BBInfo.bottom_up_ptr_end();
+ I != E; ++I) {
+ const PtrState &P = I->second;
+ OS << " Ptr: " << *I->first
+ << "\n KnownSafe: " << (P.IsKnownSafe()?"true":"false")
+ << "\n ImpreciseRelease: "
+ << (P.IsTrackingImpreciseReleases()?"true":"false") << "\n"
+ << " HasCFGHazards: "
+ << (P.IsCFGHazardAfflicted()?"true":"false") << "\n"
+ << " KnownPositive: "
+ << (P.HasKnownPositiveRefCount()?"true":"false") << "\n"
+ << " Seq: "
+ << P.GetSeq() << "\n";
+ }
}
- Builder.CreateCall2(Callee, PtrName, S);
-}
-/// Adds a source annotation to pointer and a state change annotation to Inst
-/// referencing the source annotation and the old/new state of pointer.
-static void GenerateARCAnnotation(unsigned InstMDId,
- unsigned PtrMDId,
- Instruction *Inst,
- Value *Ptr,
- Sequence OldSeq,
- Sequence NewSeq) {
- if (EnableARCAnnotations) {
- // If we have a target identifier, make sure that we match it before
- // emitting an annotation.
- if(!ARCAnnotationTargetIdentifier.empty() &&
- !Ptr->getName().equals(ARCAnnotationTargetIdentifier))
- return;
-
- // First generate the source annotation on our pointer. This will return an
- // MDString* if Ptr actually comes from an instruction implying we can put
- // in a source annotation. If AppendMDNodeToSourcePtr returns 0 (i.e. NULL),
- // then we know that our pointer is from an Argument so we put a reference
- // to the argument number.
- //
- // The point of this is to make it easy for the
- // llvm-arc-annotation-processor tool to cross reference where the source
- // pointer is in the LLVM IR since the LLVM IR parser does not submit such
- // information via debug info for backends to use (since why would anyone
- // need such a thing from LLVM IR besides in non-standard cases
- // [i.e. this]).
- MDString *SourcePtrMDNode =
- AppendMDNodeToSourcePtr(PtrMDId, Ptr);
- AppendMDNodeToInstForPtr(InstMDId, Inst, Ptr, SourcePtrMDNode, OldSeq,
- NewSeq);
- }
+ return OS;
}
-// The actual interface for accessing the above functionality is defined via
-// some simple macros which are defined below. We do this so that the user does
-// not need to pass in what metadata id is needed resulting in cleaner code and
-// additionally since it provides an easy way to conditionally no-op all
-// annotation support in a non-debug build.
-
-/// Use this macro to annotate a sequence state change when processing
-/// instructions bottom up,
-#define ANNOTATE_BOTTOMUP(inst, ptr, old, new) \
- GenerateARCAnnotation(ARCAnnotationBottomUpMDKind, \
- ARCAnnotationProvenanceSourceMDKind, (inst), \
- const_cast<Value*>(ptr), (old), (new))
-/// Use this macro to annotate a sequence state change when processing
-/// instructions top down.
-#define ANNOTATE_TOPDOWN(inst, ptr, old, new) \
- GenerateARCAnnotation(ARCAnnotationTopDownMDKind, \
- ARCAnnotationProvenanceSourceMDKind, (inst), \
- const_cast<Value*>(ptr), (old), (new))
-
-#define ANNOTATE_BB(_states, _bb, _name, _type, _direction) \
- do { \
- if (EnableARCAnnotations) { \
- for(BBState::ptr_const_iterator I = (_states)._direction##_ptr_begin(), \
- E = (_states)._direction##_ptr_end(); I != E; ++I) { \
- Value *Ptr = const_cast<Value*>(I->first); \
- Sequence Seq = I->second.GetSeq(); \
- GenerateARCBB ## _type ## Annotation(_name, (_bb), Ptr, Seq); \
- } \
- } \
- } while (0)
-
-#define ANNOTATE_BOTTOMUP_BBSTART(_states, _basicblock) \
- ANNOTATE_BB(_states, _basicblock, "llvm.arc.annotation.bottomup.bbstart", \
- Entrance, bottom_up)
-#define ANNOTATE_BOTTOMUP_BBEND(_states, _basicblock) \
- ANNOTATE_BB(_states, _basicblock, "llvm.arc.annotation.bottomup.bbend", \
- Terminator, bottom_up)
-#define ANNOTATE_TOPDOWN_BBSTART(_states, _basicblock) \
- ANNOTATE_BB(_states, _basicblock, "llvm.arc.annotation.topdown.bbstart", \
- Entrance, top_down)
-#define ANNOTATE_TOPDOWN_BBEND(_states, _basicblock) \
- ANNOTATE_BB(_states, _basicblock, "llvm.arc.annotation.topdown.bbend", \
- Terminator, top_down)
-
-#else // !ARC_ANNOTATION
-// If annotations are off, noop.
-#define ANNOTATE_BOTTOMUP(inst, ptr, old, new)
-#define ANNOTATE_TOPDOWN(inst, ptr, old, new)
-#define ANNOTATE_BOTTOMUP_BBSTART(states, basicblock)
-#define ANNOTATE_BOTTOMUP_BBEND(states, basicblock)
-#define ANNOTATE_TOPDOWN_BBSTART(states, basicblock)
-#define ANNOTATE_TOPDOWN_BBEND(states, basicblock)
-#endif // !ARC_ANNOTATION
-
namespace {
+
/// \brief The main ARC optimization pass.
class ObjCARCOpt : public FunctionPass {
bool Changed;
ProvenanceAnalysis PA;
+
+ /// A cache of references to runtime entry point constants.
ARCRuntimeEntryPoints EP;
+ /// A cache of MDKinds that can be passed into other functions to propagate
+ /// MDKind identifiers.
+ ARCMDKindCache MDKindCache;
+
// This is used to track if a pointer is stored into an alloca.
DenseSet<const Value *> MultiOwnersSet;
@@ -1076,24 +486,6 @@ namespace {
/// is in fact used in the current function.
unsigned UsedInThisFunction;
- /// The Metadata Kind for clang.imprecise_release metadata.
- unsigned ImpreciseReleaseMDKind;
-
- /// The Metadata Kind for clang.arc.copy_on_escape metadata.
- unsigned CopyOnEscapeMDKind;
-
- /// The Metadata Kind for clang.arc.no_objc_arc_exceptions metadata.
- unsigned NoObjCARCExceptionsMDKind;
-
-#ifdef ARC_ANNOTATIONS
- /// The Metadata Kind for llvm.arc.annotation.bottomup metadata.
- unsigned ARCAnnotationBottomUpMDKind;
- /// The Metadata Kind for llvm.arc.annotation.topdown metadata.
- unsigned ARCAnnotationTopDownMDKind;
- /// The Metadata Kind for llvm.arc.annotation.provenancesource metadata.
- unsigned ARCAnnotationProvenanceSourceMDKind;
-#endif // ARC_ANNOATIONS
-
bool OptimizeRetainRVCall(Function &F, Instruction *RetainRV);
void OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV,
ARCInstKind &Class);
@@ -1102,47 +494,41 @@ namespace {
void CheckForCFGHazards(const BasicBlock *BB,
DenseMap<const BasicBlock *, BBState> &BBStates,
BBState &MyStates) const;
- bool VisitInstructionBottomUp(Instruction *Inst,
- BasicBlock *BB,
- MapVector<Value *, RRInfo> &Retains,
+ bool VisitInstructionBottomUp(Instruction *Inst, BasicBlock *BB,
+ BlotMapVector<Value *, RRInfo> &Retains,
BBState &MyStates);
bool VisitBottomUp(BasicBlock *BB,
DenseMap<const BasicBlock *, BBState> &BBStates,
- MapVector<Value *, RRInfo> &Retains);
+ BlotMapVector<Value *, RRInfo> &Retains);
bool VisitInstructionTopDown(Instruction *Inst,
DenseMap<Value *, RRInfo> &Releases,
BBState &MyStates);
bool VisitTopDown(BasicBlock *BB,
DenseMap<const BasicBlock *, BBState> &BBStates,
DenseMap<Value *, RRInfo> &Releases);
- bool Visit(Function &F,
- DenseMap<const BasicBlock *, BBState> &BBStates,
- MapVector<Value *, RRInfo> &Retains,
+ bool Visit(Function &F, DenseMap<const BasicBlock *, BBState> &BBStates,
+ BlotMapVector<Value *, RRInfo> &Retains,
DenseMap<Value *, RRInfo> &Releases);
void MoveCalls(Value *Arg, RRInfo &RetainsToMove, RRInfo &ReleasesToMove,
- MapVector<Value *, RRInfo> &Retains,
+ BlotMapVector<Value *, RRInfo> &Retains,
DenseMap<Value *, RRInfo> &Releases,
- SmallVectorImpl<Instruction *> &DeadInsts,
- Module *M);
-
- bool ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> &BBStates,
- MapVector<Value *, RRInfo> &Retains,
- DenseMap<Value *, RRInfo> &Releases,
- Module *M,
- SmallVectorImpl<Instruction *> &NewRetains,
- SmallVectorImpl<Instruction *> &NewReleases,
- SmallVectorImpl<Instruction *> &DeadInsts,
- RRInfo &RetainsToMove,
- RRInfo &ReleasesToMove,
- Value *Arg,
- bool KnownSafe,
- bool &AnyPairsCompletelyEliminated);
+ SmallVectorImpl<Instruction *> &DeadInsts, Module *M);
+
+ bool
+ PairUpRetainsAndReleases(DenseMap<const BasicBlock *, BBState> &BBStates,
+ BlotMapVector<Value *, RRInfo> &Retains,
+ DenseMap<Value *, RRInfo> &Releases, Module *M,
+ SmallVectorImpl<Instruction *> &NewRetains,
+ SmallVectorImpl<Instruction *> &NewReleases,
+ SmallVectorImpl<Instruction *> &DeadInsts,
+ RRInfo &RetainsToMove, RRInfo &ReleasesToMove,
+ Value *Arg, bool KnownSafe,
+ bool &AnyPairsCompletelyEliminated);
bool PerformCodePlacement(DenseMap<const BasicBlock *, BBState> &BBStates,
- MapVector<Value *, RRInfo> &Retains,
- DenseMap<Value *, RRInfo> &Releases,
- Module *M);
+ BlotMapVector<Value *, RRInfo> &Retains,
+ DenseMap<Value *, RRInfo> &Releases, Module *M);
void OptimizeWeakCalls(Function &F);
@@ -1238,7 +624,7 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) {
"objc_retain since the operand is not a return value.\n"
"Old = " << *RetainRV << "\n");
- Constant *NewDecl = EP.get(ARCRuntimeEntryPoints::EPT_Retain);
+ Constant *NewDecl = EP.get(ARCRuntimeEntryPointKind::Retain);
cast<CallInst>(RetainRV)->setCalledFunction(NewDecl);
DEBUG(dbgs() << "New = " << *RetainRV << "\n");
@@ -1274,7 +660,7 @@ void ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F,
"Old = " << *AutoreleaseRV << "\n");
CallInst *AutoreleaseRVCI = cast<CallInst>(AutoreleaseRV);
- Constant *NewDecl = EP.get(ARCRuntimeEntryPoints::EPT_Autorelease);
+ Constant *NewDecl = EP.get(ARCRuntimeEntryPointKind::Autorelease);
AutoreleaseRVCI->setCalledFunction(NewDecl);
AutoreleaseRVCI->setTailCall(false); // Never tail call objc_autorelease.
Class = ARCInstKind::Autorelease;
@@ -1380,10 +766,11 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
// Create the declaration lazily.
LLVMContext &C = Inst->getContext();
- Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_Release);
+ Constant *Decl = EP.get(ARCRuntimeEntryPointKind::Release);
CallInst *NewCall = CallInst::Create(Decl, Call->getArgOperand(0), "",
Call);
- NewCall->setMetadata(ImpreciseReleaseMDKind, MDNode::get(C, None));
+ NewCall->setMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease),
+ MDNode::get(C, None));
DEBUG(dbgs() << "Replacing autorelease{,RV}(x) with objc_release(x) "
"since x is otherwise unused.\nOld: " << *Call << "\nNew: "
@@ -1547,7 +934,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
/// no CFG hazards by checking the states of various bottom up pointers.
static void CheckForUseCFGHazard(const Sequence SuccSSeq,
const bool SuccSRRIKnownSafe,
- PtrState &S,
+ TopDownPtrState &S,
bool &SomeSuccHasSame,
bool &AllSuccsHaveSame,
bool &NotAllSeqEqualButKnownSafe,
@@ -1585,7 +972,7 @@ static void CheckForUseCFGHazard(const Sequence SuccSSeq,
/// pointers.
static void CheckForCanReleaseCFGHazard(const Sequence SuccSSeq,
const bool SuccSRRIKnownSafe,
- PtrState &S,
+ TopDownPtrState &S,
bool &SomeSuccHasSame,
bool &AllSuccsHaveSame,
bool &NotAllSeqEqualButKnownSafe) {
@@ -1618,9 +1005,9 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB,
BBState &MyStates) const {
// If any top-down local-use or possible-dec has a succ which is earlier in
// the sequence, forget it.
- for (BBState::ptr_iterator I = MyStates.top_down_ptr_begin(),
- E = MyStates.top_down_ptr_end(); I != E; ++I) {
- PtrState &S = I->second;
+ for (auto I = MyStates.top_down_ptr_begin(), E = MyStates.top_down_ptr_end();
+ I != E; ++I) {
+ TopDownPtrState &S = I->second;
const Sequence Seq = I->second.GetSeq();
// We only care about S_Retain, S_CanRelease, and S_Use.
@@ -1646,7 +1033,7 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB,
const DenseMap<const BasicBlock *, BBState>::iterator BBI =
BBStates.find(*SI);
assert(BBI != BBStates.end());
- const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg);
+ const BottomUpPtrState &SuccS = BBI->second.getPtrBottomUpState(Arg);
const Sequence SuccSSeq = SuccS.GetSeq();
// If bottom up, the pointer is in an S_None state, clear the sequence
@@ -1705,44 +1092,21 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB,
}
}
-bool
-ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
- BasicBlock *BB,
- MapVector<Value *, RRInfo> &Retains,
- BBState &MyStates) {
+bool ObjCARCOpt::VisitInstructionBottomUp(
+ Instruction *Inst, BasicBlock *BB, BlotMapVector<Value *, RRInfo> &Retains,
+ BBState &MyStates) {
bool NestingDetected = false;
ARCInstKind Class = GetARCInstKind(Inst);
const Value *Arg = nullptr;
- DEBUG(dbgs() << "Class: " << Class << "\n");
+ DEBUG(dbgs() << " Class: " << Class << "\n");
switch (Class) {
case ARCInstKind::Release: {
Arg = GetArgRCIdentityRoot(Inst);
- PtrState &S = MyStates.getPtrBottomUpState(Arg);
-
- // If we see two releases in a row on the same pointer. If so, make
- // a note, and we'll cicle back to revisit it after we've
- // hopefully eliminated the second release, which may allow us to
- // eliminate the first release too.
- // Theoretically we could implement removal of nested retain+release
- // pairs by making PtrState hold a stack of states, but this is
- // simple and avoids adding overhead for the non-nested case.
- if (S.GetSeq() == S_Release || S.GetSeq() == S_MovableRelease) {
- DEBUG(dbgs() << "Found nested releases (i.e. a release pair)\n");
- NestingDetected = true;
- }
-
- MDNode *ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind);
- Sequence NewSeq = ReleaseMetadata ? S_MovableRelease : S_Release;
- ANNOTATE_BOTTOMUP(Inst, Arg, S.GetSeq(), NewSeq);
- S.ResetSequenceProgress(NewSeq);
- S.SetReleaseMetadata(ReleaseMetadata);
- S.SetKnownSafe(S.HasKnownPositiveRefCount());
- S.SetTailCallRelease(cast<CallInst>(Inst)->isTailCall());
- S.InsertCall(Inst);
- S.SetKnownPositiveRefCount();
+ BottomUpPtrState &S = MyStates.getPtrBottomUpState(Arg);
+ NestingDetected |= S.InitBottomUp(MDKindCache, Inst);
break;
}
case ARCInstKind::RetainBlock:
@@ -1753,35 +1117,16 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
case ARCInstKind::Retain:
case ARCInstKind::RetainRV: {
Arg = GetArgRCIdentityRoot(Inst);
-
- PtrState &S = MyStates.getPtrBottomUpState(Arg);
- S.SetKnownPositiveRefCount();
-
- Sequence OldSeq = S.GetSeq();
- switch (OldSeq) {
- case S_Stop:
- case S_Release:
- case S_MovableRelease:
- case S_Use:
- // If OldSeq is not S_Use or OldSeq is S_Use and we are tracking an
- // imprecise release, clear our reverse insertion points.
- if (OldSeq != S_Use || S.IsTrackingImpreciseReleases())
- S.ClearReverseInsertPts();
- // FALL THROUGH
- case S_CanRelease:
- // Don't do retain+release tracking for ARCInstKind::RetainRV,
- // because it's
- // better to let it remain as the first instruction after a call.
- if (Class != ARCInstKind::RetainRV)
+ BottomUpPtrState &S = MyStates.getPtrBottomUpState(Arg);
+ if (S.MatchWithRetain()) {
+ // Don't do retain+release tracking for ARCInstKind::RetainRV, because
+ // it's better to let it remain as the first instruction after a call.
+ if (Class != ARCInstKind::RetainRV) {
+ DEBUG(llvm::dbgs() << " Matching with: " << *Inst << "\n");
Retains[Inst] = S.GetRRInfo();
+ }
S.ClearSequenceProgress();
- break;
- case S_None:
- break;
- case S_Retain:
- llvm_unreachable("bottom-up pointer in retain state!");
}
- ANNOTATE_BOTTOMUP(Inst, Arg, OldSeq, S.GetSeq());
// A retain moving bottom up can be a use.
break;
}
@@ -1807,9 +1152,10 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
// in the presence of allocas we only unconditionally remove pointers if
// both our retain and our release are KnownSafe.
if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
- if (AreAnyUnderlyingObjectsAnAlloca(SI->getPointerOperand())) {
- BBState::ptr_iterator I = MyStates.findPtrBottomUpState(
- GetRCIdentityRoot(SI->getValueOperand()));
+ const DataLayout &DL = BB->getModule()->getDataLayout();
+ if (AreAnyUnderlyingObjectsAnAlloca(SI->getPointerOperand(), DL)) {
+ auto I = MyStates.findPtrBottomUpState(
+ GetRCIdentityRoot(SI->getValueOperand()));
if (I != MyStates.bottom_up_ptr_end())
MultiOwnersSet.insert(I->first);
}
@@ -1821,90 +1167,26 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
// Consider any other possible effects of this instruction on each
// pointer being tracked.
- for (BBState::ptr_iterator MI = MyStates.bottom_up_ptr_begin(),
- ME = MyStates.bottom_up_ptr_end(); MI != ME; ++MI) {
+ for (auto MI = MyStates.bottom_up_ptr_begin(),
+ ME = MyStates.bottom_up_ptr_end();
+ MI != ME; ++MI) {
const Value *Ptr = MI->first;
if (Ptr == Arg)
continue; // Handled above.
- PtrState &S = MI->second;
- Sequence Seq = S.GetSeq();
+ BottomUpPtrState &S = MI->second;
- // Check for possible releases.
- if (CanAlterRefCount(Inst, Ptr, PA, Class)) {
- DEBUG(dbgs() << "CanAlterRefCount: Seq: " << Seq << "; " << *Ptr
- << "\n");
- S.ClearKnownPositiveRefCount();
- switch (Seq) {
- case S_Use:
- S.SetSeq(S_CanRelease);
- ANNOTATE_BOTTOMUP(Inst, Ptr, Seq, S.GetSeq());
- continue;
- case S_CanRelease:
- case S_Release:
- case S_MovableRelease:
- case S_Stop:
- case S_None:
- break;
- case S_Retain:
- llvm_unreachable("bottom-up pointer in retain state!");
- }
- }
+ if (S.HandlePotentialAlterRefCount(Inst, Ptr, PA, Class))
+ continue;
- // Check for possible direct uses.
- switch (Seq) {
- case S_Release:
- case S_MovableRelease:
- if (CanUse(Inst, Ptr, PA, Class)) {
- DEBUG(dbgs() << "CanUse: Seq: " << Seq << "; " << *Ptr
- << "\n");
- assert(!S.HasReverseInsertPts());
- // If this is an invoke instruction, we're scanning it as part of
- // one of its successor blocks, since we can't insert code after it
- // in its own block, and we don't want to split critical edges.
- if (isa<InvokeInst>(Inst))
- S.InsertReverseInsertPt(BB->getFirstInsertionPt());
- else
- S.InsertReverseInsertPt(std::next(BasicBlock::iterator(Inst)));
- S.SetSeq(S_Use);
- ANNOTATE_BOTTOMUP(Inst, Ptr, Seq, S_Use);
- } else if (Seq == S_Release && IsUser(Class)) {
- DEBUG(dbgs() << "PreciseReleaseUse: Seq: " << Seq << "; " << *Ptr
- << "\n");
- // Non-movable releases depend on any possible objc pointer use.
- S.SetSeq(S_Stop);
- ANNOTATE_BOTTOMUP(Inst, Ptr, S_Release, S_Stop);
- assert(!S.HasReverseInsertPts());
- // As above; handle invoke specially.
- if (isa<InvokeInst>(Inst))
- S.InsertReverseInsertPt(BB->getFirstInsertionPt());
- else
- S.InsertReverseInsertPt(std::next(BasicBlock::iterator(Inst)));
- }
- break;
- case S_Stop:
- if (CanUse(Inst, Ptr, PA, Class)) {
- DEBUG(dbgs() << "PreciseStopUse: Seq: " << Seq << "; " << *Ptr
- << "\n");
- S.SetSeq(S_Use);
- ANNOTATE_BOTTOMUP(Inst, Ptr, Seq, S_Use);
- }
- break;
- case S_CanRelease:
- case S_Use:
- case S_None:
- break;
- case S_Retain:
- llvm_unreachable("bottom-up pointer in retain state!");
- }
+ S.HandlePotentialUse(BB, Inst, Ptr, PA, Class);
}
return NestingDetected;
}
-bool
-ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
- DenseMap<const BasicBlock *, BBState> &BBStates,
- MapVector<Value *, RRInfo> &Retains) {
+bool ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ BlotMapVector<Value *, RRInfo> &Retains) {
DEBUG(dbgs() << "\n== ObjCARCOpt::VisitBottomUp ==\n");
@@ -1929,9 +1211,8 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
}
}
- // If ARC Annotations are enabled, output the current state of pointers at the
- // bottom of the basic block.
- ANNOTATE_BOTTOMUP_BBEND(MyStates, BB);
+ DEBUG(llvm::dbgs() << "Before:\n" << BBStates[BB] << "\n"
+ << "Performing Dataflow:\n");
// Visit all the instructions, bottom-up.
for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; --I) {
@@ -1941,7 +1222,7 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
if (isa<InvokeInst>(Inst))
continue;
- DEBUG(dbgs() << "Visiting " << *Inst << "\n");
+ DEBUG(dbgs() << " Visiting " << *Inst << "\n");
NestingDetected |= VisitInstructionBottomUp(Inst, BB, Retains, MyStates);
}
@@ -1956,9 +1237,7 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
NestingDetected |= VisitInstructionBottomUp(II, BB, Retains, MyStates);
}
- // If ARC Annotations are enabled, output the current state of pointers at the
- // top of the basic block.
- ANNOTATE_BOTTOMUP_BBSTART(MyStates, BB);
+ DEBUG(llvm::dbgs() << "\nFinal State:\n" << BBStates[BB] << "\n");
return NestingDetected;
}
@@ -1971,144 +1250,63 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
ARCInstKind Class = GetARCInstKind(Inst);
const Value *Arg = nullptr;
+ DEBUG(llvm::dbgs() << " Class: " << Class << "\n");
+
switch (Class) {
case ARCInstKind::RetainBlock:
// In OptimizeIndividualCalls, we have strength reduced all optimizable
// objc_retainBlocks to objc_retains. Thus at this point any
- // objc_retainBlocks that we see are not optimizable.
+ // objc_retainBlocks that we see are not optimizable. We need to break since
+ // a retain can be a potential use.
break;
case ARCInstKind::Retain:
case ARCInstKind::RetainRV: {
Arg = GetArgRCIdentityRoot(Inst);
-
- PtrState &S = MyStates.getPtrTopDownState(Arg);
-
- // Don't do retain+release tracking for ARCInstKind::RetainRV, because
- // it's
- // better to let it remain as the first instruction after a call.
- if (Class != ARCInstKind::RetainRV) {
- // If we see two retains in a row on the same pointer. If so, make
- // a note, and we'll cicle back to revisit it after we've
- // hopefully eliminated the second retain, which may allow us to
- // eliminate the first retain too.
- // Theoretically we could implement removal of nested retain+release
- // pairs by making PtrState hold a stack of states, but this is
- // simple and avoids adding overhead for the non-nested case.
- if (S.GetSeq() == S_Retain)
- NestingDetected = true;
-
- ANNOTATE_TOPDOWN(Inst, Arg, S.GetSeq(), S_Retain);
- S.ResetSequenceProgress(S_Retain);
- S.SetKnownSafe(S.HasKnownPositiveRefCount());
- S.InsertCall(Inst);
- }
-
- S.SetKnownPositiveRefCount();
-
+ TopDownPtrState &S = MyStates.getPtrTopDownState(Arg);
+ NestingDetected |= S.InitTopDown(Class, Inst);
+ // A retain can be a potential use; proceed to the generic checking
// code below.
break;
}
case ARCInstKind::Release: {
Arg = GetArgRCIdentityRoot(Inst);
-
- PtrState &S = MyStates.getPtrTopDownState(Arg);
- S.ClearKnownPositiveRefCount();
-
- Sequence OldSeq = S.GetSeq();
-
- MDNode *ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind);
-
- switch (OldSeq) {
- case S_Retain:
- case S_CanRelease:
- if (OldSeq == S_Retain || ReleaseMetadata != nullptr)
- S.ClearReverseInsertPts();
- // FALL THROUGH
- case S_Use:
- S.SetReleaseMetadata(ReleaseMetadata);
- S.SetTailCallRelease(cast<CallInst>(Inst)->isTailCall());
+ TopDownPtrState &S = MyStates.getPtrTopDownState(Arg);
+ // Try to form a tentative pair in between this release instruction and the
+ // top down pointers that we are tracking.
+ if (S.MatchWithRelease(MDKindCache, Inst)) {
+ // If we succeed, copy S's RRInfo into the Releases map (keyed by the
+ // release instruction), then clear S.
+ DEBUG(llvm::dbgs() << " Matching with: " << *Inst << "\n");
Releases[Inst] = S.GetRRInfo();
- ANNOTATE_TOPDOWN(Inst, Arg, S.GetSeq(), S_None);
S.ClearSequenceProgress();
- break;
- case S_None:
- break;
- case S_Stop:
- case S_Release:
- case S_MovableRelease:
- llvm_unreachable("top-down pointer in release state!");
}
break;
}
case ARCInstKind::AutoreleasepoolPop:
// Conservatively, clear MyStates for all known pointers.
MyStates.clearTopDownPointers();
- return NestingDetected;
+ return false;
case ARCInstKind::AutoreleasepoolPush:
case ARCInstKind::None:
- // These are irrelevant.
- return NestingDetected;
+ // These can not be uses of the pointers we are tracking.
+ return false;
default:
break;
}
// Consider any other possible effects of this instruction on each
// pointer being tracked.
- for (BBState::ptr_iterator MI = MyStates.top_down_ptr_begin(),
- ME = MyStates.top_down_ptr_end(); MI != ME; ++MI) {
+ for (auto MI = MyStates.top_down_ptr_begin(),
+ ME = MyStates.top_down_ptr_end();
+ MI != ME; ++MI) {
const Value *Ptr = MI->first;
if (Ptr == Arg)
continue; // Handled above.
- PtrState &S = MI->second;
- Sequence Seq = S.GetSeq();
-
- // Check for possible releases.
- if (CanAlterRefCount(Inst, Ptr, PA, Class)) {
- DEBUG(dbgs() << "CanAlterRefCount: Seq: " << Seq << "; " << *Ptr
- << "\n");
- S.ClearKnownPositiveRefCount();
- switch (Seq) {
- case S_Retain:
- S.SetSeq(S_CanRelease);
- ANNOTATE_TOPDOWN(Inst, Ptr, Seq, S_CanRelease);
- assert(!S.HasReverseInsertPts());
- S.InsertReverseInsertPt(Inst);
-
- // One call can't cause a transition from S_Retain to S_CanRelease
- // and S_CanRelease to S_Use. If we've made the first transition,
- // we're done.
- continue;
- case S_Use:
- case S_CanRelease:
- case S_None:
- break;
- case S_Stop:
- case S_Release:
- case S_MovableRelease:
- llvm_unreachable("top-down pointer in release state!");
- }
- }
+ TopDownPtrState &S = MI->second;
+ if (S.HandlePotentialAlterRefCount(Inst, Ptr, PA, Class))
+ continue;
- // Check for possible direct uses.
- switch (Seq) {
- case S_CanRelease:
- if (CanUse(Inst, Ptr, PA, Class)) {
- DEBUG(dbgs() << "CanUse: Seq: " << Seq << "; " << *Ptr
- << "\n");
- S.SetSeq(S_Use);
- ANNOTATE_TOPDOWN(Inst, Ptr, Seq, S_Use);
- }
- break;
- case S_Retain:
- case S_Use:
- case S_None:
- break;
- case S_Stop:
- case S_Release:
- case S_MovableRelease:
- llvm_unreachable("top-down pointer in release state!");
- }
+ S.HandlePotentialUse(Inst, Ptr, PA, Class);
}
return NestingDetected;
@@ -2140,27 +1338,22 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB,
}
}
- // If ARC Annotations are enabled, output the current state of pointers at the
- // top of the basic block.
- ANNOTATE_TOPDOWN_BBSTART(MyStates, BB);
+ DEBUG(llvm::dbgs() << "Before:\n" << BBStates[BB] << "\n"
+ << "Performing Dataflow:\n");
// Visit all the instructions, top-down.
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
Instruction *Inst = I;
- DEBUG(dbgs() << "Visiting " << *Inst << "\n");
+ DEBUG(dbgs() << " Visiting " << *Inst << "\n");
NestingDetected |= VisitInstructionTopDown(Inst, Releases, MyStates);
}
- // If ARC Annotations are enabled, output the current state of pointers at the
- // bottom of the basic block.
- ANNOTATE_TOPDOWN_BBEND(MyStates, BB);
-
-#ifdef ARC_ANNOTATIONS
- if (!(EnableARCAnnotations && DisableCheckForCFGHazards))
-#endif
+ DEBUG(llvm::dbgs() << "\nState Before Checking for CFG Hazards:\n"
+ << BBStates[BB] << "\n\n");
CheckForCFGHazards(BB, BBStates, MyStates);
+ DEBUG(llvm::dbgs() << "Final State:\n" << BBStates[BB] << "\n");
return NestingDetected;
}
@@ -2246,11 +1439,10 @@ ComputePostOrders(Function &F,
}
// Visit the function both top-down and bottom-up.
-bool
-ObjCARCOpt::Visit(Function &F,
- DenseMap<const BasicBlock *, BBState> &BBStates,
- MapVector<Value *, RRInfo> &Retains,
- DenseMap<Value *, RRInfo> &Releases) {
+bool ObjCARCOpt::Visit(Function &F,
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ BlotMapVector<Value *, RRInfo> &Retains,
+ DenseMap<Value *, RRInfo> &Releases) {
// Use reverse-postorder traversals, because we magically know that loops
// will be well behaved, i.e. they won't repeatedly call retain on a single
@@ -2260,7 +1452,7 @@ ObjCARCOpt::Visit(Function &F,
SmallVector<BasicBlock *, 16> PostOrder;
SmallVector<BasicBlock *, 16> ReverseCFGPostOrder;
ComputePostOrders(F, PostOrder, ReverseCFGPostOrder,
- NoObjCARCExceptionsMDKind,
+ MDKindCache.get(ARCMDKindID::NoObjCARCExceptions),
BBStates);
// Use reverse-postorder on the reverse CFG for bottom-up.
@@ -2281,10 +1473,9 @@ ObjCARCOpt::Visit(Function &F,
}
/// Move the calls in RetainsToMove and ReleasesToMove.
-void ObjCARCOpt::MoveCalls(Value *Arg,
- RRInfo &RetainsToMove,
+void ObjCARCOpt::MoveCalls(Value *Arg, RRInfo &RetainsToMove,
RRInfo &ReleasesToMove,
- MapVector<Value *, RRInfo> &Retains,
+ BlotMapVector<Value *, RRInfo> &Retains,
DenseMap<Value *, RRInfo> &Releases,
SmallVectorImpl<Instruction *> &DeadInsts,
Module *M) {
@@ -2297,7 +1488,7 @@ void ObjCARCOpt::MoveCalls(Value *Arg,
for (Instruction *InsertPt : ReleasesToMove.ReverseInsertPts) {
Value *MyArg = ArgTy == ParamTy ? Arg :
new BitCastInst(Arg, ParamTy, "", InsertPt);
- Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_Retain);
+ Constant *Decl = EP.get(ARCRuntimeEntryPointKind::Retain);
CallInst *Call = CallInst::Create(Decl, MyArg, "", InsertPt);
Call->setDoesNotThrow();
Call->setTailCall();
@@ -2308,11 +1499,11 @@ void ObjCARCOpt::MoveCalls(Value *Arg,
for (Instruction *InsertPt : RetainsToMove.ReverseInsertPts) {
Value *MyArg = ArgTy == ParamTy ? Arg :
new BitCastInst(Arg, ParamTy, "", InsertPt);
- Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_Release);
+ Constant *Decl = EP.get(ARCRuntimeEntryPointKind::Release);
CallInst *Call = CallInst::Create(Decl, MyArg, "", InsertPt);
// Attach a clang.imprecise_release metadata tag, if appropriate.
if (MDNode *M = ReleasesToMove.ReleaseMetadata)
- Call->setMetadata(ImpreciseReleaseMDKind, M);
+ Call->setMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease), M);
Call->setDoesNotThrow();
if (ReleasesToMove.IsTailCallRelease)
Call->setTailCall();
@@ -2335,20 +1526,15 @@ void ObjCARCOpt::MoveCalls(Value *Arg,
}
-bool
-ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState>
- &BBStates,
- MapVector<Value *, RRInfo> &Retains,
- DenseMap<Value *, RRInfo> &Releases,
- Module *M,
- SmallVectorImpl<Instruction *> &NewRetains,
- SmallVectorImpl<Instruction *> &NewReleases,
- SmallVectorImpl<Instruction *> &DeadInsts,
- RRInfo &RetainsToMove,
- RRInfo &ReleasesToMove,
- Value *Arg,
- bool KnownSafe,
- bool &AnyPairsCompletelyEliminated) {
+bool ObjCARCOpt::PairUpRetainsAndReleases(
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ BlotMapVector<Value *, RRInfo> &Retains,
+ DenseMap<Value *, RRInfo> &Releases, Module *M,
+ SmallVectorImpl<Instruction *> &NewRetains,
+ SmallVectorImpl<Instruction *> &NewReleases,
+ SmallVectorImpl<Instruction *> &DeadInsts, RRInfo &RetainsToMove,
+ RRInfo &ReleasesToMove, Value *Arg, bool KnownSafe,
+ bool &AnyPairsCompletelyEliminated) {
// If a pair happens in a region where it is known that the reference count
// is already incremented, we can similarly ignore possible decrements unless
// we are dealing with a retainable object with multiple provenance sources.
@@ -2369,15 +1555,14 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState>
for (SmallVectorImpl<Instruction *>::const_iterator
NI = NewRetains.begin(), NE = NewRetains.end(); NI != NE; ++NI) {
Instruction *NewRetain = *NI;
- MapVector<Value *, RRInfo>::const_iterator It = Retains.find(NewRetain);
+ auto It = Retains.find(NewRetain);
assert(It != Retains.end());
const RRInfo &NewRetainRRI = It->second;
KnownSafeTD &= NewRetainRRI.KnownSafe;
MultipleOwners =
MultipleOwners || MultiOwnersSet.count(GetArgRCIdentityRoot(NewRetain));
for (Instruction *NewRetainRelease : NewRetainRRI.Calls) {
- DenseMap<Value *, RRInfo>::const_iterator Jt =
- Releases.find(NewRetainRelease);
+ auto Jt = Releases.find(NewRetainRelease);
if (Jt == Releases.end())
return false;
const RRInfo &NewRetainReleaseRRI = Jt->second;
@@ -2446,15 +1631,13 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState>
for (SmallVectorImpl<Instruction *>::const_iterator
NI = NewReleases.begin(), NE = NewReleases.end(); NI != NE; ++NI) {
Instruction *NewRelease = *NI;
- DenseMap<Value *, RRInfo>::const_iterator It =
- Releases.find(NewRelease);
+ auto It = Releases.find(NewRelease);
assert(It != Releases.end());
const RRInfo &NewReleaseRRI = It->second;
KnownSafeBU &= NewReleaseRRI.KnownSafe;
CFGHazardAfflicted |= NewReleaseRRI.CFGHazardAfflicted;
for (Instruction *NewReleaseRetain : NewReleaseRRI.Calls) {
- MapVector<Value *, RRInfo>::const_iterator Jt =
- Retains.find(NewReleaseRetain);
+ auto Jt = Retains.find(NewReleaseRetain);
if (Jt == Retains.end())
return false;
const RRInfo &NewReleaseRetainRRI = Jt->second;
@@ -2506,11 +1689,8 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState>
if (NewRetains.empty()) break;
}
- // If the pointer is known incremented in 1 direction and we do not have
- // MultipleOwners, we can safely remove the retain/releases. Otherwise we need
- // to be known safe in both directions.
- bool UnconditionallySafe = (KnownSafeTD && KnownSafeBU) ||
- ((KnownSafeTD || KnownSafeBU) && !MultipleOwners);
+ // We can only remove pointers if we are known safe in both directions.
+ bool UnconditionallySafe = KnownSafeTD && KnownSafeBU;
if (UnconditionallySafe) {
RetainsToMove.ReverseInsertPts.clear();
ReleasesToMove.ReverseInsertPts.clear();
@@ -2540,12 +1720,6 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState>
if (OldDelta != 0)
return false;
-#ifdef ARC_ANNOTATIONS
- // Do not move calls if ARC annotations are requested.
- if (EnableARCAnnotations)
- return false;
-#endif // ARC_ANNOTATIONS
-
Changed = true;
assert(OldCount != 0 && "Unreachable code?");
NumRRs += OldCount - NewCount;
@@ -2558,12 +1732,10 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState>
/// Identify pairings between the retains and releases, and delete and/or move
/// them.
-bool
-ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState>
- &BBStates,
- MapVector<Value *, RRInfo> &Retains,
- DenseMap<Value *, RRInfo> &Releases,
- Module *M) {
+bool ObjCARCOpt::PerformCodePlacement(
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ BlotMapVector<Value *, RRInfo> &Retains,
+ DenseMap<Value *, RRInfo> &Releases, Module *M) {
DEBUG(dbgs() << "\n== ObjCARCOpt::PerformCodePlacement ==\n");
bool AnyPairsCompletelyEliminated = false;
@@ -2574,8 +1746,9 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState>
SmallVector<Instruction *, 8> DeadInsts;
// Visit each retain.
- for (MapVector<Value *, RRInfo>::const_iterator I = Retains.begin(),
- E = Retains.end(); I != E; ++I) {
+ for (BlotMapVector<Value *, RRInfo>::const_iterator I = Retains.begin(),
+ E = Retains.end();
+ I != E; ++I) {
Value *V = I->first;
if (!V) continue; // blotted
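The `// blotted` check above works because a BlotMapVector erases an entry by nulling its key in place instead of removing the slot, so insertion order and outstanding iterators stay stable and visitors simply skip null keys. A minimal standalone sketch of that idea (illustrative only; the names below are assumptions, not the actual BlotMapVector.h interface):

    #include <cassert>
    #include <cstddef>
    #include <map>
    #include <utility>
    #include <vector>

    template <typename KeyT, typename ValueT> class SimpleBlotMapVector {
      std::map<KeyT, size_t> Map;                  // key -> index into Vector
      std::vector<std::pair<KeyT, ValueT>> Vector; // insertion-ordered entries

    public:
      ValueT &operator[](const KeyT &Key) {
        auto Pair = Map.insert(std::make_pair(Key, Vector.size()));
        if (Pair.second)
          Vector.push_back(std::make_pair(Key, ValueT()));
        return Vector[Pair.first->second].second;
      }

      // "Blot" an entry: null out the key but keep the slot, so iteration
      // order is unaffected and visitors just skip null keys.
      void blot(const KeyT &Key) {
        auto It = Map.find(Key);
        if (It == Map.end())
          return;
        Vector[It->second].first = KeyT(); // nullptr for pointer keys
        Map.erase(It);
      }

      typename std::vector<std::pair<KeyT, ValueT>>::iterator begin() {
        return Vector.begin();
      }
      typename std::vector<std::pair<KeyT, ValueT>>::iterator end() {
        return Vector.end();
      }
    };

    int main() {
      int A = 0, B = 0;
      SimpleBlotMapVector<int *, int> Retains;
      Retains[&A] = 1;
      Retains[&B] = 2;
      Retains.blot(&A);
      int Live = 0;
      for (auto &Entry : Retains) {
        if (!Entry.first)
          continue; // blotted
        ++Live;
      }
      assert(Live == 1);
      return 0;
    }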
@@ -2602,11 +1775,10 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState>
// Connect the dots between the top-down-collected RetainsToMove and
// bottom-up-collected ReleasesToMove to form sets of related calls.
NewRetains.push_back(Retain);
- bool PerformMoveCalls =
- ConnectTDBUTraversals(BBStates, Retains, Releases, M, NewRetains,
- NewReleases, DeadInsts, RetainsToMove,
- ReleasesToMove, Arg, KnownSafe,
- AnyPairsCompletelyEliminated);
+ bool PerformMoveCalls = PairUpRetainsAndReleases(
+ BBStates, Retains, Releases, M, NewRetains, NewReleases, DeadInsts,
+ RetainsToMove, ReleasesToMove, Arg, KnownSafe,
+ AnyPairsCompletelyEliminated);
if (PerformMoveCalls) {
// Ok, everything checks out and we're all set. Let's move/delete some
@@ -2678,7 +1850,7 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
Changed = true;
// If the load has a builtin retain, insert a plain retain for it.
if (Class == ARCInstKind::LoadWeakRetained) {
- Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_Retain);
+ Constant *Decl = EP.get(ARCRuntimeEntryPointKind::Retain);
CallInst *CI = CallInst::Create(Decl, EarlierCall, "", Call);
CI->setTailCall();
}
@@ -2707,7 +1879,7 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
Changed = true;
// If the load has a builtin retain, insert a plain retain for it.
if (Class == ARCInstKind::LoadWeakRetained) {
- Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_Retain);
+ Constant *Decl = EP.get(ARCRuntimeEntryPointKind::Retain);
CallInst *CI = CallInst::Create(Decl, EarlierCall, "", Call);
CI->setTailCall();
}
@@ -2795,7 +1967,7 @@ bool ObjCARCOpt::OptimizeSequences(Function &F) {
// map stays valid when we get around to rewriting code and calls get
// replaced by arguments.
DenseMap<Value *, RRInfo> Releases;
- MapVector<Value *, RRInfo> Retains;
+ BlotMapVector<Value *, RRInfo> Retains;
// This is used during the traversal of the function to track the
// states for each identified object at each block.
@@ -2828,8 +2000,7 @@ HasSafePathToPredecessorCall(const Value *Arg, Instruction *Retain,
if (DepInsts.size() != 1)
return false;
- CallInst *Call =
- dyn_cast_or_null<CallInst>(*DepInsts.begin());
+ auto *Call = dyn_cast_or_null<CallInst>(*DepInsts.begin());
// Check that the pointer is the return value of the call.
if (!Call || Arg != Call)
@@ -2857,8 +2028,7 @@ FindPredecessorRetainWithSafePath(const Value *Arg, BasicBlock *BB,
if (DepInsts.size() != 1)
return nullptr;
- CallInst *Retain =
- dyn_cast_or_null<CallInst>(*DepInsts.begin());
+ auto *Retain = dyn_cast_or_null<CallInst>(*DepInsts.begin());
// Check that we found a retain with the same argument.
if (!Retain || !IsRetain(GetBasicARCInstKind(Retain)) ||
@@ -2883,8 +2053,7 @@ FindPredecessorAutoreleaseWithSafePath(const Value *Arg, BasicBlock *BB,
if (DepInsts.size() != 1)
return nullptr;
- CallInst *Autorelease =
- dyn_cast_or_null<CallInst>(*DepInsts.begin());
+ auto *Autorelease = dyn_cast_or_null<CallInst>(*DepInsts.begin());
if (!Autorelease)
return nullptr;
ARCInstKind AutoreleaseClass = GetBasicARCInstKind(Autorelease);
@@ -2999,28 +2168,13 @@ bool ObjCARCOpt::doInitialization(Module &M) {
if (!Run)
return false;
- // Identify the imprecise release metadata kind.
- ImpreciseReleaseMDKind =
- M.getContext().getMDKindID("clang.imprecise_release");
- CopyOnEscapeMDKind =
- M.getContext().getMDKindID("clang.arc.copy_on_escape");
- NoObjCARCExceptionsMDKind =
- M.getContext().getMDKindID("clang.arc.no_objc_arc_exceptions");
-#ifdef ARC_ANNOTATIONS
- ARCAnnotationBottomUpMDKind =
- M.getContext().getMDKindID("llvm.arc.annotation.bottomup");
- ARCAnnotationTopDownMDKind =
- M.getContext().getMDKindID("llvm.arc.annotation.topdown");
- ARCAnnotationProvenanceSourceMDKind =
- M.getContext().getMDKindID("llvm.arc.annotation.provenancesource");
-#endif // ARC_ANNOTATIONS
-
// Intuitively, objc_retain and others are nocapture, however in practice
// they are not, because they return their argument value. And objc_release
// calls finalizers which can have arbitrary side effects.
+ MDKindCache.init(&M);
// Initialize our runtime entry point cache.
- EP.Initialize(&M);
+ EP.init(&M);
return false;
}
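The deleted getMDKindID boilerplate above is exactly what ARCMDKindCache now encapsulates: look the metadata-kind strings up once per module and hand back cached unsigned IDs. A hedged sketch of the shape of such a cache, using only the kind strings visible in the removed lines (the class layout here is an assumption, not the real ObjCARC.h definition):

    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Module.h"

    namespace sketch {

    enum class ARCMDKindID { ImpreciseRelease, CopyOnEscape, NoObjCARCExceptions };

    class ARCMDKindCacheSketch {
      unsigned IDs[3];

    public:
      void init(llvm::Module *M) {
        llvm::LLVMContext &Ctx = M->getContext();
        // Same kind strings the deleted code looked up field-by-field.
        IDs[0] = Ctx.getMDKindID("clang.imprecise_release");
        IDs[1] = Ctx.getMDKindID("clang.arc.copy_on_escape");
        IDs[2] = Ctx.getMDKindID("clang.arc.no_objc_arc_exceptions");
      }
      unsigned get(ARCMDKindID ID) { return IDs[static_cast<unsigned>(ID)]; }
    };

    } // namespace sketch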
diff --git a/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp b/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp
index 410abfc..15ad8dc 100644
--- a/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp
+++ b/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp
@@ -32,20 +32,22 @@ using namespace llvm::objcarc;
bool ProvenanceAnalysis::relatedSelect(const SelectInst *A,
const Value *B) {
+ const DataLayout &DL = A->getModule()->getDataLayout();
// If the values are Selects with the same condition, we can do a more precise
// check: just check for relations between the values on corresponding arms.
if (const SelectInst *SB = dyn_cast<SelectInst>(B))
if (A->getCondition() == SB->getCondition())
- return related(A->getTrueValue(), SB->getTrueValue()) ||
- related(A->getFalseValue(), SB->getFalseValue());
+ return related(A->getTrueValue(), SB->getTrueValue(), DL) ||
+ related(A->getFalseValue(), SB->getFalseValue(), DL);
// Check both arms of the Select node individually.
- return related(A->getTrueValue(), B) ||
- related(A->getFalseValue(), B);
+ return related(A->getTrueValue(), B, DL) ||
+ related(A->getFalseValue(), B, DL);
}
bool ProvenanceAnalysis::relatedPHI(const PHINode *A,
const Value *B) {
+ const DataLayout &DL = A->getModule()->getDataLayout();
// If the values are PHIs in the same block, we can do a more precise as well
// as efficient check: just check for relations between the values on
// corresponding edges.
@@ -53,7 +55,7 @@ bool ProvenanceAnalysis::relatedPHI(const PHINode *A,
if (PNB->getParent() == A->getParent()) {
for (unsigned i = 0, e = A->getNumIncomingValues(); i != e; ++i)
if (related(A->getIncomingValue(i),
- PNB->getIncomingValueForBlock(A->getIncomingBlock(i))))
+ PNB->getIncomingValueForBlock(A->getIncomingBlock(i)), DL))
return true;
return false;
}
@@ -62,7 +64,7 @@ bool ProvenanceAnalysis::relatedPHI(const PHINode *A,
SmallPtrSet<const Value *, 4> UniqueSrc;
for (unsigned i = 0, e = A->getNumIncomingValues(); i != e; ++i) {
const Value *PV1 = A->getIncomingValue(i);
- if (UniqueSrc.insert(PV1).second && related(PV1, B))
+ if (UniqueSrc.insert(PV1).second && related(PV1, B, DL))
return true;
}
@@ -103,11 +105,11 @@ static bool IsStoredObjCPointer(const Value *P) {
return false;
}
-bool ProvenanceAnalysis::relatedCheck(const Value *A,
- const Value *B) {
+bool ProvenanceAnalysis::relatedCheck(const Value *A, const Value *B,
+ const DataLayout &DL) {
// Skip past provenance pass-throughs.
- A = GetUnderlyingObjCPtr(A);
- B = GetUnderlyingObjCPtr(B);
+ A = GetUnderlyingObjCPtr(A, DL);
+ B = GetUnderlyingObjCPtr(B, DL);
// Quick check.
if (A == B)
@@ -159,8 +161,8 @@ bool ProvenanceAnalysis::relatedCheck(const Value *A,
return true;
}
-bool ProvenanceAnalysis::related(const Value *A,
- const Value *B) {
+bool ProvenanceAnalysis::related(const Value *A, const Value *B,
+ const DataLayout &DL) {
// Begin by inserting a conservative value into the map. If the insertion
// fails, we have the answer already. If it succeeds, leave it there until we
// compute the real answer to guard against recursive queries.
@@ -170,7 +172,7 @@ bool ProvenanceAnalysis::related(const Value *A,
if (!Pair.second)
return Pair.first->second;
- bool Result = relatedCheck(A, B);
+ bool Result = relatedCheck(A, B, DL);
CachedResults[ValuePairTy(A, B)] = Result;
return Result;
}
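The seeding pattern above is worth spelling out: related() inserts a conservative `true` before calling relatedCheck(), so a query that reaches itself again (e.g. through a PHI cycle) bottoms out at the placeholder instead of recursing forever. A self-contained toy illustration of the same pattern (the graph and names are hypothetical, not the real ProvenanceAnalysis types):

    #include <cassert>
    #include <map>
    #include <utility>
    #include <vector>

    // Toy provenance graph: Preds[N] lists the values N "passes through".
    static std::vector<std::vector<int>> Preds;
    static std::map<std::pair<int, int>, bool> Cache;

    bool related(int A, int B);

    // Two nodes are "related" if equal, or if any predecessor of A is
    // related to B. A cycle in Preds would recurse forever without the seed.
    static bool relatedCheck(int A, int B) {
      if (A == B)
        return true;
      for (int P : Preds[A])
        if (related(P, B))
          return true;
      return false;
    }

    bool related(int A, int B) {
      // Seed the cache with the conservative answer before recursing; a
      // recursive query for the same pair terminates with this value.
      auto Seeded = Cache.insert(std::make_pair(std::make_pair(A, B), true));
      if (!Seeded.second)
        return Seeded.first->second;
      bool Result = relatedCheck(A, B);
      Cache[std::make_pair(A, B)] = Result;
      return Result;
    }

    int main() {
      Preds = {{1}, {0}, {}}; // 0 and 1 form a cycle; 2 is isolated
      assert(related(0, 1));  // found through the edge 0 -> 1
      assert(related(0, 2));  // conservatively true: the 0 <-> 1 cycle hits
                              // the seeded placeholder instead of recursing
      return 0;
    }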
diff --git a/lib/Transforms/ObjCARC/ProvenanceAnalysis.h b/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
index 4b5f4d8..0ac41d3 100644
--- a/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
+++ b/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
@@ -30,6 +30,7 @@
namespace llvm {
class Value;
class AliasAnalysis;
+ class DataLayout;
class PHINode;
class SelectInst;
}
@@ -53,7 +54,7 @@ class ProvenanceAnalysis {
typedef DenseMap<ValuePairTy, bool> CachedResultsTy;
CachedResultsTy CachedResults;
- bool relatedCheck(const Value *A, const Value *B);
+ bool relatedCheck(const Value *A, const Value *B, const DataLayout &DL);
bool relatedSelect(const SelectInst *A, const Value *B);
bool relatedPHI(const PHINode *A, const Value *B);
@@ -67,7 +68,7 @@ public:
AliasAnalysis *getAA() const { return AA; }
- bool related(const Value *A, const Value *B);
+ bool related(const Value *A, const Value *B, const DataLayout &DL);
void clear() {
CachedResults.clear();
diff --git a/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp b/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp
index d836632..0be75af 100644
--- a/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp
+++ b/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp
@@ -14,6 +14,7 @@
#include "llvm/Analysis/Passes.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -65,6 +66,7 @@ bool PAEval::runOnFunction(Function &F) {
ProvenanceAnalysis PA;
PA.setAA(&getAnalysis<AliasAnalysis>());
+ const DataLayout &DL = F.getParent()->getDataLayout();
for (Value *V1 : Values) {
StringRef NameV1 = getName(V1);
@@ -73,7 +75,7 @@ bool PAEval::runOnFunction(Function &F) {
if (NameV1 >= NameV2)
continue;
errs() << NameV1 << " and " << NameV2;
- if (PA.related(V1, V2))
+ if (PA.related(V1, V2, DL))
errs() << " are related.\n";
else
errs() << " are not related.\n";
diff --git a/lib/Transforms/ObjCARC/PtrState.cpp b/lib/Transforms/ObjCARC/PtrState.cpp
new file mode 100644
index 0000000..ae20e7e
--- /dev/null
+++ b/lib/Transforms/ObjCARC/PtrState.cpp
@@ -0,0 +1,404 @@
+//===--- PtrState.cpp -----------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PtrState.h"
+#include "DependencyAnalysis.h"
+#include "ObjCARC.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+#define DEBUG_TYPE "objc-arc-ptr-state"
+
+//===----------------------------------------------------------------------===//
+// Utility
+//===----------------------------------------------------------------------===//
+
+raw_ostream &llvm::objcarc::operator<<(raw_ostream &OS, const Sequence S) {
+ switch (S) {
+ case S_None:
+ return OS << "S_None";
+ case S_Retain:
+ return OS << "S_Retain";
+ case S_CanRelease:
+ return OS << "S_CanRelease";
+ case S_Use:
+ return OS << "S_Use";
+ case S_Release:
+ return OS << "S_Release";
+ case S_MovableRelease:
+ return OS << "S_MovableRelease";
+ case S_Stop:
+ return OS << "S_Stop";
+ }
+ llvm_unreachable("Unknown sequence type.");
+}
+
+//===----------------------------------------------------------------------===//
+// Sequence
+//===----------------------------------------------------------------------===//
+
+static Sequence MergeSeqs(Sequence A, Sequence B, bool TopDown) {
+ // The easy cases.
+ if (A == B)
+ return A;
+ if (A == S_None || B == S_None)
+ return S_None;
+
+ if (A > B)
+ std::swap(A, B);
+ if (TopDown) {
+ // Choose the side which is further along in the sequence.
+ if ((A == S_Retain || A == S_CanRelease) &&
+ (B == S_CanRelease || B == S_Use))
+ return B;
+ } else {
+ // Choose the side which is further along in the sequence.
+ if ((A == S_Use || A == S_CanRelease) &&
+ (B == S_Use || B == S_Release || B == S_Stop || B == S_MovableRelease))
+ return A;
+ // If both sides are releases, choose the more conservative one.
+ if (A == S_Stop && (B == S_Release || B == S_MovableRelease))
+ return A;
+ if (A == S_Release && B == S_MovableRelease)
+ return A;
+ }
+
+ return S_None;
+}
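To make the merge rules above concrete, here is a standalone replica of MergeSeqs with a few spot checks of the outcomes it produces; the enum order matches the Sequence declaration added in PtrState.h below. This is an illustration only, not the pass's code:

    #include <algorithm>
    #include <cassert>

    enum Sequence { S_None, S_Retain, S_CanRelease, S_Use, S_Stop, S_Release,
                    S_MovableRelease };

    // Replica of the MergeSeqs added above, for illustration.
    static Sequence MergeSeqs(Sequence A, Sequence B, bool TopDown) {
      if (A == B)
        return A;
      if (A == S_None || B == S_None)
        return S_None;
      if (A > B)
        std::swap(A, B);
      if (TopDown) {
        if ((A == S_Retain || A == S_CanRelease) &&
            (B == S_CanRelease || B == S_Use))
          return B;
      } else {
        if ((A == S_Use || A == S_CanRelease) &&
            (B == S_Use || B == S_Release || B == S_Stop ||
             B == S_MovableRelease))
          return A;
        if (A == S_Stop && (B == S_Release || B == S_MovableRelease))
          return A;
        if (A == S_Release && B == S_MovableRelease)
          return A;
      }
      return S_None;
    }

    int main() {
      // Top-down: the side further along the sequence wins.
      assert(MergeSeqs(S_Retain, S_Use, true) == S_Use);
      // Top-down: S_Retain vs. S_Stop has no sensible merge.
      assert(MergeSeqs(S_Retain, S_Stop, true) == S_None);
      // Bottom-up: S_Use wins over S_Release.
      assert(MergeSeqs(S_Use, S_Release, false) == S_Use);
      // Bottom-up: S_Stop is the more conservative of the two releases.
      assert(MergeSeqs(S_Stop, S_MovableRelease, false) == S_Stop);
      return 0;
    }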
+
+//===----------------------------------------------------------------------===//
+// RRInfo
+//===----------------------------------------------------------------------===//
+
+void RRInfo::clear() {
+ KnownSafe = false;
+ IsTailCallRelease = false;
+ ReleaseMetadata = nullptr;
+ Calls.clear();
+ ReverseInsertPts.clear();
+ CFGHazardAfflicted = false;
+}
+
+bool RRInfo::Merge(const RRInfo &Other) {
+ // Conservatively merge the ReleaseMetadata information.
+ if (ReleaseMetadata != Other.ReleaseMetadata)
+ ReleaseMetadata = nullptr;
+
+ // Conservatively merge the boolean state.
+ KnownSafe &= Other.KnownSafe;
+ IsTailCallRelease &= Other.IsTailCallRelease;
+ CFGHazardAfflicted |= Other.CFGHazardAfflicted;
+
+ // Merge the call sets.
+ Calls.insert(Other.Calls.begin(), Other.Calls.end());
+
+ // Merge the insert point sets. If there are any differences,
+ // that makes this a partial merge.
+ bool Partial = ReverseInsertPts.size() != Other.ReverseInsertPts.size();
+ for (Instruction *Inst : Other.ReverseInsertPts)
+ Partial |= ReverseInsertPts.insert(Inst).second;
+ return Partial;
+}
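A quick way to see when Merge reports a partial merge: it is exactly when the two ReverseInsertPts sets are not identical. A minimal sketch of that last loop, substituting std::set for the instruction set:

    #include <cassert>
    #include <set>

    // Mirrors the ReverseInsertPts handling in RRInfo::Merge above: the
    // merge is "partial" exactly when the two insert-point sets differ.
    static bool MergeInsertPts(std::set<int> &Mine, const std::set<int> &Other) {
      bool Partial = Mine.size() != Other.size();
      for (int Pt : Other)
        Partial |= Mine.insert(Pt).second;
      return Partial;
    }

    int main() {
      std::set<int> A = {1, 2};
      assert(!MergeInsertPts(A, {1, 2})); // identical sets: complete merge
      std::set<int> B = {1};
      assert(MergeInsertPts(B, {2}));     // disjoint sets: partial merge
      std::set<int> C = {1};
      assert(MergeInsertPts(C, {1, 2}));  // size mismatch: partial merge
      return 0;
    }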
+
+//===----------------------------------------------------------------------===//
+// PtrState
+//===----------------------------------------------------------------------===//
+
+void PtrState::SetKnownPositiveRefCount() {
+ DEBUG(dbgs() << " Setting Known Positive.\n");
+ KnownPositiveRefCount = true;
+}
+
+void PtrState::ClearKnownPositiveRefCount() {
+ DEBUG(dbgs() << " Clearing Known Positive.\n");
+ KnownPositiveRefCount = false;
+}
+
+void PtrState::SetSeq(Sequence NewSeq) {
+ DEBUG(dbgs() << " Old: " << GetSeq() << "; New: " << NewSeq << "\n");
+ Seq = NewSeq;
+}
+
+void PtrState::ResetSequenceProgress(Sequence NewSeq) {
+ DEBUG(dbgs() << " Resetting sequence progress.\n");
+ SetSeq(NewSeq);
+ Partial = false;
+ RRI.clear();
+}
+
+void PtrState::Merge(const PtrState &Other, bool TopDown) {
+ Seq = MergeSeqs(GetSeq(), Other.GetSeq(), TopDown);
+ KnownPositiveRefCount &= Other.KnownPositiveRefCount;
+
+ // If we're not in a sequence (anymore), drop all associated state.
+ if (Seq == S_None) {
+ Partial = false;
+ RRI.clear();
+ } else if (Partial || Other.Partial) {
+ // If we're doing a merge on a path that's previously seen a partial
+ // merge, conservatively drop the sequence, to avoid doing partial
+ // RR elimination. If the branch predicates for the two merges differ,
+ // mixing them is unsafe.
+ ClearSequenceProgress();
+ } else {
+ // Otherwise merge the other PtrState's RRInfo into our RRInfo. At this
+ // point, we know that currently we are not partial. Stash whether or not
+ // the merge operation caused us to undergo a partial merging of reverse
+ // insertion points.
+ Partial = RRI.Merge(Other.RRI);
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// BottomUpPtrState
+//===----------------------------------------------------------------------===//
+
+bool BottomUpPtrState::InitBottomUp(ARCMDKindCache &Cache, Instruction *I) {
+ // If we see two releases in a row on the same pointer, make a note, and
+ // we'll circle back to revisit it after we've hopefully eliminated the
+ // second release, which may allow us to eliminate the first release too.
+ // Theoretically we could implement removal of nested retain+release
+ // pairs by making PtrState hold a stack of states, but this is
+ // simple and avoids adding overhead for the non-nested case.
+ bool NestingDetected = false;
+ if (GetSeq() == S_Release || GetSeq() == S_MovableRelease) {
+ DEBUG(dbgs() << " Found nested releases (i.e. a release pair)\n");
+ NestingDetected = true;
+ }
+
+ MDNode *ReleaseMetadata =
+ I->getMetadata(Cache.get(ARCMDKindID::ImpreciseRelease));
+ Sequence NewSeq = ReleaseMetadata ? S_MovableRelease : S_Release;
+ ResetSequenceProgress(NewSeq);
+ SetReleaseMetadata(ReleaseMetadata);
+ SetKnownSafe(HasKnownPositiveRefCount());
+ SetTailCallRelease(cast<CallInst>(I)->isTailCall());
+ InsertCall(I);
+ SetKnownPositiveRefCount();
+ return NestingDetected;
+}
+
+bool BottomUpPtrState::MatchWithRetain() {
+ SetKnownPositiveRefCount();
+
+ Sequence OldSeq = GetSeq();
+ switch (OldSeq) {
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ case S_Use:
+ // If OldSeq is not S_Use or OldSeq is S_Use and we are tracking an
+ // imprecise release, clear our reverse insertion points.
+ if (OldSeq != S_Use || IsTrackingImpreciseReleases())
+ ClearReverseInsertPts();
+ // FALL THROUGH
+ case S_CanRelease:
+ return true;
+ case S_None:
+ return false;
+ case S_Retain:
+ llvm_unreachable("bottom-up pointer in retain state!");
+ }
+ llvm_unreachable("Sequence unknown enum value");
+}
+
+bool BottomUpPtrState::HandlePotentialAlterRefCount(Instruction *Inst,
+ const Value *Ptr,
+ ProvenanceAnalysis &PA,
+ ARCInstKind Class) {
+ Sequence S = GetSeq();
+
+ // Check for possible releases.
+ if (!CanAlterRefCount(Inst, Ptr, PA, Class))
+ return false;
+
+ DEBUG(dbgs() << " CanAlterRefCount: Seq: " << S << "; " << *Ptr
+ << "\n");
+ switch (S) {
+ case S_Use:
+ SetSeq(S_CanRelease);
+ return true;
+ case S_CanRelease:
+ case S_Release:
+ case S_MovableRelease:
+ case S_Stop:
+ case S_None:
+ return false;
+ case S_Retain:
+ llvm_unreachable("bottom-up pointer in retain state!");
+ }
+ llvm_unreachable("Sequence unknown enum value");
+}
+
+void BottomUpPtrState::HandlePotentialUse(BasicBlock *BB, Instruction *Inst,
+ const Value *Ptr,
+ ProvenanceAnalysis &PA,
+ ARCInstKind Class) {
+ // Check for possible direct uses.
+ switch (GetSeq()) {
+ case S_Release:
+ case S_MovableRelease:
+ if (CanUse(Inst, Ptr, PA, Class)) {
+ DEBUG(dbgs() << " CanUse: Seq: " << GetSeq() << "; " << *Ptr
+ << "\n");
+ assert(!HasReverseInsertPts());
+ // If this is an invoke instruction, we're scanning it as part of
+ // one of its successor blocks, since we can't insert code after it
+ // in its own block, and we don't want to split critical edges.
+ if (isa<InvokeInst>(Inst))
+ InsertReverseInsertPt(BB->getFirstInsertionPt());
+ else
+ InsertReverseInsertPt(std::next(BasicBlock::iterator(Inst)));
+ SetSeq(S_Use);
+ } else if (Seq == S_Release && IsUser(Class)) {
+ DEBUG(dbgs() << " PreciseReleaseUse: Seq: " << GetSeq() << "; "
+ << *Ptr << "\n");
+ // Non-movable releases depend on any possible objc pointer use.
+ SetSeq(S_Stop);
+ assert(!HasReverseInsertPts());
+ // As above; handle invoke specially.
+ if (isa<InvokeInst>(Inst))
+ InsertReverseInsertPt(BB->getFirstInsertionPt());
+ else
+ InsertReverseInsertPt(std::next(BasicBlock::iterator(Inst)));
+ }
+ break;
+ case S_Stop:
+ if (CanUse(Inst, Ptr, PA, Class)) {
+ DEBUG(dbgs() << " PreciseStopUse: Seq: " << GetSeq() << "; "
+ << *Ptr << "\n");
+ SetSeq(S_Use);
+ }
+ break;
+ case S_CanRelease:
+ case S_Use:
+ case S_None:
+ break;
+ case S_Retain:
+ llvm_unreachable("bottom-up pointer in retain state!");
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// TopDownPtrState
+//===----------------------------------------------------------------------===//
+
+bool TopDownPtrState::InitTopDown(ARCInstKind Kind, Instruction *I) {
+ bool NestingDetected = false;
+ // Don't do retain+release tracking for ARCInstKind::RetainRV, because it's
+ // better to let it remain as the first instruction after a call.
+ if (Kind != ARCInstKind::RetainRV) {
+ // If we see two retains in a row on the same pointer, make a note, and
+ // we'll circle back to revisit it after we've hopefully eliminated the
+ // second retain, which may allow us to eliminate the first retain too.
+ // Theoretically we could implement removal of nested retain+release
+ // pairs by making PtrState hold a stack of states, but this is
+ // simple and avoids adding overhead for the non-nested case.
+ if (GetSeq() == S_Retain)
+ NestingDetected = true;
+
+ ResetSequenceProgress(S_Retain);
+ SetKnownSafe(HasKnownPositiveRefCount());
+ InsertCall(I);
+ }
+
+ SetKnownPositiveRefCount();
+ return NestingDetected;
+}
+
+bool TopDownPtrState::MatchWithRelease(ARCMDKindCache &Cache,
+ Instruction *Release) {
+ ClearKnownPositiveRefCount();
+
+ Sequence OldSeq = GetSeq();
+
+ MDNode *ReleaseMetadata =
+ Release->getMetadata(Cache.get(ARCMDKindID::ImpreciseRelease));
+
+ switch (OldSeq) {
+ case S_Retain:
+ case S_CanRelease:
+ if (OldSeq == S_Retain || ReleaseMetadata != nullptr)
+ ClearReverseInsertPts();
+ // FALL THROUGH
+ case S_Use:
+ SetReleaseMetadata(ReleaseMetadata);
+ SetTailCallRelease(cast<CallInst>(Release)->isTailCall());
+ return true;
+ case S_None:
+ return false;
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ llvm_unreachable("top-down pointer in bottom up state!");
+ }
+ llvm_unreachable("Sequence unknown enum value");
+}
+
+bool TopDownPtrState::HandlePotentialAlterRefCount(Instruction *Inst,
+ const Value *Ptr,
+ ProvenanceAnalysis &PA,
+ ARCInstKind Class) {
+ // Check for possible releases.
+ if (!CanAlterRefCount(Inst, Ptr, PA, Class))
+ return false;
+
+ DEBUG(dbgs() << " CanAlterRefCount: Seq: " << GetSeq() << "; " << *Ptr
+ << "\n");
+ ClearKnownPositiveRefCount();
+ switch (GetSeq()) {
+ case S_Retain:
+ SetSeq(S_CanRelease);
+ assert(!HasReverseInsertPts());
+ InsertReverseInsertPt(Inst);
+
+ // One call can't cause a transition from S_Retain to S_CanRelease
+ // and S_CanRelease to S_Use. If we've made the first transition,
+ // we're done.
+ return true;
+ case S_Use:
+ case S_CanRelease:
+ case S_None:
+ return false;
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ llvm_unreachable("top-down pointer in release state!");
+ }
+ llvm_unreachable("covered switch is not covered!?");
+}
+
+void TopDownPtrState::HandlePotentialUse(Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA,
+ ARCInstKind Class) {
+ // Check for possible direct uses.
+ switch (GetSeq()) {
+ case S_CanRelease:
+ if (!CanUse(Inst, Ptr, PA, Class))
+ return;
+ DEBUG(dbgs() << " CanUse: Seq: " << GetSeq() << "; " << *Ptr
+ << "\n");
+ SetSeq(S_Use);
+ return;
+ case S_Retain:
+ case S_Use:
+ case S_None:
+ return;
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ llvm_unreachable("top-down pointer in release state!");
+ }
+}
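Top-down matching, reduced to a decision table. This is a sketch under the assumption that only the sequence state and the presence of clang.imprecise_release metadata matter; the real MatchWithRelease also records the metadata and the tail-call bit:

    enum Sequence { S_None, S_Retain, S_CanRelease, S_Use, S_Stop,
                    S_Release, S_MovableRelease };

    // Can a top-down sequence in state Seq be closed by this release?
    // ClearInsertPts reports whether cached insert points must be dropped.
    bool matchWithRelease(Sequence Seq, bool HasImpreciseReleaseMD,
                          bool &ClearInsertPts) {
      ClearInsertPts = false;
      switch (Seq) {
      case S_Retain:
      case S_CanRelease:
        ClearInsertPts = (Seq == S_Retain) || HasImpreciseReleaseMD;
        return true;
      case S_Use:
        return true;
      case S_None:
        return false;
      default:                     // bottom-up states never occur top-down
        return false;
      }
    }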
diff --git a/lib/Transforms/ObjCARC/PtrState.h b/lib/Transforms/ObjCARC/PtrState.h
new file mode 100644
index 0000000..e45e1ea
--- /dev/null
+++ b/lib/Transforms/ObjCARC/PtrState.h
@@ -0,0 +1,210 @@
+//===--- PtrState.h - ARC State for a Ptr -------------------*- C++ -*-----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains declarations for the ARC state associated with a ptr. It
+// is only used by the ARC Sequence Dataflow computation. By separating this
+// from the actual dataflow, it is easier to consider the mechanics of the ARC
+// optimization separate from the actual predicates being used.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_PTRSTATE_H
+#define LLVM_LIB_TRANSFORMS_OBJCARC_PTRSTATE_H
+
+#include "ARCInstKind.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Debug.h"
+
+namespace llvm {
+namespace objcarc {
+
+class ARCMDKindCache;
+class ProvenanceAnalysis;
+
+/// \enum Sequence
+///
+/// \brief A sequence of states that a pointer may go through in which an
+/// objc_retain and objc_release are actually needed.
+enum Sequence {
+ S_None,
+ S_Retain, ///< objc_retain(x).
+ S_CanRelease, ///< foo(x) -- x could possibly see a ref count decrement.
+ S_Use, ///< any use of x.
+ S_Stop, ///< like S_Release, but code motion is stopped.
+ S_Release, ///< objc_release(x).
+ S_MovableRelease ///< objc_release(x), !clang.imprecise_release.
+};
+
+raw_ostream &operator<<(raw_ostream &OS,
+ const Sequence S) LLVM_ATTRIBUTE_UNUSED;
+
+/// \brief Unidirectional information about either a
+/// retain-decrement-use-release sequence or release-use-decrement-retain
+/// reverse sequence.
+struct RRInfo {
+ /// After an objc_retain, the reference count of the referenced
+ /// object is known to be positive. Similarly, before an objc_release, the
+ /// reference count of the referenced object is known to be positive. If
+ /// there are retain-release pairs in code regions where the retain count
+ /// is known to be positive, they can be eliminated, regardless of any side
+ /// effects between them.
+ ///
+ /// Also, a retain+release pair nested within another retain+release
+ /// pair all on the known same pointer value can be eliminated, regardless
+ /// of any intervening side effects.
+ ///
+ /// KnownSafe is true when either of these conditions is satisfied.
+ bool KnownSafe;
+
+ /// True if the objc_release calls are all marked with the "tail" keyword.
+ bool IsTailCallRelease;
+
+ /// If the Calls are objc_release calls and they all have a
+ /// clang.imprecise_release tag, this is the metadata tag.
+ MDNode *ReleaseMetadata;
+
+ /// For a top-down sequence, the set of objc_retains or
+ /// objc_retainBlocks. For bottom-up, the set of objc_releases.
+ SmallPtrSet<Instruction *, 2> Calls;
+
+ /// The set of optimal insert positions for moving calls in the opposite
+ /// sequence.
+ SmallPtrSet<Instruction *, 2> ReverseInsertPts;
+
+ /// If this is true, we cannot perform code motion but can still remove
+ /// retain/release pairs.
+ bool CFGHazardAfflicted;
+
+ RRInfo()
+ : KnownSafe(false), IsTailCallRelease(false), ReleaseMetadata(nullptr),
+ CFGHazardAfflicted(false) {}
+
+ void clear();
+
+ /// Conservatively merge the two RRInfo. Returns true if a partial merge has
+ /// occurred, false otherwise.
+ bool Merge(const RRInfo &Other);
+};
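RRInfo::Merge is only declared in this header; its definition lives in PtrState.cpp. As a hedged sketch of what a conservative merge plausibly does (every field moves toward the value that permits fewer optimizations; the details below are assumptions, not the actual implementation):

    #include <set>

    struct RRInfoSketch {
      bool KnownSafe = false;
      bool IsTailCallRelease = false;
      bool CFGHazardAfflicted = false;
      const void *ReleaseMetadata = nullptr;
      std::set<const void *> Calls, ReverseInsertPts;

      // Returns true if the merge lost information (a "partial" merge).
      bool Merge(const RRInfoSketch &Other) {
        KnownSafe &= Other.KnownSafe;
        IsTailCallRelease &= Other.IsTailCallRelease;
        CFGHazardAfflicted |= Other.CFGHazardAfflicted;
        if (ReleaseMetadata != Other.ReleaseMetadata)
          ReleaseMetadata = nullptr;           // disagreeing tags: drop
        Calls.insert(Other.Calls.begin(), Other.Calls.end());
        bool Partial = ReverseInsertPts != Other.ReverseInsertPts;
        ReverseInsertPts.insert(Other.ReverseInsertPts.begin(),
                                Other.ReverseInsertPts.end());
        return Partial;
      }
    };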
+
+/// \brief This class summarizes several per-pointer runtime properties which
+/// are propagated through the flow graph.
+class PtrState {
+protected:
+ /// True if the reference count is known to be incremented.
+ bool KnownPositiveRefCount;
+
+ /// True if we've seen an opportunity for partial RR elimination, such as
+ /// pushing calls into a CFG triangle or into one side of a CFG diamond.
+ bool Partial;
+
+ /// The current position in the sequence.
+ unsigned char Seq : 8;
+
+ /// Unidirectional information about the current sequence.
+ RRInfo RRI;
+
+ PtrState() : KnownPositiveRefCount(false), Partial(false), Seq(S_None) {}
+
+public:
+ bool IsKnownSafe() const { return RRI.KnownSafe; }
+
+ void SetKnownSafe(const bool NewValue) { RRI.KnownSafe = NewValue; }
+
+ bool IsTailCallRelease() const { return RRI.IsTailCallRelease; }
+
+ void SetTailCallRelease(const bool NewValue) {
+ RRI.IsTailCallRelease = NewValue;
+ }
+
+ bool IsTrackingImpreciseReleases() const {
+ return RRI.ReleaseMetadata != nullptr;
+ }
+
+ const MDNode *GetReleaseMetadata() const { return RRI.ReleaseMetadata; }
+
+ void SetReleaseMetadata(MDNode *NewValue) { RRI.ReleaseMetadata = NewValue; }
+
+ bool IsCFGHazardAfflicted() const { return RRI.CFGHazardAfflicted; }
+
+ void SetCFGHazardAfflicted(const bool NewValue) {
+ RRI.CFGHazardAfflicted = NewValue;
+ }
+
+ void SetKnownPositiveRefCount();
+ void ClearKnownPositiveRefCount();
+
+ bool HasKnownPositiveRefCount() const { return KnownPositiveRefCount; }
+
+ void SetSeq(Sequence NewSeq);
+
+ Sequence GetSeq() const { return static_cast<Sequence>(Seq); }
+
+ void ClearSequenceProgress() { ResetSequenceProgress(S_None); }
+
+ void ResetSequenceProgress(Sequence NewSeq);
+ void Merge(const PtrState &Other, bool TopDown);
+
+ void InsertCall(Instruction *I) { RRI.Calls.insert(I); }
+
+ void InsertReverseInsertPt(Instruction *I) { RRI.ReverseInsertPts.insert(I); }
+
+ void ClearReverseInsertPts() { RRI.ReverseInsertPts.clear(); }
+
+ bool HasReverseInsertPts() const { return !RRI.ReverseInsertPts.empty(); }
+
+ const RRInfo &GetRRInfo() const { return RRI; }
+};
+
+struct BottomUpPtrState : PtrState {
+ BottomUpPtrState() : PtrState() {}
+
+ /// (Re-)Initialize this bottom up pointer returning true if we detected a
+ /// pointer with nested releases.
+ bool InitBottomUp(ARCMDKindCache &Cache, Instruction *I);
+
+ /// Return true if this set of releases can be paired with a retain. Modifies
+ /// state appropriately to reflect that the matching occurred if it is
+ /// successful.
+ ///
+ /// It is assumed that one has already checked that the RCIdentity of the
+ /// retain and the RCIdentity of this ptr state are the same.
+ bool MatchWithRetain();
+
+ void HandlePotentialUse(BasicBlock *BB, Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA, ARCInstKind Class);
+ bool HandlePotentialAlterRefCount(Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA, ARCInstKind Class);
+};
+
+struct TopDownPtrState : PtrState {
+ TopDownPtrState() : PtrState() {}
+
+ /// (Re-)Initialize this top down pointer returning true if we detected a
+ /// pointer with nested retains.
+ bool InitTopDown(ARCInstKind Kind, Instruction *I);
+
+ /// Return true if this set of retains can be paired with the given
+ /// release. Modifies state appropriately to reflect that the matching
+ /// occurred.
+ bool MatchWithRelease(ARCMDKindCache &Cache, Instruction *Release);
+
+ void HandlePotentialUse(Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA, ARCInstKind Class);
+
+ bool HandlePotentialAlterRefCount(Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA, ARCInstKind Class);
+};
+
+} // end namespace objcarc
+} // end namespace llvm
+
+#endif
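To see how these classes are meant to be driven, here is a toy bottom-up pass over one block. Everything below is illustrative scaffolding (the real driver is the dataflow in ObjCARCOpts.cpp, which walks LLVM instructions and consults ProvenanceAnalysis):

    #include <map>
    #include <utility>
    #include <vector>

    enum class Kind { Retain, Release, Call, Use };
    struct Inst { Kind K; int Ptr; };            // toy instruction

    // One reverse pass: a release opens a sequence for its pointer, an
    // earlier retain closes it, and the pair becomes a removal candidate.
    std::vector<std::pair<int, int>>
    visitBottomUp(const std::vector<Inst> &Block) {
      std::map<int, bool> Open;                  // Ptr -> sequence in flight
      std::map<int, int> ReleaseAt;              // Ptr -> index of release
      std::vector<std::pair<int, int>> Pairs;    // (retain idx, release idx)
      for (int i = (int)Block.size() - 1; i >= 0; --i) {
        const Inst &I = Block[i];
        if (I.K == Kind::Release) {              // ~ InitBottomUp
          Open[I.Ptr] = true;
          ReleaseAt[I.Ptr] = i;
        } else if (I.K == Kind::Retain && Open[I.Ptr]) {
          Pairs.push_back({i, ReleaseAt[I.Ptr]}); // ~ MatchWithRetain
          Open[I.Ptr] = false;
        }
        // Kind::Call / Kind::Use would run the HandlePotential* hooks
        // here to advance or abort the in-flight sequences.
      }
      return Pairs;
    }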
diff --git a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
index 5c74885..5aa2b97 100644
--- a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
+++ b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
@@ -23,15 +23,15 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -71,7 +71,6 @@ struct AlignmentFromAssumptions : public FunctionPass {
ScalarEvolution *SE;
DominatorTree *DT;
- const DataLayout *DL;
bool extractAlignmentInfo(CallInst *I, Value *&AAPtr, const SCEV *&AlignSCEV,
const SCEV *&OffSCEV);
@@ -123,7 +122,7 @@ static unsigned getNewAlignmentDiff(const SCEV *DiffSCEV,
// If the displacement is not an exact multiple, but the remainder is a
// constant, then return this remainder (but only if it is a power of 2).
- uint64_t DiffUnitsAbs = abs64(DiffUnits);
+ uint64_t DiffUnitsAbs = std::abs(DiffUnits);
if (isPowerOf2_64(DiffUnitsAbs))
return (unsigned) DiffUnitsAbs;
}
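The std::abs swap above is mechanical, but the remainder rule it feeds deserves a concrete check: if a pointer is A-aligned and the access sits at a constant displacement D, the access is still |D mod A|-aligned whenever that remainder is a power of two. A standalone illustration, independent of SCEV:

    #include <cstdint>
    #include <cstdlib>
    #include <iostream>

    static unsigned newAlignmentFromDiff(int64_t D, uint64_t A) {
      uint64_t Rem = uint64_t(std::abs(D)) % A;  // offset inside one A-block
      if (Rem == 0)
        return unsigned(A);                      // exact multiple: kept
      return (Rem & (Rem - 1)) == 0 ? unsigned(Rem) : 0;  // power of two?
    }

    int main() {
      std::cout << newAlignmentFromDiff(4, 16) << "\n";   // 4
      std::cout << newAlignmentFromDiff(6, 16) << "\n";   // 0 (unknown)
      std::cout << newAlignmentFromDiff(32, 16) << "\n";  // 16
      return 0;
    }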
@@ -316,7 +315,7 @@ bool AlignmentFromAssumptions::processAssumption(CallInst *ACall) {
continue;
if (Instruction *K = dyn_cast<Instruction>(J))
- if (isValidAssumeForContext(ACall, K, DL, DT))
+ if (isValidAssumeForContext(ACall, K, DT))
WorkList.push_back(K);
}
@@ -400,7 +399,7 @@ bool AlignmentFromAssumptions::processAssumption(CallInst *ACall) {
Visited.insert(J);
for (User *UJ : J->users()) {
Instruction *K = cast<Instruction>(UJ);
- if (!Visited.count(K) && isValidAssumeForContext(ACall, K, DL, DT))
+ if (!Visited.count(K) && isValidAssumeForContext(ACall, K, DT))
WorkList.push_back(K);
}
}
@@ -413,8 +412,6 @@ bool AlignmentFromAssumptions::runOnFunction(Function &F) {
auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
SE = &getAnalysis<ScalarEvolution>();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- DL = DLP ? &DLP->getDataLayout() : nullptr;
NewDestAlignments.clear();
NewSrcAlignments.clear();
diff --git a/lib/Transforms/Scalar/Android.mk b/lib/Transforms/Scalar/Android.mk
index ed803cd..cf30f39 100644
--- a/lib/Transforms/Scalar/Android.mk
+++ b/lib/Transforms/Scalar/Android.mk
@@ -20,6 +20,7 @@ transforms_scalar_SRC_FILES := \
LoopDeletion.cpp \
LoopIdiomRecognize.cpp \
LoopInstSimplify.cpp \
+ LoopInterchange.cpp \
LoopRerollPass.cpp \
LoopRotation.cpp \
LoopStrengthReduce.cpp \
diff --git a/lib/Transforms/Scalar/BDCE.cpp b/lib/Transforms/Scalar/BDCE.cpp
index c7bd79d..09c605e 100644
--- a/lib/Transforms/Scalar/BDCE.cpp
+++ b/lib/Transforms/Scalar/BDCE.cpp
@@ -64,7 +64,6 @@ struct BDCE : public FunctionPass {
APInt &KnownZero2, APInt &KnownOne2);
AssumptionCache *AC;
- const DataLayout *DL;
DominatorTree *DT;
};
}
@@ -95,20 +94,21 @@ void BDCE::determineLiveOperandBits(const Instruction *UserI,
// however, want to do this twice, so we cache the result in APInts that live
// in the caller. For the two-relevant-operands case, both operand values are
// provided here.
- auto ComputeKnownBits = [&](unsigned BitWidth, const Value *V1,
- const Value *V2) {
- KnownZero = APInt(BitWidth, 0);
- KnownOne = APInt(BitWidth, 0);
- computeKnownBits(const_cast<Value*>(V1), KnownZero, KnownOne, DL, 0, AC,
- UserI, DT);
-
- if (V2) {
- KnownZero2 = APInt(BitWidth, 0);
- KnownOne2 = APInt(BitWidth, 0);
- computeKnownBits(const_cast<Value*>(V2), KnownZero2, KnownOne2, DL, 0, AC,
- UserI, DT);
- }
- };
+ auto ComputeKnownBits =
+ [&](unsigned BitWidth, const Value *V1, const Value *V2) {
+ const DataLayout &DL = I->getModule()->getDataLayout();
+ KnownZero = APInt(BitWidth, 0);
+ KnownOne = APInt(BitWidth, 0);
+ computeKnownBits(const_cast<Value *>(V1), KnownZero, KnownOne, DL, 0,
+ AC, UserI, DT);
+
+ if (V2) {
+ KnownZero2 = APInt(BitWidth, 0);
+ KnownOne2 = APInt(BitWidth, 0);
+ computeKnownBits(const_cast<Value *>(V2), KnownZero2, KnownOne2, DL,
+ 0, AC, UserI, DT);
+ }
+ };
switch (UserI->getOpcode()) {
default: break;
@@ -263,7 +263,6 @@ bool BDCE::runOnFunction(Function& F) {
return false;
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- DL = F.getParent()->getDataLayout();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
DenseMap<Instruction *, APInt> AliveBits;
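BDCE's per-instruction liveness is driven by these cached known-bits queries. A toy 8-bit version of the AND transfer function shows why: once every result bit is known, everything feeding the instruction computes only dead bits. (Assumed semantics mirroring computeKnownBits; LLVM's APInt-based version generalizes this to any width.)

    #include <cstdint>
    #include <iostream>

    struct KnownBits8 { uint8_t Zero, One; };    // toy 8-bit lattice

    // For C = A & B: a bit of C is known zero if known zero in either
    // operand, and known one only if known one in both.
    KnownBits8 knownAnd(KnownBits8 A, KnownBits8 B) {
      return { uint8_t(A.Zero | B.Zero), uint8_t(A.One & B.One) };
    }

    int main() {
      KnownBits8 A{0xF0, 0x0F};   // value is exactly 0x0F
      KnownBits8 B{0x0F, 0x00};   // low nibble zero, high nibble unknown
      KnownBits8 C = knownAnd(A, B);
      std::cout << std::hex << int(C.Zero) << ' ' << int(C.One) << "\n";
      // prints "ff 0": all eight result bits are known zero
      return 0;
    }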
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt
index d297eb1..d12fdb7 100644
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -18,6 +18,7 @@ add_llvm_library(LLVMScalarOpts
LoopDeletion.cpp
LoopIdiomRecognize.cpp
LoopInstSimplify.cpp
+ LoopInterchange.cpp
LoopRerollPass.cpp
LoopRotation.cpp
LoopStrengthReduce.cpp
diff --git a/lib/Transforms/Scalar/ConstantHoisting.cpp b/lib/Transforms/Scalar/ConstantHoisting.cpp
index e3aab4b..4288742 100644
--- a/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -43,6 +43,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#include <tuple>
using namespace llvm;
diff --git a/lib/Transforms/Scalar/ConstantProp.cpp b/lib/Transforms/Scalar/ConstantProp.cpp
index 29d4e05..c974ebb 100644
--- a/lib/Transforms/Scalar/ConstantProp.cpp
+++ b/lib/Transforms/Scalar/ConstantProp.cpp
@@ -22,7 +22,6 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/IR/Constant.h"
-#include "llvm/IR/DataLayout.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Pass.h"
@@ -68,8 +67,7 @@ bool ConstantPropagation::runOnFunction(Function &F) {
WorkList.insert(&*i);
}
bool Changed = false;
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
+ const DataLayout &DL = F.getParent()->getDataLayout();
TargetLibraryInfo *TLI =
&getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 5a3b5cf..912d527 100644
--- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -19,6 +19,7 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -126,8 +127,9 @@ bool CorrelatedValuePropagation::processPHI(PHINode *P) {
Changed = true;
}
- // FIXME: Provide DL, TLI, DT, AT to SimplifyInstruction.
- if (Value *V = SimplifyInstruction(P)) {
+ // FIXME: Provide TLI, DT, AT to SimplifyInstruction.
+ const DataLayout &DL = BB->getModule()->getDataLayout();
+ if (Value *V = SimplifyInstruction(P, DL)) {
P->replaceAllUsesWith(V);
P->eraseFromParent();
Changed = true;
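This hunk shows the idiom the whole commit installs: the optional DataLayoutPass is gone, and a never-null DataLayout reference is fetched from the enclosing Module. A minimal sketch of the pattern, using only the APIs this diff itself relies on:

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Module.h"

    using namespace llvm;

    // DL is a reference now, so the old "if (DL)" null guards disappear.
    const DataLayout &dataLayoutFor(const BasicBlock &BB) {
      return BB.getModule()->getDataLayout();
    }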
diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp
index c2ce1d5..cb8981b 100644
--- a/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -23,6 +23,7 @@
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -33,7 +34,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -78,7 +79,8 @@ namespace {
bool HandleFree(CallInst *F);
bool handleEndBlock(BasicBlock &BB);
void RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc,
- SmallSetVector<Value*, 16> &DeadStackObjects);
+ SmallSetVector<Value *, 16> &DeadStackObjects,
+ const DataLayout &DL);
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
@@ -194,18 +196,12 @@ static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo *TLI) {
/// describe the memory operations for this instruction.
static AliasAnalysis::Location
getLocForWrite(Instruction *Inst, AliasAnalysis &AA) {
- const DataLayout *DL = AA.getDataLayout();
if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
return AA.getLocation(SI);
if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Inst)) {
// memcpy/memmove/memset.
AliasAnalysis::Location Loc = AA.getLocationForDest(MI);
- // If we don't have target data around, an unknown size in Location means
- // that we should use the size of the pointee type. This isn't valid for
- // memset/memcpy, which writes more than an i8.
- if (Loc.Size == AliasAnalysis::UnknownSize && DL == nullptr)
- return AliasAnalysis::Location();
return Loc;
}
@@ -215,11 +211,6 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) {
switch (II->getIntrinsicID()) {
default: return AliasAnalysis::Location(); // Unhandled intrinsic.
case Intrinsic::init_trampoline:
- // If we don't have target data around, an unknown size in Location means
- // that we should use the size of the pointee type. This isn't valid for
- // init.trampoline, which writes more than an i8.
- if (!DL) return AliasAnalysis::Location();
-
// FIXME: We don't know the size of the trampoline, so we can't really
// handle it here.
return AliasAnalysis::Location(II->getArgOperand(0));
@@ -321,9 +312,10 @@ static Value *getStoredPointerOperand(Instruction *I) {
return CS.getArgument(0);
}
-static uint64_t getPointerSize(const Value *V, AliasAnalysis &AA) {
+static uint64_t getPointerSize(const Value *V, const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
uint64_t Size;
- if (getObjectSize(V, Size, AA.getDataLayout(), AA.getTargetLibraryInfo()))
+ if (getObjectSize(V, Size, DL, TLI))
return Size;
return AliasAnalysis::UnknownSize;
}
@@ -343,10 +335,9 @@ namespace {
/// overwritten by 'Later', or 'OverwriteUnknown' if nothing can be determined
static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later,
const AliasAnalysis::Location &Earlier,
- AliasAnalysis &AA,
- int64_t &EarlierOff,
- int64_t &LaterOff) {
- const DataLayout *DL = AA.getDataLayout();
+ const DataLayout &DL,
+ const TargetLibraryInfo *TLI,
+ int64_t &EarlierOff, int64_t &LaterOff) {
const Value *P1 = Earlier.Ptr->stripPointerCasts();
const Value *P2 = Later.Ptr->stripPointerCasts();
@@ -367,7 +358,7 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later,
// Otherwise, we have to have size information, and the later store has to be
// larger than the earlier one.
if (Later.Size == AliasAnalysis::UnknownSize ||
- Earlier.Size == AliasAnalysis::UnknownSize || DL == nullptr)
+ Earlier.Size == AliasAnalysis::UnknownSize)
return OverwriteUnknown;
// Check to see if the later store is to the entire object (either a global,
@@ -382,7 +373,7 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later,
return OverwriteUnknown;
// If the "Later" store is to a recognizable object, get its size.
- uint64_t ObjectSize = getPointerSize(UO2, AA);
+ uint64_t ObjectSize = getPointerSize(UO2, DL, TLI);
if (ObjectSize != AliasAnalysis::UnknownSize)
if (ObjectSize == Later.Size && ObjectSize >= Earlier.Size)
return OverwriteComplete;
@@ -560,8 +551,10 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {
if (isRemovable(DepWrite) &&
!isPossibleSelfRead(Inst, Loc, DepWrite, *AA)) {
int64_t InstWriteOffset, DepWriteOffset;
- OverwriteResult OR = isOverwrite(Loc, DepLoc, *AA,
- DepWriteOffset, InstWriteOffset);
+ const DataLayout &DL = BB.getModule()->getDataLayout();
+ OverwriteResult OR =
+ isOverwrite(Loc, DepLoc, DL, AA->getTargetLibraryInfo(),
+ DepWriteOffset, InstWriteOffset);
if (OR == OverwriteComplete) {
DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: "
<< *DepWrite << "\n KILLER: " << *Inst << '\n');
@@ -655,6 +648,7 @@ bool DSE::HandleFree(CallInst *F) {
AliasAnalysis::Location Loc = AliasAnalysis::Location(F->getOperand(0));
SmallVector<BasicBlock *, 16> Blocks;
Blocks.push_back(F->getParent());
+ const DataLayout &DL = F->getModule()->getDataLayout();
while (!Blocks.empty()) {
BasicBlock *BB = Blocks.pop_back_val();
@@ -668,7 +662,7 @@ bool DSE::HandleFree(CallInst *F) {
break;
Value *DepPointer =
- GetUnderlyingObject(getStoredPointerOperand(Dependency));
+ GetUnderlyingObject(getStoredPointerOperand(Dependency), DL);
// Check for aliasing.
if (!AA->isMustAlias(F->getArgOperand(0), DepPointer))
@@ -728,6 +722,8 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
if (AI->hasByValOrInAllocaAttr())
DeadStackObjects.insert(AI);
+ const DataLayout &DL = BB.getModule()->getDataLayout();
+
// Scan the basic block backwards
for (BasicBlock::iterator BBI = BB.end(); BBI != BB.begin(); ){
--BBI;
@@ -736,7 +732,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
if (hasMemoryWrite(BBI, TLI) && isRemovable(BBI)) {
// See through pointer-to-pointer bitcasts
SmallVector<Value *, 4> Pointers;
- GetUnderlyingObjects(getStoredPointerOperand(BBI), Pointers);
+ GetUnderlyingObjects(getStoredPointerOperand(BBI), Pointers, DL);
// Stores to stack values are valid candidates for removal.
bool AllDead = true;
@@ -799,8 +795,8 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
// the call is live.
DeadStackObjects.remove_if([&](Value *I) {
// See if the call site touches the value.
- AliasAnalysis::ModRefResult A =
- AA->getModRefInfo(CS, I, getPointerSize(I, *AA));
+ AliasAnalysis::ModRefResult A = AA->getModRefInfo(
+ CS, I, getPointerSize(I, DL, AA->getTargetLibraryInfo()));
return A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref;
});
@@ -835,7 +831,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
// Remove any allocas from the DeadPointer set that are loaded, as this
// makes any stores above the access live.
- RemoveAccessedObjects(LoadedLoc, DeadStackObjects);
+ RemoveAccessedObjects(LoadedLoc, DeadStackObjects, DL);
// If all of the allocas were clobbered by the access then we're not going
// to find anything else to process.
@@ -850,8 +846,9 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
/// of the stack objects in the DeadStackObjects set. If so, they become live
/// because the location is being loaded.
void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc,
- SmallSetVector<Value*, 16> &DeadStackObjects) {
- const Value *UnderlyingPointer = GetUnderlyingObject(LoadedLoc.Ptr);
+ SmallSetVector<Value *, 16> &DeadStackObjects,
+ const DataLayout &DL) {
+ const Value *UnderlyingPointer = GetUnderlyingObject(LoadedLoc.Ptr, DL);
// A constant can't be in the dead pointer set.
if (isa<Constant>(UnderlyingPointer))
@@ -867,7 +864,8 @@ void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc,
// Remove objects that could alias LoadedLoc.
DeadStackObjects.remove_if([&](Value *I) {
// See if the loaded location could alias the stack location.
- AliasAnalysis::Location StackLoc(I, getPointerSize(I, *AA));
+ AliasAnalysis::Location StackLoc(
+ I, getPointerSize(I, DL, AA->getTargetLibraryInfo()));
return !AA->isNoAlias(StackLoc, LoadedLoc);
});
}
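Once both offsets and sizes are known, the rewritten isOverwrite reduces to an interval-containment test. A standalone sketch of just that arithmetic (the real function additionally handles UnknownSize, pointer-base matching, and object-size queries via getObjectSize):

    #include <cstdint>
    #include <iostream>

    // True when [LaterOff, LaterOff+LaterSize) covers
    // [EarlierOff, EarlierOff+EarlierSize): the earlier store is dead.
    bool laterCoversEarlier(int64_t EarlierOff, uint64_t EarlierSize,
                            int64_t LaterOff, uint64_t LaterSize) {
      return LaterOff <= EarlierOff &&
             LaterOff + int64_t(LaterSize) >=
                 EarlierOff + int64_t(EarlierSize);
    }

    int main() {
      // store i16 at offset 2, then store i64 at offset 0 of the object:
      std::cout << laterCoversEarlier(2, 2, 0, 8) << "\n";  // 1: dead store
      std::cout << laterCoversEarlier(0, 8, 2, 2) << "\n";  // 0: partial
      return 0;
    }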
diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp
index 9309623..d5b9e03 100644
--- a/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -18,6 +18,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
@@ -27,7 +28,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/RecyclingAllocator.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
#include <deque>
@@ -263,7 +264,6 @@ namespace {
class EarlyCSE {
public:
Function &F;
- const DataLayout *DL;
const TargetLibraryInfo &TLI;
const TargetTransformInfo &TTI;
DominatorTree &DT;
@@ -308,11 +308,10 @@ public:
unsigned CurrentGeneration;
/// \brief Set up the EarlyCSE runner for a particular function.
- EarlyCSE(Function &F, const DataLayout *DL, const TargetLibraryInfo &TLI,
+ EarlyCSE(Function &F, const TargetLibraryInfo &TLI,
const TargetTransformInfo &TTI, DominatorTree &DT,
AssumptionCache &AC)
- : F(F), DL(DL), TLI(TLI), TTI(TTI), DT(DT), AC(AC), CurrentGeneration(0) {
- }
+ : F(F), TLI(TLI), TTI(TTI), DT(DT), AC(AC), CurrentGeneration(0) {}
bool run();
@@ -469,6 +468,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
Instruction *LastStore = nullptr;
bool Changed = false;
+ const DataLayout &DL = BB->getModule()->getDataLayout();
// See if any instructions in the block can be eliminated. If so, do it. If
// not, add them to AvailableValues.
@@ -685,14 +685,12 @@ bool EarlyCSE::run() {
PreservedAnalyses EarlyCSEPass::run(Function &F,
AnalysisManager<Function> *AM) {
- const DataLayout *DL = F.getParent()->getDataLayout();
-
auto &TLI = AM->getResult<TargetLibraryAnalysis>(F);
auto &TTI = AM->getResult<TargetIRAnalysis>(F);
auto &DT = AM->getResult<DominatorTreeAnalysis>(F);
auto &AC = AM->getResult<AssumptionAnalysis>(F);
- EarlyCSE CSE(F, DL, TLI, TTI, DT, AC);
+ EarlyCSE CSE(F, TLI, TTI, DT, AC);
if (!CSE.run())
return PreservedAnalyses::all();
@@ -724,14 +722,12 @@ public:
if (skipOptnoneFunction(F))
return false;
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- auto *DL = DLP ? &DLP->getDataLayout() : nullptr;
auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- EarlyCSE CSE(F, DL, TLI, TTI, DT, AC);
+ EarlyCSE CSE(F, TLI, TTI, DT, AC);
return CSE.run();
}
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index 73a1f25..c73e60f 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -33,6 +33,7 @@
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/PHITransAddr.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
@@ -45,7 +46,7 @@
#include "llvm/Support/Allocator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -584,14 +585,13 @@ namespace {
/// Emit code into this block to adjust the value defined here to the
/// specified type. This handles various coercion cases.
- Value *MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) const;
+ Value *MaterializeAdjustedValue(LoadInst *LI, GVN &gvn) const;
};
class GVN : public FunctionPass {
bool NoLoads;
MemoryDependenceAnalysis *MD;
DominatorTree *DT;
- const DataLayout *DL;
const TargetLibraryInfo *TLI;
AssumptionCache *AC;
SetVector<BasicBlock *> DeadBlocks;
@@ -630,7 +630,6 @@ namespace {
InstrsToErase.push_back(I);
}
- const DataLayout *getDataLayout() const { return DL; }
DominatorTree &getDominatorTree() const { return *DT; }
AliasAnalysis *getAliasAnalysis() const { return VN.getAliasAnalysis(); }
MemoryDependenceAnalysis &getMemDep() const { return *MD; }
@@ -956,8 +955,9 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
return -1;
int64_t StoreOffset = 0, LoadOffset = 0;
- Value *StoreBase = GetPointerBaseWithConstantOffset(WritePtr,StoreOffset,&DL);
- Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, &DL);
+ Value *StoreBase =
+ GetPointerBaseWithConstantOffset(WritePtr, StoreOffset, DL);
+ Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, DL);
if (StoreBase != LoadBase)
return -1;
@@ -1021,13 +1021,13 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
/// This function is called when we have a
/// memdep query of a load that ends up being a clobbering store.
static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
- StoreInst *DepSI,
- const DataLayout &DL) {
+ StoreInst *DepSI) {
// Cannot handle reading from store of first-class aggregate yet.
if (DepSI->getValueOperand()->getType()->isStructTy() ||
DepSI->getValueOperand()->getType()->isArrayTy())
return -1;
+ const DataLayout &DL = DepSI->getModule()->getDataLayout();
Value *StorePtr = DepSI->getPointerOperand();
uint64_t StoreSize =DL.getTypeSizeInBits(DepSI->getValueOperand()->getType());
return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr,
@@ -1052,11 +1052,11 @@ static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr,
// then we should widen it!
int64_t LoadOffs = 0;
const Value *LoadBase =
- GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, &DL);
+ GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, DL);
unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
- unsigned Size = MemoryDependenceAnalysis::
- getLoadLoadClobberFullWidthSize(LoadBase, LoadOffs, LoadSize, DepLI, DL);
+ unsigned Size = MemoryDependenceAnalysis::getLoadLoadClobberFullWidthSize(
+ LoadBase, LoadOffs, LoadSize, DepLI);
if (Size == 0) return -1;
return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size*8, DL);
@@ -1086,7 +1086,7 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
Constant *Src = dyn_cast<Constant>(MTI->getSource());
if (!Src) return -1;
- GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, &DL));
+ GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, DL));
if (!GV || !GV->isConstant()) return -1;
// See if the access is within the bounds of the transfer.
@@ -1104,7 +1104,7 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
Src = ConstantExpr::getGetElementPtr(Src, OffsetCst);
Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
- if (ConstantFoldLoadFromConstPtr(Src, &DL))
+ if (ConstantFoldLoadFromConstPtr(Src, DL))
return Offset;
return -1;
}
@@ -1157,7 +1157,7 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset,
static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
Type *LoadTy, Instruction *InsertPt,
GVN &gvn) {
- const DataLayout &DL = *gvn.getDataLayout();
+ const DataLayout &DL = SrcVal->getModule()->getDataLayout();
// If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to
// widen SrcVal out to a larger load.
unsigned SrcValSize = DL.getTypeStoreSize(SrcVal->getType());
@@ -1265,7 +1265,7 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
Src = ConstantExpr::getGetElementPtr(Src, OffsetCst);
Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
- return ConstantFoldLoadFromConstPtr(Src, &DL);
+ return ConstantFoldLoadFromConstPtr(Src, DL);
}
@@ -1281,7 +1281,7 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI,
gvn.getDominatorTree().properlyDominates(ValuesPerBlock[0].BB,
LI->getParent())) {
assert(!ValuesPerBlock[0].isUndefValue() && "Dead BB dominate this block");
- return ValuesPerBlock[0].MaterializeAdjustedValue(LI->getType(), gvn);
+ return ValuesPerBlock[0].MaterializeAdjustedValue(LI, gvn);
}
// Otherwise, we have to construct SSA form.
@@ -1289,8 +1289,6 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI,
SSAUpdater SSAUpdate(&NewPHIs);
SSAUpdate.Initialize(LI->getType(), LI->getName());
- Type *LoadTy = LI->getType();
-
for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) {
const AvailableValueInBlock &AV = ValuesPerBlock[i];
BasicBlock *BB = AV.BB;
@@ -1298,7 +1296,7 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI,
if (SSAUpdate.HasValueForBlock(BB))
continue;
- SSAUpdate.AddAvailableValue(BB, AV.MaterializeAdjustedValue(LoadTy, gvn));
+ SSAUpdate.AddAvailableValue(BB, AV.MaterializeAdjustedValue(LI, gvn));
}
// Perform PHI construction.
@@ -1326,16 +1324,16 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI,
return V;
}
-Value *AvailableValueInBlock::MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) const {
+Value *AvailableValueInBlock::MaterializeAdjustedValue(LoadInst *LI,
+ GVN &gvn) const {
Value *Res;
+ Type *LoadTy = LI->getType();
+ const DataLayout &DL = LI->getModule()->getDataLayout();
if (isSimpleValue()) {
Res = getSimpleValue();
if (Res->getType() != LoadTy) {
- const DataLayout *DL = gvn.getDataLayout();
- assert(DL && "Need target data to handle type mismatch case");
- Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(),
- *DL);
-
+ Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(), DL);
+
DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " "
<< *getSimpleValue() << '\n'
<< *Res << '\n' << "\n\n\n");
@@ -1353,10 +1351,8 @@ Value *AvailableValueInBlock::MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) c
<< *Res << '\n' << "\n\n\n");
}
} else if (isMemIntrinValue()) {
- const DataLayout *DL = gvn.getDataLayout();
- assert(DL && "Need target data to handle type mismatch case");
- Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset,
- LoadTy, BB->getTerminator(), *DL);
+ Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy,
+ BB->getTerminator(), DL);
DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
<< " " << *getMemIntrinValue() << '\n'
<< *Res << '\n' << "\n\n\n");
@@ -1383,6 +1379,7 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,
// dependencies that produce an unknown value for the load (such as a call
// that could potentially clobber the load).
unsigned NumDeps = Deps.size();
+ const DataLayout &DL = LI->getModule()->getDataLayout();
for (unsigned i = 0, e = NumDeps; i != e; ++i) {
BasicBlock *DepBB = Deps[i].getBB();
MemDepResult DepInfo = Deps[i].getResult();
@@ -1409,9 +1406,9 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,
// read by the load, we can extract the bits we need for the load from the
// stored value.
if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInfo.getInst())) {
- if (DL && Address) {
- int Offset = AnalyzeLoadFromClobberingStore(LI->getType(), Address,
- DepSI, *DL);
+ if (Address) {
+ int Offset =
+ AnalyzeLoadFromClobberingStore(LI->getType(), Address, DepSI);
if (Offset != -1) {
ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB,
DepSI->getValueOperand(),
@@ -1428,9 +1425,9 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,
if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInfo.getInst())) {
// If this is a clobber and L is the first instruction in its block, then
// we have the first instruction in the entry block.
- if (DepLI != LI && Address && DL) {
- int Offset = AnalyzeLoadFromClobberingLoad(LI->getType(), Address,
- DepLI, *DL);
+ if (DepLI != LI && Address) {
+ int Offset =
+ AnalyzeLoadFromClobberingLoad(LI->getType(), Address, DepLI, DL);
if (Offset != -1) {
ValuesPerBlock.push_back(AvailableValueInBlock::getLoad(DepBB,DepLI,
@@ -1443,9 +1440,9 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,
// If the clobbering value is a memset/memcpy/memmove, see if we can
// forward a value on from it.
if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInfo.getInst())) {
- if (DL && Address) {
+ if (Address) {
int Offset = AnalyzeLoadFromClobberingMemInst(LI->getType(), Address,
- DepMI, *DL);
+ DepMI, DL);
if (Offset != -1) {
ValuesPerBlock.push_back(AvailableValueInBlock::getMI(DepBB, DepMI,
Offset));
@@ -1484,8 +1481,8 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,
if (S->getValueOperand()->getType() != LI->getType()) {
// If the stored value is larger or equal to the loaded value, we can
// reuse it.
- if (!DL || !CanCoerceMustAliasedValueToLoad(S->getValueOperand(),
- LI->getType(), *DL)) {
+ if (!CanCoerceMustAliasedValueToLoad(S->getValueOperand(),
+ LI->getType(), DL)) {
UnavailableBlocks.push_back(DepBB);
continue;
}
@@ -1501,7 +1498,7 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,
if (LD->getType() != LI->getType()) {
// If the stored value is larger or equal to the loaded value, we can
// reuse it.
- if (!DL || !CanCoerceMustAliasedValueToLoad(LD, LI->getType(),*DL)) {
+ if (!CanCoerceMustAliasedValueToLoad(LD, LI->getType(), DL)) {
UnavailableBlocks.push_back(DepBB);
continue;
}
@@ -1613,6 +1610,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
// Check if the load can safely be moved to all the unavailable predecessors.
bool CanDoPRE = true;
+ const DataLayout &DL = LI->getModule()->getDataLayout();
SmallVector<Instruction*, 8> NewInsts;
for (auto &PredLoad : PredLoads) {
BasicBlock *UnavailablePred = PredLoad.first;
@@ -1833,10 +1831,11 @@ bool GVN::processLoad(LoadInst *L) {
// ... to a pointer that has been loaded from before...
MemDepResult Dep = MD->getDependency(L);
+ const DataLayout &DL = L->getModule()->getDataLayout();
// If we have a clobber and target data is around, see if this is a clobber
// that we can fix up through code synthesis.
- if (Dep.isClobber() && DL) {
+ if (Dep.isClobber()) {
// Check to see if we have something like this:
// store i32 123, i32* %P
// %A = bitcast i32* %P to i8*
@@ -1849,12 +1848,11 @@ bool GVN::processLoad(LoadInst *L) {
// access code.
Value *AvailVal = nullptr;
if (StoreInst *DepSI = dyn_cast<StoreInst>(Dep.getInst())) {
- int Offset = AnalyzeLoadFromClobberingStore(L->getType(),
- L->getPointerOperand(),
- DepSI, *DL);
+ int Offset = AnalyzeLoadFromClobberingStore(
+ L->getType(), L->getPointerOperand(), DepSI);
if (Offset != -1)
AvailVal = GetStoreValueForLoad(DepSI->getValueOperand(), Offset,
- L->getType(), L, *DL);
+ L->getType(), L, DL);
}
// Check to see if we have something like this:
@@ -1867,9 +1865,8 @@ bool GVN::processLoad(LoadInst *L) {
if (DepLI == L)
return false;
- int Offset = AnalyzeLoadFromClobberingLoad(L->getType(),
- L->getPointerOperand(),
- DepLI, *DL);
+ int Offset = AnalyzeLoadFromClobberingLoad(
+ L->getType(), L->getPointerOperand(), DepLI, DL);
if (Offset != -1)
AvailVal = GetLoadValueForLoad(DepLI, Offset, L->getType(), L, *this);
}
@@ -1877,11 +1874,10 @@ bool GVN::processLoad(LoadInst *L) {
// If the clobbering value is a memset/memcpy/memmove, see if we can forward
// a value on from it.
if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(Dep.getInst())) {
- int Offset = AnalyzeLoadFromClobberingMemInst(L->getType(),
- L->getPointerOperand(),
- DepMI, *DL);
+ int Offset = AnalyzeLoadFromClobberingMemInst(
+ L->getType(), L->getPointerOperand(), DepMI, DL);
if (Offset != -1)
- AvailVal = GetMemInstValueForLoad(DepMI, Offset, L->getType(), L, *DL);
+ AvailVal = GetMemInstValueForLoad(DepMI, Offset, L->getType(), L, DL);
}
if (AvailVal) {
@@ -1932,17 +1928,13 @@ bool GVN::processLoad(LoadInst *L) {
// actually have the same type. See if we know how to reuse the stored
// value (depending on its type).
if (StoredVal->getType() != L->getType()) {
- if (DL) {
- StoredVal = CoerceAvailableValueToLoadType(StoredVal, L->getType(),
- L, *DL);
- if (!StoredVal)
- return false;
-
- DEBUG(dbgs() << "GVN COERCED STORE:\n" << *DepSI << '\n' << *StoredVal
- << '\n' << *L << "\n\n\n");
- }
- else
+ StoredVal =
+ CoerceAvailableValueToLoadType(StoredVal, L->getType(), L, DL);
+ if (!StoredVal)
return false;
+
+ DEBUG(dbgs() << "GVN COERCED STORE:\n" << *DepSI << '\n' << *StoredVal
+ << '\n' << *L << "\n\n\n");
}
// Remove it!
@@ -1961,17 +1953,12 @@ bool GVN::processLoad(LoadInst *L) {
// the same type. See if we know how to reuse the previously loaded value
// (depending on its type).
if (DepLI->getType() != L->getType()) {
- if (DL) {
- AvailableVal = CoerceAvailableValueToLoadType(DepLI, L->getType(),
- L, *DL);
- if (!AvailableVal)
- return false;
-
- DEBUG(dbgs() << "GVN COERCED LOAD:\n" << *DepLI << "\n" << *AvailableVal
- << "\n" << *L << "\n\n\n");
- }
- else
+ AvailableVal = CoerceAvailableValueToLoadType(DepLI, L->getType(), L, DL);
+ if (!AvailableVal)
return false;
+
+ DEBUG(dbgs() << "GVN COERCED LOAD:\n" << *DepLI << "\n" << *AvailableVal
+ << "\n" << *L << "\n\n\n");
}
// Remove it!
@@ -2239,6 +2226,7 @@ bool GVN::processInstruction(Instruction *I) {
// to value numbering it. Value numbering often exposes redundancies, for
// example if it determines that %y is equal to %x then the instruction
// "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify.
+ const DataLayout &DL = I->getModule()->getDataLayout();
if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AC)) {
I->replaceAllUsesWith(V);
if (MD && V->getType()->getScalarType()->isPointerTy())
@@ -2357,8 +2345,6 @@ bool GVN::runOnFunction(Function& F) {
if (!NoLoads)
MD = &getAnalysis<MemoryDependenceAnalysis>();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- DL = DLP ? &DLP->getDataLayout() : nullptr;
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
VN.setAliasAnalysis(&getAnalysis<AliasAnalysis>());
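The clobbering-store analyses above all answer one question: at what byte offset does the load read inside the stored value? A toy version of that computation (assuming the bases already matched via GetPointerBaseWithConstantOffset; the real code works in bits and also handles load widening):

    #include <cstdint>

    // Byte offset of the load inside the stored value, or -1 if the load
    // is not fully covered by the store.
    int analyzeLoadFromWrite(int64_t StoreOffset, uint64_t StoreSize,
                             int64_t LoadOffset, uint64_t LoadSize) {
      if (LoadOffset < StoreOffset ||
          LoadOffset + int64_t(LoadSize) > StoreOffset + int64_t(StoreSize))
        return -1;
      return int(LoadOffset - StoreOffset);
    }
    // e.g. a store of i32 at offset 0 followed by a load of i8 at offset 2
    // forwards byte 2 of the stored value: analyzeLoadFromWrite(0,4,2,1) == 2.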
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index f99ebbc..51e8041 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -31,6 +31,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
@@ -44,7 +45,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SimplifyIndVar.h"
@@ -73,7 +73,6 @@ namespace {
LoopInfo *LI;
ScalarEvolution *SE;
DominatorTree *DT;
- const DataLayout *DL;
TargetLibraryInfo *TLI;
const TargetTransformInfo *TTI;
@@ -82,8 +81,8 @@ namespace {
public:
static char ID; // Pass identification, replacement for typeid
- IndVarSimplify() : LoopPass(ID), LI(nullptr), SE(nullptr), DT(nullptr),
- DL(nullptr), Changed(false) {
+ IndVarSimplify()
+ : LoopPass(ID), LI(nullptr), SE(nullptr), DT(nullptr), Changed(false) {
initializeIndVarSimplifyPass(*PassRegistry::getPassRegistry());
}
@@ -663,14 +662,14 @@ namespace {
/// extended by this sign or zero extend operation. This is used to determine
/// the final width of the IV before actually widening it.
static void visitIVCast(CastInst *Cast, WideIVInfo &WI, ScalarEvolution *SE,
- const DataLayout *DL, const TargetTransformInfo *TTI) {
+ const TargetTransformInfo *TTI) {
bool IsSigned = Cast->getOpcode() == Instruction::SExt;
if (!IsSigned && Cast->getOpcode() != Instruction::ZExt)
return;
Type *Ty = Cast->getType();
uint64_t Width = SE->getTypeSizeInBits(Ty);
- if (DL && !DL->isLegalInteger(Width))
+ if (!Cast->getModule()->getDataLayout().isLegalInteger(Width))
return;
// Cast is either an sext or zext up to this point.
@@ -1201,7 +1200,6 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) {
namespace {
class IndVarSimplifyVisitor : public IVVisitor {
ScalarEvolution *SE;
- const DataLayout *DL;
const TargetTransformInfo *TTI;
PHINode *IVPhi;
@@ -1209,9 +1207,9 @@ namespace {
WideIVInfo WI;
IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV,
- const DataLayout *DL, const TargetTransformInfo *TTI,
+ const TargetTransformInfo *TTI,
const DominatorTree *DTree)
- : SE(SCEV), DL(DL), TTI(TTI), IVPhi(IV) {
+ : SE(SCEV), TTI(TTI), IVPhi(IV) {
DT = DTree;
WI.NarrowIV = IVPhi;
if (ReduceLiveIVs)
@@ -1219,9 +1217,7 @@ namespace {
}
// Implement the interface used by simplifyUsersOfIV.
- void visitCast(CastInst *Cast) override {
- visitIVCast(Cast, WI, SE, DL, TTI);
- }
+ void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, TTI); }
};
}
@@ -1255,7 +1251,7 @@ void IndVarSimplify::SimplifyAndExtend(Loop *L,
PHINode *CurrIV = LoopPhis.pop_back_val();
// Information about sign/zero extensions of CurrIV.
- IndVarSimplifyVisitor Visitor(CurrIV, SE, DL, TTI, DT);
+ IndVarSimplifyVisitor Visitor(CurrIV, SE, TTI, DT);
Changed |= simplifyUsersOfIV(CurrIV, SE, &LPM, DeadInsts, &Visitor);
@@ -1521,9 +1517,8 @@ static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) {
/// FIXME: Accept non-unit stride as long as SCEV can reduce BECount * Stride.
/// This is difficult in general for SCEV because of potential overflow. But we
/// could at least handle constant BECounts.
-static PHINode *
-FindLoopCounter(Loop *L, const SCEV *BECount,
- ScalarEvolution *SE, DominatorTree *DT, const DataLayout *DL) {
+static PHINode *FindLoopCounter(Loop *L, const SCEV *BECount,
+ ScalarEvolution *SE, DominatorTree *DT) {
uint64_t BCWidth = SE->getTypeSizeInBits(BECount->getType());
Value *Cond =
@@ -1552,7 +1547,8 @@ FindLoopCounter(Loop *L, const SCEV *BECount,
// AR may be wider than BECount. With eq/ne tests overflow is immaterial.
// AR may not be a narrower type, or we may never exit.
uint64_t PhiWidth = SE->getTypeSizeInBits(AR->getType());
- if (PhiWidth < BCWidth || (DL && !DL->isLegalInteger(PhiWidth)))
+ if (PhiWidth < BCWidth ||
+ !L->getHeader()->getModule()->getDataLayout().isLegalInteger(PhiWidth))
continue;
const SCEV *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
@@ -1705,51 +1701,15 @@ LinearFunctionTestReplace(Loop *L,
// compare against the post-incremented value, otherwise we must compare
// against the preincremented value.
if (L->getExitingBlock() == L->getLoopLatch()) {
+ // Add one to the "backedge-taken" count to get the trip count.
+ // This addition may overflow, which is valid as long as the comparison is
+ // truncated to BackedgeTakenCount->getType().
+ IVCount = SE->getAddExpr(BackedgeTakenCount,
+ SE->getConstant(BackedgeTakenCount->getType(), 1));
// The BackedgeTaken expression contains the number of times that the
// backedge branches to the loop header. This is one less than the
// number of times the loop executes, so use the incremented indvar.
- llvm::Value *IncrementedIndvar =
- IndVar->getIncomingValueForBlock(L->getExitingBlock());
- const auto *IncrementedIndvarSCEV =
- cast<SCEVAddRecExpr>(SE->getSCEV(IncrementedIndvar));
- // It is unsafe to use the incremented indvar if it has a wrapping flag, we
- // don't want to compare against a poison value. Check the SCEV that
- // corresponds to the incremented indvar, the SCEVExpander will only insert
- // flags in the IR if the SCEV originally had wrapping flags.
- // FIXME: In theory, SCEV could drop flags even though they exist in IR.
- // A more robust solution would involve getting a new expression for
- // CmpIndVar by applying non-NSW/NUW AddExprs.
- auto WrappingFlags =
- ScalarEvolution::setFlags(SCEV::FlagNUW, SCEV::FlagNSW);
- const SCEV *IVInit = IncrementedIndvarSCEV->getStart();
- if (SE->getTypeSizeInBits(IVInit->getType()) >
- SE->getTypeSizeInBits(IVCount->getType()))
- IVInit = SE->getTruncateExpr(IVInit, IVCount->getType());
- unsigned BitWidth = SE->getTypeSizeInBits(IVCount->getType());
- Type *WideTy = IntegerType::get(SE->getContext(), BitWidth + 1);
- // Check if InitIV + BECount+1 requires sign/zero extension.
- // If not, clear the corresponding flag from WrappingFlags because it is not
- // necessary for those flags in the IncrementedIndvarSCEV expression.
- if (SE->getSignExtendExpr(SE->getAddExpr(IVInit, BackedgeTakenCount),
- WideTy) ==
- SE->getAddExpr(SE->getSignExtendExpr(IVInit, WideTy),
- SE->getSignExtendExpr(BackedgeTakenCount, WideTy)))
- WrappingFlags = ScalarEvolution::clearFlags(WrappingFlags, SCEV::FlagNSW);
- if (SE->getZeroExtendExpr(SE->getAddExpr(IVInit, BackedgeTakenCount),
- WideTy) ==
- SE->getAddExpr(SE->getZeroExtendExpr(IVInit, WideTy),
- SE->getZeroExtendExpr(BackedgeTakenCount, WideTy)))
- WrappingFlags = ScalarEvolution::clearFlags(WrappingFlags, SCEV::FlagNUW);
- if (!ScalarEvolution::maskFlags(IncrementedIndvarSCEV->getNoWrapFlags(),
- WrappingFlags)) {
- // Add one to the "backedge-taken" count to get the trip count.
- // This addition may overflow, which is valid as long as the comparison is
- // truncated to BackedgeTakenCount->getType().
- IVCount =
- SE->getAddExpr(BackedgeTakenCount,
- SE->getConstant(BackedgeTakenCount->getType(), 1));
- CmpIndVar = IncrementedIndvar;
- }
+ CmpIndVar = IndVar->getIncomingValueForBlock(L->getExitingBlock());
}
Value *ExitCnt = genLoopLimit(IndVar, IVCount, L, Rewriter, SE);
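A quick numeric check of the comment carried into the simplified block: the +1 may wrap, but because the exit compare runs in the same truncated width and the incremented indvar wraps identically, the branch still fires on the final iteration. With an 8-bit counter and a backedge-taken count of 255:

    #include <cstdint>
    #include <iostream>

    int main() {
      uint8_t BTC = 255;
      uint8_t IVCount = BTC + 1;     // wraps to 0 -- intentionally fine
      uint8_t IV = 0;
      unsigned Trips = 0;
      do {
        ++Trips;
        ++IV;                        // the post-incremented indvar wraps too
      } while (IV != IVCount);       // compare in the truncated width
      std::cout << Trips << "\n";    // prints 256
      return 0;
    }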
@@ -1932,12 +1892,11 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
SE = &getAnalysis<ScalarEvolution>();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- DL = DLP ? &DLP->getDataLayout() : nullptr;
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
TLI = TLIP ? &TLIP->getTLI() : nullptr;
auto *TTIP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
TTI = TTIP ? &TTIP->getTTI(*L->getHeader()->getParent()) : nullptr;
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
DeadInsts.clear();
Changed = false;
@@ -1949,7 +1908,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
// Create a rewriter object which we'll use to transform the code with.
- SCEVExpander Rewriter(*SE, "indvars");
+ SCEVExpander Rewriter(*SE, DL, "indvars");
#ifndef NDEBUG
Rewriter.setDebugType(DEBUG_TYPE);
#endif
@@ -1978,7 +1937,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
// If we have a trip count expression, rewrite the loop's exit condition
// using it. We can currently only handle loops with a single exit.
if (canExpandBackedgeTakenCount(L, SE) && needsLFTR(L, DT)) {
- PHINode *IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT, DL);
+ PHINode *IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT);
if (IndVar) {
// Check preconditions for proper SCEVExpander operation. SCEV does not
// express SCEVExpander's dependencies, such as LoopSimplify. Instead any
diff --git a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
index 8559e63..cbdacad 100644
--- a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
+++ b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -42,7 +42,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/Optional.h"
-
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -51,27 +50,23 @@
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ValueTracking.h"
-
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
-
+#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
-
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SimplifyIndVar.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
-
-#include "llvm/Pass.h"
-
#include <array>
using namespace llvm;
@@ -82,6 +77,9 @@ static cl::opt<unsigned> LoopSizeCutoff("irce-loop-size-cutoff", cl::Hidden,
static cl::opt<bool> PrintChangedLoops("irce-print-changed-loops", cl::Hidden,
cl::init(false));
+static cl::opt<bool> PrintRangeChecks("irce-print-range-checks", cl::Hidden,
+ cl::init(false));
+
static cl::opt<int> MaxExitProbReciprocal("irce-max-exit-prob-reciprocal",
cl::Hidden, cl::init(10));
@@ -96,23 +94,41 @@ namespace {
///
/// and
///
-/// 2. a condition that is provably true for some range of values taken by the
-/// containing loop's induction variable.
-///
-/// Currently all inductive range checks are branches conditional on an
-/// expression of the form
+/// 2. a condition that is provably true for some contiguous range of values
+/// taken by the containing loop's induction variable.
///
-/// 0 <= (Offset + Scale * I) < Length
-///
-/// where `I' is the canonical induction variable of a loop to which Offset and
-/// Scale are loop invariant, and Length is >= 0. Currently the 'false' branch
-/// is considered cold, looking at profiling data to verify that is a TODO.
-
class InductiveRangeCheck {
+ // Classifies a range check
+ enum RangeCheckKind : unsigned {
+ // Range check of the form "0 <= I".
+ RANGE_CHECK_LOWER = 1,
+
+ // Range check of the form "I < L" where L is known positive.
+ RANGE_CHECK_UPPER = 2,
+
+ // The logical and of the RANGE_CHECK_LOWER and RANGE_CHECK_UPPER
+ // conditions.
+ RANGE_CHECK_BOTH = RANGE_CHECK_LOWER | RANGE_CHECK_UPPER,
+
+ // Unrecognized range check condition.
+ RANGE_CHECK_UNKNOWN = (unsigned)-1
+ };
+
+ static const char *rangeCheckKindToStr(RangeCheckKind);
+
const SCEV *Offset;
const SCEV *Scale;
Value *Length;
BranchInst *Branch;
+ RangeCheckKind Kind;
+
+ static RangeCheckKind parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
+ ScalarEvolution &SE, Value *&Index,
+ Value *&Length);
+
+ static InductiveRangeCheck::RangeCheckKind
+ parseRangeCheck(Loop *L, ScalarEvolution &SE, Value *Condition,
+ const SCEV *&Index, Value *&UpperLimit);
InductiveRangeCheck() :
Offset(nullptr), Scale(nullptr), Length(nullptr), Branch(nullptr) { }
@@ -124,13 +140,17 @@ public:
void print(raw_ostream &OS) const {
OS << "InductiveRangeCheck:\n";
+ OS << " Kind: " << rangeCheckKindToStr(Kind) << "\n";
OS << " Offset: ";
Offset->print(OS);
OS << " Scale: ";
Scale->print(OS);
OS << " Length: ";
- Length->print(OS);
- OS << " Branch: ";
+ if (Length)
+ Length->print(OS);
+ else
+ OS << "(null)";
+ OS << "\n Branch: ";
getBranch()->print(OS);
OS << "\n";
}
@@ -207,160 +227,156 @@ char InductiveRangeCheckElimination::ID = 0;
INITIALIZE_PASS(InductiveRangeCheckElimination, "irce",
"Inductive range check elimination", false, false)
-static bool IsLowerBoundCheck(Value *Check, Value *&IndexV) {
- using namespace llvm::PatternMatch;
+const char *InductiveRangeCheck::rangeCheckKindToStr(
+ InductiveRangeCheck::RangeCheckKind RCK) {
+ switch (RCK) {
+ case InductiveRangeCheck::RANGE_CHECK_UNKNOWN:
+ return "RANGE_CHECK_UNKNOWN";
- ICmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
- Value *LHS = nullptr, *RHS = nullptr;
+ case InductiveRangeCheck::RANGE_CHECK_UPPER:
+ return "RANGE_CHECK_UPPER";
- if (!match(Check, m_ICmp(Pred, m_Value(LHS), m_Value(RHS))))
- return false;
+ case InductiveRangeCheck::RANGE_CHECK_LOWER:
+ return "RANGE_CHECK_LOWER";
+
+ case InductiveRangeCheck::RANGE_CHECK_BOTH:
+ return "RANGE_CHECK_BOTH";
+ }
+
+ llvm_unreachable("unknown range check type!");
+}
+
+/// Parse a single ICmp instruction, `ICI`, into a range check. If `ICI`
+/// cannot be interpreted as a range check, return `RANGE_CHECK_UNKNOWN` and
+/// set `Index` and `Length` to `nullptr`. Otherwise set `Index` to the value
+/// being range checked, and set `Length` to the upper limit `Index` is being
+/// range checked with if (and only if) the range check type is stronger than
+/// or equal to RANGE_CHECK_UPPER.
+///
+InductiveRangeCheck::RangeCheckKind
+InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
+ ScalarEvolution &SE, Value *&Index,
+ Value *&Length) {
+
+ auto IsNonNegativeAndNotLoopVarying = [&SE, L](Value *V) {
+ const SCEV *S = SE.getSCEV(V);
+ if (isa<SCEVCouldNotCompute>(S))
+ return false;
+
+ return SE.getLoopDisposition(S, L) == ScalarEvolution::LoopInvariant &&
+ SE.isKnownNonNegative(S);
+ };
+
+ using namespace llvm::PatternMatch;
+
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ Value *LHS = ICI->getOperand(0);
+ Value *RHS = ICI->getOperand(1);
switch (Pred) {
default:
- return false;
+ return RANGE_CHECK_UNKNOWN;
case ICmpInst::ICMP_SLE:
std::swap(LHS, RHS);
// fallthrough
case ICmpInst::ICMP_SGE:
- if (!match(RHS, m_ConstantInt<0>()))
- return false;
- IndexV = LHS;
- return true;
+ if (match(RHS, m_ConstantInt<0>())) {
+ Index = LHS;
+ return RANGE_CHECK_LOWER;
+ }
+ return RANGE_CHECK_UNKNOWN;
case ICmpInst::ICMP_SLT:
std::swap(LHS, RHS);
// fallthrough
case ICmpInst::ICMP_SGT:
- if (!match(RHS, m_ConstantInt<-1>()))
- return false;
- IndexV = LHS;
- return true;
- }
-}
-
-static bool IsUpperBoundCheck(Value *Check, Value *Index, Value *&UpperLimit) {
- using namespace llvm::PatternMatch;
-
- ICmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
- Value *LHS = nullptr, *RHS = nullptr;
-
- if (!match(Check, m_ICmp(Pred, m_Value(LHS), m_Value(RHS))))
- return false;
+ if (match(RHS, m_ConstantInt<-1>())) {
+ Index = LHS;
+ return RANGE_CHECK_LOWER;
+ }
- switch (Pred) {
- default:
- return false;
+ if (IsNonNegativeAndNotLoopVarying(LHS)) {
+ Index = RHS;
+ Length = LHS;
+ return RANGE_CHECK_UPPER;
+ }
+ return RANGE_CHECK_UNKNOWN;
- case ICmpInst::ICMP_SGT:
+ case ICmpInst::ICMP_ULT:
std::swap(LHS, RHS);
// fallthrough
- case ICmpInst::ICMP_SLT:
- if (LHS != Index)
- return false;
- UpperLimit = RHS;
- return true;
-
case ICmpInst::ICMP_UGT:
- std::swap(LHS, RHS);
- // fallthrough
- case ICmpInst::ICMP_ULT:
- if (LHS != Index)
- return false;
- UpperLimit = RHS;
- return true;
+ if (IsNonNegativeAndNotLoopVarying(LHS)) {
+ Index = RHS;
+ Length = LHS;
+ return RANGE_CHECK_BOTH;
+ }
+ return RANGE_CHECK_UNKNOWN;
}
+
+ llvm_unreachable("default clause returns!");
}
-/// Split a condition into something semantically equivalent to (0 <= I <
-/// Limit), both comparisons signed and Len loop invariant on L and positive.
-/// On success, return true and set Index to I and UpperLimit to Limit. Return
-/// false on failure (we may still write to UpperLimit and Index on failure).
-/// It does not try to interpret I as a loop index.
-///
-static bool SplitRangeCheckCondition(Loop *L, ScalarEvolution &SE,
+/// Parses an arbitrary condition into a range check. `Length` is set only if
+/// the range check is recognized to be `RANGE_CHECK_UPPER` or stronger.
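+/// For example, `I >= 0 && I <s Len` (with `Len` non-negative and loop
+/// invariant) parses to RANGE_CHECK_BOTH, with `Index` set to the SCEV of `I`
+/// and `Length` set to `Len`.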
+InductiveRangeCheck::RangeCheckKind
+InductiveRangeCheck::parseRangeCheck(Loop *L, ScalarEvolution &SE,
Value *Condition, const SCEV *&Index,
- Value *&UpperLimit) {
-
- // TODO: currently this catches some silly cases like comparing "%idx slt 1".
- // Our transformations are still correct, but less likely to be profitable in
- // those cases. We have to come up with some heuristics that pick out the
- // range checks that are more profitable to clone a loop for. This function
- // in general can be made more robust.
-
+ Value *&Length) {
using namespace llvm::PatternMatch;
Value *A = nullptr;
Value *B = nullptr;
- ICmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
-
- // In these early checks we assume that the matched UpperLimit is positive.
- // We'll verify that fact later, before returning true.
if (match(Condition, m_And(m_Value(A), m_Value(B)))) {
- Value *IndexV = nullptr;
- Value *ExpectedUpperBoundCheck = nullptr;
+ Value *IndexA = nullptr, *IndexB = nullptr;
+ Value *LengthA = nullptr, *LengthB = nullptr;
+ ICmpInst *ICmpA = dyn_cast<ICmpInst>(A), *ICmpB = dyn_cast<ICmpInst>(B);
- if (IsLowerBoundCheck(A, IndexV))
- ExpectedUpperBoundCheck = B;
- else if (IsLowerBoundCheck(B, IndexV))
- ExpectedUpperBoundCheck = A;
- else
- return false;
+ if (!ICmpA || !ICmpB)
+ return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
- if (!IsUpperBoundCheck(ExpectedUpperBoundCheck, IndexV, UpperLimit))
- return false;
+ auto RCKindA = parseRangeCheckICmp(L, ICmpA, SE, IndexA, LengthA);
+ auto RCKindB = parseRangeCheckICmp(L, ICmpB, SE, IndexB, LengthB);
- Index = SE.getSCEV(IndexV);
+ if (RCKindA == InductiveRangeCheck::RANGE_CHECK_UNKNOWN ||
+ RCKindB == InductiveRangeCheck::RANGE_CHECK_UNKNOWN)
+ return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
- if (isa<SCEVCouldNotCompute>(Index))
- return false;
+ if (IndexA != IndexB)
+ return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
- } else if (match(Condition, m_ICmp(Pred, m_Value(A), m_Value(B)))) {
- switch (Pred) {
- default:
- return false;
+ if (LengthA != nullptr && LengthB != nullptr && LengthA != LengthB)
+ return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
- case ICmpInst::ICMP_SGT:
- std::swap(A, B);
- // fall through
- case ICmpInst::ICMP_SLT:
- UpperLimit = B;
- Index = SE.getSCEV(A);
- if (isa<SCEVCouldNotCompute>(Index) || !SE.isKnownNonNegative(Index))
- return false;
- break;
+ Index = SE.getSCEV(IndexA);
+ if (isa<SCEVCouldNotCompute>(Index))
+ return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
- case ICmpInst::ICMP_UGT:
- std::swap(A, B);
- // fall through
- case ICmpInst::ICMP_ULT:
- UpperLimit = B;
- Index = SE.getSCEV(A);
- if (isa<SCEVCouldNotCompute>(Index))
- return false;
- break;
- }
- } else {
- return false;
+ Length = LengthA == nullptr ? LengthB : LengthA;
+
+ return (InductiveRangeCheck::RangeCheckKind)(RCKindA | RCKindB);
}
- const SCEV *UpperLimitSCEV = SE.getSCEV(UpperLimit);
- if (isa<SCEVCouldNotCompute>(UpperLimitSCEV) ||
- !SE.isKnownNonNegative(UpperLimitSCEV))
- return false;
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(Condition)) {
+ Value *IndexVal = nullptr;
- if (SE.getLoopDisposition(UpperLimitSCEV, L) !=
- ScalarEvolution::LoopInvariant) {
- DEBUG(dbgs() << " in function: " << L->getHeader()->getParent()->getName()
- << " ";
- dbgs() << " UpperLimit is not loop invariant: "
- << UpperLimit->getName() << "\n";);
- return false;
+ auto RCKind = parseRangeCheckICmp(L, ICI, SE, IndexVal, Length);
+
+ if (RCKind == InductiveRangeCheck::RANGE_CHECK_UNKNOWN)
+ return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
+
+ Index = SE.getSCEV(IndexVal);
+ if (isa<SCEVCouldNotCompute>(Index))
+ return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
+
+ return RCKind;
}
- return true;
+ return InductiveRangeCheck::RANGE_CHECK_UNKNOWN;
}
@@ -380,10 +396,15 @@ InductiveRangeCheck::create(InductiveRangeCheck::AllocatorTy &A, BranchInst *BI,
Value *Length = nullptr;
const SCEV *IndexSCEV = nullptr;
- if (!SplitRangeCheckCondition(L, SE, BI->getCondition(), IndexSCEV, Length))
+ auto RCKind = InductiveRangeCheck::parseRangeCheck(L, SE, BI->getCondition(),
+ IndexSCEV, Length);
+
+ if (RCKind == InductiveRangeCheck::RANGE_CHECK_UNKNOWN)
return nullptr;
- assert(IndexSCEV && Length && "contract with SplitRangeCheckCondition!");
+ assert(IndexSCEV && "contract with parseRangeCheck!");
+ assert((!(RCKind & InductiveRangeCheck::RANGE_CHECK_UPPER) || Length) &&
+ "contract with parseRangeCheck!");
const SCEVAddRecExpr *IndexAddRec = dyn_cast<SCEVAddRecExpr>(IndexSCEV);
bool IsAffineIndex =
@@ -397,6 +418,7 @@ InductiveRangeCheck::create(InductiveRangeCheck::AllocatorTy &A, BranchInst *BI,
IRC->Offset = IndexAddRec->getStart();
IRC->Scale = IndexAddRec->getStepRecurrence(SE);
IRC->Branch = BI;
+ IRC->Kind = RCKind;
return IRC;
}
@@ -685,30 +707,40 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
}
}
- auto IsInductionVar = [&SE](const SCEVAddRecExpr *AR, bool &IsIncreasing) {
- if (!AR->isAffine())
- return false;
+ auto HasNoSignedWrap = [&](const SCEVAddRecExpr *AR) {
+ if (AR->getNoWrapFlags(SCEV::FlagNSW))
+ return true;
IntegerType *Ty = cast<IntegerType>(AR->getType());
IntegerType *WideTy =
IntegerType::get(Ty->getContext(), Ty->getBitWidth() * 2);
- // Currently we only work with induction variables that have been proved to
- // not wrap. This restriction can potentially be lifted in the future.
-
const SCEVAddRecExpr *ExtendAfterOp =
dyn_cast<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
- if (!ExtendAfterOp)
- return false;
+ if (ExtendAfterOp) {
+ const SCEV *ExtendedStart = SE.getSignExtendExpr(AR->getStart(), WideTy);
+ const SCEV *ExtendedStep =
+ SE.getSignExtendExpr(AR->getStepRecurrence(SE), WideTy);
- const SCEV *ExtendedStart = SE.getSignExtendExpr(AR->getStart(), WideTy);
- const SCEV *ExtendedStep =
- SE.getSignExtendExpr(AR->getStepRecurrence(SE), WideTy);
+ bool NoSignedWrap = ExtendAfterOp->getStart() == ExtendedStart &&
+ ExtendAfterOp->getStepRecurrence(SE) == ExtendedStep;
+
+ if (NoSignedWrap)
+ return true;
+ }
+
+ // We may have proved this when computing the sign extension above.
+ return AR->getNoWrapFlags(SCEV::FlagNSW) != SCEV::FlagAnyWrap;
+ };
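+
+ // For example, if ScalarEvolution folds sext({0,+,1}<i32>) into an add
+ // recurrence whose start and step equal the extended start and step, the
+ // lambda above concludes the recurrence cannot sign-wrap.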
+
+ auto IsInductionVar = [&](const SCEVAddRecExpr *AR, bool &IsIncreasing) {
+ if (!AR->isAffine())
+ return false;
- bool NoSignedWrap = ExtendAfterOp->getStart() == ExtendedStart &&
- ExtendAfterOp->getStepRecurrence(SE) == ExtendedStep;
+ // Currently we only work with induction variables that have been proved to
+ // not wrap. This restriction can potentially be lifted in the future.
- if (!NoSignedWrap)
+ if (!HasNoSignedWrap(AR))
return false;
if (const SCEVConstant *StepExpr =
@@ -791,9 +823,10 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
"loop variant exit count doesn't make sense!");
assert(!L.contains(LatchExit) && "expected an exit block!");
-
- Value *IndVarStartV = SCEVExpander(SE, "irce").expandCodeFor(
- IndVarStart, IndVarTy, &*Preheader->rbegin());
+ const DataLayout &DL = Preheader->getModule()->getDataLayout();
+ Value *IndVarStartV =
+ SCEVExpander(SE, DL, "irce")
+ .expandCodeFor(IndVarStart, IndVarTy, &*Preheader->rbegin());
IndVarStartV->setName("indvar.start");
LoopStructure Result;
@@ -831,12 +864,35 @@ LoopConstrainer::calculateSubRanges() const {
const SCEV *End = SE.getSCEV(MainLoopStructure.LoopExitAt);
bool Increasing = MainLoopStructure.IndVarIncreasing;
+
// We compute `Smallest` and `Greatest` such that [Smallest, Greatest) is the
// range of values the induction variable takes.
- const SCEV *Smallest =
- Increasing ? Start : SE.getAddExpr(End, SE.getSCEV(One));
- const SCEV *Greatest =
- Increasing ? End : SE.getAddExpr(Start, SE.getSCEV(One));
+
+ const SCEV *Smallest = nullptr, *Greatest = nullptr;
+
+ if (Increasing) {
+ Smallest = Start;
+ Greatest = End;
+ } else {
+ // These two computations may sign-overflow. Here is why that is okay:
+ //
+ // We know that the induction variable does not sign-overflow on any
+ // iteration except the last one, and it starts at `Start` and ends at
+ // `End`, decrementing by one every time.
+ //
+ // * if `Smallest` sign-overflows we know `End` is `INT_SMAX`. Since the
+ //   induction variable is decreasing, we know that the smallest value
+ // the loop body is actually executed with is `INT_SMIN` == `Smallest`.
+ //
+ // * if `Greatest` sign-overflows, we know it can only be `INT_SMIN`. In
+ // that case, `Clamp` will always return `Smallest` and
+ // [`Result.LowLimit`, `Result.HighLimit`) = [`Smallest`, `Smallest`)
+ // will be an empty range. Returning an empty range is always safe.
+ //
+
+ Smallest = SE.getAddExpr(End, SE.getSCEV(One));
+ Greatest = SE.getAddExpr(Start, SE.getSCEV(One));
+ }
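+
+ // For example, a loop counting down from Start = 9 and exiting once the
+ // induction variable reaches End = 0 runs its body with the values 9, 8,
+ // ..., 1, i.e. with [Smallest, Greatest) = [End + 1, Start + 1) = [1, 10).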
auto Clamp = [this, Smallest, Greatest](const SCEV *S) {
return SE.getSMaxExpr(Smallest, SE.getSMinExpr(Greatest, S));
@@ -1132,7 +1188,7 @@ bool LoopConstrainer::run() {
IntegerType *IVTy =
cast<IntegerType>(MainLoopStructure.IndVarNext->getType());
- SCEVExpander Expander(SE, "irce");
+ SCEVExpander Expander(SE, F.getParent()->getDataLayout(), "irce");
Instruction *InsertPt = OriginalPreheader->getTerminator();
// It would have been better to make `PreLoop' and `PostLoop'
@@ -1293,8 +1349,19 @@ InductiveRangeCheck::computeSafeIterationSpace(ScalarEvolution &SE,
const SCEV *M = SE.getMinusSCEV(C, A);
const SCEV *Begin = SE.getNegativeSCEV(M);
- const SCEV *End = SE.getMinusSCEV(SE.getSCEV(getLength()), M);
+ const SCEV *UpperLimit = nullptr;
+
+ // We strengthen "0 <= I" to "0 <= I < INT_SMAX" and "I < L" to "0 <= I < L".
+ // We can potentially do much better here.
+ if (Value *V = getLength()) {
+ UpperLimit = SE.getSCEV(V);
+ } else {
+ assert(Kind == InductiveRangeCheck::RANGE_CHECK_LOWER && "invariant!");
+ unsigned BitWidth = cast<IntegerType>(IndVar->getType())->getBitWidth();
+ UpperLimit = SE.getConstant(APInt::getSignedMaxValue(BitWidth));
+ }
+ const SCEV *End = SE.getMinusSCEV(UpperLimit, M);
return InductiveRangeCheck::Range(Begin, End);
}
@@ -1344,12 +1411,18 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) {
if (RangeChecks.empty())
return false;
- DEBUG(dbgs() << "irce: looking at loop "; L->print(dbgs());
- dbgs() << "irce: loop has " << RangeChecks.size()
- << " inductive range checks: \n";
- for (InductiveRangeCheck *IRC : RangeChecks)
- IRC->print(dbgs());
- );
+ auto PrintRecognizedRangeChecks = [&](raw_ostream &OS) {
+ OS << "irce: looking at loop "; L->print(OS);
+ OS << "irce: loop has " << RangeChecks.size()
+ << " inductive range checks: \n";
+ for (InductiveRangeCheck *IRC : RangeChecks)
+ IRC->print(OS);
+ };
+
+ DEBUG(PrintRecognizedRangeChecks(dbgs()));
+
+ if (PrintRangeChecks)
+ PrintRecognizedRangeChecks(errs());
const char *FailureReason = nullptr;
Optional<LoopStructure> MaybeLoopStructure =
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index 8b54abd..83ac915 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -23,6 +23,7 @@
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LazyValueInfo.h"
#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
@@ -32,7 +33,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -78,7 +78,6 @@ namespace {
/// revectored to the false side of the second if.
///
class JumpThreading : public FunctionPass {
- const DataLayout *DL;
TargetLibraryInfo *TLI;
LazyValueInfo *LVI;
#ifdef NDEBUG
@@ -159,8 +158,6 @@ bool JumpThreading::runOnFunction(Function &F) {
return false;
DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- DL = DLP ? &DLP->getDataLayout() : nullptr;
TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
LVI = &getAnalysis<LazyValueInfo>();
@@ -505,6 +502,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result,
assert(Preference == WantInteger && "Compares only produce integers");
PHINode *PN = dyn_cast<PHINode>(Cmp->getOperand(0));
if (PN && PN->getParent() == BB) {
+ const DataLayout &DL = PN->getModule()->getDataLayout();
// We can do this simplification if any comparisons fold to true or false.
// See if any do.
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
@@ -709,7 +707,8 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
// Run constant folding to see if we can reduce the condition to a simple
// constant.
if (Instruction *I = dyn_cast<Instruction>(Condition)) {
- Value *SimpleVal = ConstantFoldInstruction(I, DL, TLI);
+ Value *SimpleVal =
+ ConstantFoldInstruction(I, BB->getModule()->getDataLayout(), TLI);
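+    // The DataLayout is reached through the Module enclosing the block, so no
+    // separate DataLayoutPass query is needed.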
if (SimpleVal) {
I->replaceAllUsesWith(SimpleVal);
I->eraseFromParent();
@@ -1521,7 +1520,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB,
// At this point, the IR is fully up to date and consistent. Do a quick scan
// over the new instructions and zap any that are constants or dead. This
// frequently happens because of phi translation.
- SimplifyInstructionsInBlock(NewBB, DL, TLI);
+ SimplifyInstructionsInBlock(NewBB, TLI);
// Threaded an edge!
++NumThreads;
@@ -1586,7 +1585,6 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
BasicBlock::iterator BI = BB->begin();
for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
-
// Clone the non-phi instructions of BB into PredBB, keeping track of the
// mapping and using it to remap operands in the cloned instructions.
for (; BI != BB->end(); ++BI) {
@@ -1603,7 +1601,8 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
// If this instruction can be simplified after the operands are updated,
// just use the simplified value instead. This frequently happens due to
// phi translation.
- if (Value *IV = SimplifyInstruction(New, DL)) {
+ if (Value *IV =
+ SimplifyInstruction(New, BB->getModule()->getDataLayout())) {
delete New;
ValueMapping[BI] = IV;
} else {
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 14af38b..1333b02 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -38,6 +38,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
@@ -52,7 +53,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -76,21 +76,21 @@ static bool isNotUsedInLoop(Instruction &I, Loop *CurLoop);
static bool hoist(Instruction &I, BasicBlock *Preheader);
static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
Loop *CurLoop, AliasSetTracker *CurAST );
-static bool isGuaranteedToExecute(Instruction &Inst, DominatorTree *DT,
- Loop *CurLoop, LICMSafetyInfo * SafetyInfo);
-static bool isSafeToExecuteUnconditionally(Instruction &Inst,DominatorTree *DT,
- const DataLayout *DL, Loop *CurLoop,
- LICMSafetyInfo * SafetyInfo);
+static bool isGuaranteedToExecute(Instruction &Inst, DominatorTree *DT,
+ Loop *CurLoop, LICMSafetyInfo *SafetyInfo);
+static bool isSafeToExecuteUnconditionally(Instruction &Inst, DominatorTree *DT,
+ Loop *CurLoop,
+ LICMSafetyInfo *SafetyInfo);
static bool pointerInvalidatedByLoop(Value *V, uint64_t Size,
const AAMDNodes &AAInfo,
AliasSetTracker *CurAST);
static Instruction *CloneInstructionInExitBlock(Instruction &I,
BasicBlock &ExitBlock,
PHINode &PN, LoopInfo *LI);
-static bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA,
- DominatorTree *DT, const DataLayout *DL,
- Loop *CurLoop, AliasSetTracker *CurAST,
- LICMSafetyInfo * SafetyInfo);
+static bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA,
+ DominatorTree *DT, Loop *CurLoop,
+ AliasSetTracker *CurAST,
+ LICMSafetyInfo *SafetyInfo);
namespace {
struct LICM : public LoopPass {
@@ -130,7 +130,6 @@ namespace {
LoopInfo *LI; // Current LoopInfo
DominatorTree *DT; // Dominator Tree for the current Loop.
- const DataLayout *DL; // DataLayout for constant folding.
TargetLibraryInfo *TLI; // TargetLibraryInfo for constant folding.
// State that is updated as we process loops.
@@ -181,8 +180,6 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
AA = &getAnalysis<AliasAnalysis>();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- DL = DLP ? &DLP->getDataLayout() : nullptr;
TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form.");
@@ -235,10 +232,10 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
// instructions, we perform another pass to hoist them out of the loop.
//
if (L->hasDedicatedExits())
- Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, DL, TLI,
- CurLoop, CurAST, &SafetyInfo);
+ Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, CurLoop,
+ CurAST, &SafetyInfo);
if (Preheader)
- Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, DL, TLI,
+ Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI,
CurLoop, CurAST, &SafetyInfo);
// Now that all loop invariants have been removed from the loop, promote any
@@ -291,10 +288,9 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
/// first order w.r.t the DominatorTree. This allows us to visit uses before
/// definitions, allowing us to sink a loop body in one pass without iteration.
///
-bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
- DominatorTree *DT, const DataLayout *DL,
- TargetLibraryInfo *TLI, Loop *CurLoop,
- AliasSetTracker *CurAST, LICMSafetyInfo * SafetyInfo) {
+bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
+ DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop,
+ AliasSetTracker *CurAST, LICMSafetyInfo *SafetyInfo) {
// Verify inputs.
assert(N != nullptr && AA != nullptr && LI != nullptr &&
@@ -311,8 +307,8 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
// We are processing blocks in reverse dfo, so process children first.
const std::vector<DomTreeNode*> &Children = N->getChildren();
for (unsigned i = 0, e = Children.size(); i != e; ++i)
- Changed |= sinkRegion(Children[i], AA, LI, DT, DL, TLI, CurLoop,
- CurAST, SafetyInfo);
+ Changed |=
+ sinkRegion(Children[i], AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo);
// Only need to process the contents of this block if it is not part of a
// subloop (which would already have been processed).
if (inSubLoop(BB,CurLoop,LI)) return Changed;
@@ -336,8 +332,8 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
// outside of the loop. In this case, it doesn't even matter if the
// operands of the instruction are loop invariant.
//
- if (isNotUsedInLoop(I, CurLoop) &&
- canSinkOrHoistInst(I, AA, DT, DL, CurLoop, CurAST, SafetyInfo)) {
+ if (isNotUsedInLoop(I, CurLoop) &&
+ canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo)) {
++II;
Changed |= sink(I, LI, DT, CurLoop, CurAST);
}
@@ -350,10 +346,9 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
/// order w.r.t the DominatorTree. This allows us to visit definitions before
/// uses, allowing us to hoist a loop body in one pass without iteration.
///
-bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
- DominatorTree *DT, const DataLayout *DL,
- TargetLibraryInfo *TLI, Loop *CurLoop,
- AliasSetTracker *CurAST, LICMSafetyInfo *SafetyInfo) {
+bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
+ DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop,
+ AliasSetTracker *CurAST, LICMSafetyInfo *SafetyInfo) {
// Verify inputs.
assert(N != nullptr && AA != nullptr && LI != nullptr &&
DT != nullptr && CurLoop != nullptr && CurAST != nullptr &&
@@ -372,7 +367,8 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
// Try constant folding this instruction. If all the operands are
// constants, it is technically hoistable, but it would be better to just
// fold it.
- if (Constant *C = ConstantFoldInstruction(&I, DL, TLI)) {
+ if (Constant *C = ConstantFoldInstruction(
+ &I, I.getModule()->getDataLayout(), TLI)) {
DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C << '\n');
CurAST->copyValue(&I, C);
CurAST->deleteValue(&I);
@@ -385,16 +381,16 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
// if all of the operands of the instruction are loop invariant and if it
// is safe to hoist the instruction.
//
- if (CurLoop->hasLoopInvariantOperands(&I) &&
- canSinkOrHoistInst(I, AA, DT, DL, CurLoop, CurAST, SafetyInfo) &&
- isSafeToExecuteUnconditionally(I, DT, DL, CurLoop, SafetyInfo))
+ if (CurLoop->hasLoopInvariantOperands(&I) &&
+ canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo) &&
+ isSafeToExecuteUnconditionally(I, DT, CurLoop, SafetyInfo))
Changed |= hoist(I, CurLoop->getLoopPreheader());
}
const std::vector<DomTreeNode*> &Children = N->getChildren();
for (unsigned i = 0, e = Children.size(); i != e; ++i)
- Changed |= hoistRegion(Children[i], AA, LI, DT, DL, TLI, CurLoop,
- CurAST, SafetyInfo);
+ Changed |=
+ hoistRegion(Children[i], AA, LI, DT, TLI, CurLoop, CurAST, SafetyInfo);
return Changed;
}
@@ -424,10 +420,9 @@ void llvm::computeLICMSafetyInfo(LICMSafetyInfo * SafetyInfo, Loop * CurLoop) {
/// canSinkOrHoistInst - Return true if the hoister and sinker can handle this
/// instruction.
///
-bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA,
- DominatorTree *DT, const DataLayout *DL,
- Loop *CurLoop, AliasSetTracker *CurAST,
- LICMSafetyInfo * SafetyInfo) {
+bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, DominatorTree *DT,
+ Loop *CurLoop, AliasSetTracker *CurAST,
+ LICMSafetyInfo *SafetyInfo) {
// Loads have extra constraints we have to verify before we can hoist them.
if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
if (!LI->isUnordered())
@@ -487,7 +482,7 @@ bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA,
!isa<InsertValueInst>(I))
return false;
- return isSafeToExecuteUnconditionally(I, DT, DL, CurLoop, SafetyInfo);
+ return isSafeToExecuteUnconditionally(I, DT, CurLoop, SafetyInfo);
}
/// Returns true if a PHINode is a trivially replaceable with an
@@ -643,10 +638,10 @@ static bool hoist(Instruction &I, BasicBlock *Preheader) {
/// or if it is a trapping instruction and is guaranteed to execute.
///
static bool isSafeToExecuteUnconditionally(Instruction &Inst, DominatorTree *DT,
- const DataLayout *DL, Loop *CurLoop,
- LICMSafetyInfo * SafetyInfo) {
+ Loop *CurLoop,
+ LICMSafetyInfo *SafetyInfo) {
// If it is not a trapping instruction, it is always safe to hoist.
- if (isSafeToSpeculativelyExecute(&Inst, DL))
+ if (isSafeToSpeculativelyExecute(&Inst))
return true;
return isGuaranteedToExecute(Inst, DT, CurLoop, SafetyInfo);
diff --git a/lib/Transforms/Scalar/LoadCombine.cpp b/lib/Transforms/Scalar/LoadCombine.cpp
index 11e4d76..1f33f72 100644
--- a/lib/Transforms/Scalar/LoadCombine.cpp
+++ b/lib/Transforms/Scalar/LoadCombine.cpp
@@ -12,17 +12,17 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar.h"
-
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Analysis/TargetFolder.h"
-#include "llvm/Pass.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
@@ -52,13 +52,10 @@ struct LoadPOPPair {
class LoadCombine : public BasicBlockPass {
LLVMContext *C;
- const DataLayout *DL;
AliasAnalysis *AA;
public:
- LoadCombine()
- : BasicBlockPass(ID),
- C(nullptr), DL(nullptr), AA(nullptr) {
+ LoadCombine() : BasicBlockPass(ID), C(nullptr), AA(nullptr) {
initializeSROAPass(*PassRegistry::getPassRegistry());
}
@@ -85,12 +82,6 @@ private:
bool LoadCombine::doInitialization(Function &F) {
DEBUG(dbgs() << "LoadCombine function: " << F.getName() << "\n");
C = &F.getContext();
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- if (!DLP) {
- DEBUG(dbgs() << " Skipping LoadCombine -- no target data!\n");
- return false;
- }
- DL = &DLP->getDataLayout();
return true;
}
@@ -100,9 +91,10 @@ PointerOffsetPair LoadCombine::getPointerOffsetPair(LoadInst &LI) {
POP.Offset = 0;
while (isa<BitCastInst>(POP.Pointer) || isa<GetElementPtrInst>(POP.Pointer)) {
if (auto *GEP = dyn_cast<GetElementPtrInst>(POP.Pointer)) {
- unsigned BitWidth = DL->getPointerTypeSizeInBits(GEP->getType());
+ auto &DL = LI.getModule()->getDataLayout();
+ unsigned BitWidth = DL.getPointerTypeSizeInBits(GEP->getType());
APInt Offset(BitWidth, 0);
- if (GEP->accumulateConstantOffset(*DL, Offset))
+ if (GEP->accumulateConstantOffset(DL, Offset))
POP.Offset += Offset.getZExtValue();
else
// Can't handle GEPs with variable indices.
@@ -145,7 +137,8 @@ bool LoadCombine::aggregateLoads(SmallVectorImpl<LoadPOPPair> &Loads) {
if (PrevOffset == -1ull) {
BaseLoad = L.Load;
PrevOffset = L.POP.Offset;
- PrevSize = DL->getTypeStoreSize(L.Load->getType());
+ PrevSize = L.Load->getModule()->getDataLayout().getTypeStoreSize(
+ L.Load->getType());
AggregateLoads.push_back(L);
continue;
}
@@ -164,7 +157,8 @@ bool LoadCombine::aggregateLoads(SmallVectorImpl<LoadPOPPair> &Loads) {
// FIXME: We may want to handle this case.
continue;
PrevOffset = L.POP.Offset;
- PrevSize = DL->getTypeStoreSize(L.Load->getType());
+ PrevSize = L.Load->getModule()->getDataLayout().getTypeStoreSize(
+ L.Load->getType());
AggregateLoads.push_back(L);
}
if (combineLoads(AggregateLoads))
@@ -215,7 +209,8 @@ bool LoadCombine::combineLoads(SmallVectorImpl<LoadPOPPair> &Loads) {
for (const auto &L : Loads) {
Builder->SetInsertPoint(L.Load);
Value *V = Builder->CreateExtractInteger(
- *DL, NewLoad, cast<IntegerType>(L.Load->getType()),
+ L.Load->getModule()->getDataLayout(), NewLoad,
+ cast<IntegerType>(L.Load->getType()),
L.POP.Offset - Loads[0].POP.Offset, "combine.extract");
L.Load->replaceAllUsesWith(V);
}
@@ -225,13 +220,13 @@ bool LoadCombine::combineLoads(SmallVectorImpl<LoadPOPPair> &Loads) {
}
bool LoadCombine::runOnBasicBlock(BasicBlock &BB) {
- if (skipOptnoneFunction(BB) || !DL)
+ if (skipOptnoneFunction(BB))
return false;
AA = &getAnalysis<AliasAnalysis>();
- IRBuilder<true, TargetFolder>
- TheBuilder(BB.getContext(), TargetFolder(DL));
+ IRBuilder<true, TargetFolder> TheBuilder(
+ BB.getContext(), TargetFolder(BB.getModule()->getDataLayout()));
Builder = &TheBuilder;
DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> LoadMap;
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 243c624..7bc2917 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -47,6 +47,7 @@
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
@@ -56,7 +57,6 @@
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -130,7 +130,6 @@ namespace {
class LoopIdiomRecognize : public LoopPass {
Loop *CurLoop;
- const DataLayout *DL;
DominatorTree *DT;
ScalarEvolution *SE;
TargetLibraryInfo *TLI;
@@ -139,7 +138,10 @@ namespace {
static char ID;
explicit LoopIdiomRecognize() : LoopPass(ID) {
initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry());
- DL = nullptr; DT = nullptr; SE = nullptr; TLI = nullptr; TTI = nullptr;
+ DT = nullptr;
+ SE = nullptr;
+ TLI = nullptr;
+ TTI = nullptr;
}
bool runOnLoop(Loop *L, LPPassManager &LPM) override;
@@ -179,14 +181,6 @@ namespace {
AU.addRequired<TargetTransformInfoWrapperPass>();
}
- const DataLayout *getDataLayout() {
- if (DL)
- return DL;
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- DL = DLP ? &DLP->getDataLayout() : nullptr;
- return DL;
- }
-
DominatorTree *getDominatorTree() {
return DT ? DT
: (DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree());
@@ -625,10 +619,6 @@ bool LoopIdiomRecognize::runOnCountableLoop() {
if (BECst->getValue()->getValue() == 0)
return false;
- // We require target data for now.
- if (!getDataLayout())
- return false;
-
// set DT
(void)getDominatorTree();
@@ -742,7 +732,8 @@ bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) {
Value *StorePtr = SI->getPointerOperand();
// Reject stores that are so large that they overflow an unsigned.
- uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
+ auto &DL = CurLoop->getHeader()->getModule()->getDataLayout();
+ uint64_t SizeInBits = DL.getTypeSizeInBits(StoredVal->getType());
if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
return false;
@@ -917,7 +908,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
// but it can be turned into memset_pattern if the target supports it.
Value *SplatValue = isBytewiseValue(StoredVal);
Constant *PatternValue = nullptr;
-
+ auto &DL = CurLoop->getHeader()->getModule()->getDataLayout();
unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
// If we're allowed to form a memset, and the stored value would be acceptable
@@ -928,9 +919,8 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
CurLoop->isLoopInvariant(SplatValue)) {
// Keep and use SplatValue.
PatternValue = nullptr;
- } else if (DestAS == 0 &&
- TLI->has(LibFunc::memset_pattern16) &&
- (PatternValue = getMemSetPatternValue(StoredVal, *DL))) {
+ } else if (DestAS == 0 && TLI->has(LibFunc::memset_pattern16) &&
+ (PatternValue = getMemSetPatternValue(StoredVal, DL))) {
// Don't create memset_pattern16s with address spaces.
// It looks like we can use PatternValue!
SplatValue = nullptr;
@@ -945,7 +935,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
// header. This allows us to insert code for it in the preheader.
BasicBlock *Preheader = CurLoop->getLoopPreheader();
IRBuilder<> Builder(Preheader->getTerminator());
- SCEVExpander Expander(*SE, "loop-idiom");
+ SCEVExpander Expander(*SE, DL, "loop-idiom");
Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
@@ -1005,7 +995,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
// Otherwise we should form a memset_pattern16. PatternValue is known to be
// an constant array of 16-bytes. Plop the value into a mergable global.
GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
- GlobalValue::InternalLinkage,
+ GlobalValue::PrivateLinkage,
PatternValue, ".memset_pattern");
GV->setUnnamedAddr(true); // Ok to merge these.
GV->setAlignment(16);
@@ -1042,7 +1032,8 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
// header. This allows us to insert code for it in the preheader.
BasicBlock *Preheader = CurLoop->getLoopPreheader();
IRBuilder<> Builder(Preheader->getTerminator());
- SCEVExpander Expander(*SE, "loop-idiom");
+ const DataLayout &DL = Preheader->getModule()->getDataLayout();
+ SCEVExpander Expander(*SE, DL, "loop-idiom");
// Okay, we have a strided store "p[i]" of a loaded value. We can turn
// this into a memcpy in the loop preheader now if we want. However, this
diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp
index 6dc600e..e125026 100644
--- a/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -77,8 +77,6 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
getAnalysisIfAvailable<DominatorTreeWrapperPass>();
DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
const TargetLibraryInfo *TLI =
&getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
@@ -110,6 +108,7 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
WorklistItem Item = VisitStack.pop_back_val();
BasicBlock *BB = Item.getPointer();
bool IsSubloopHeader = Item.getInt();
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
// Simplify instructions in the current basic block.
for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
diff --git a/lib/Transforms/Scalar/LoopInterchange.cpp b/lib/Transforms/Scalar/LoopInterchange.cpp
new file mode 100644
index 0000000..f7626c5
--- /dev/null
+++ b/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -0,0 +1,1154 @@
+//===- LoopInterchange.cpp - Loop interchange pass------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass handles the loop interchange transform, interchanging loops to
+// provide more cache-friendly memory access patterns.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-interchange"
+
+namespace {
+
+typedef SmallVector<Loop *, 8> LoopVector;
+
+// TODO: Check if we can use a sparse matrix here.
+typedef std::vector<std::vector<char>> CharMatrix;
+
+// Maximum number of dependencies that can be handled in the dependency matrix.
+static const unsigned MaxMemInstrCount = 100;
+
+// Maximum loop depth supported.
+static const unsigned MaxLoopNestDepth = 10;
+
+struct LoopInterchange;
+
+#ifdef DUMP_DEP_MATRICIES
+void printDepMatrix(CharMatrix &DepMatrix) {
+ for (auto I = DepMatrix.begin(), E = DepMatrix.end(); I != E; ++I) {
+ std::vector<char> Vec = *I;
+ for (auto II = Vec.begin(), EE = Vec.end(); II != EE; ++II)
+ DEBUG(dbgs() << *II << " ");
+ DEBUG(dbgs() << "\n");
+ }
+}
+#endif
+
+bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, Loop *L,
+ DependenceAnalysis *DA) {
+ typedef SmallVector<Value *, 16> ValueVector;
+ ValueVector MemInstr;
+
+ if (Level > MaxLoopNestDepth) {
+ DEBUG(dbgs() << "Cannot handle loops of depth greater than "
+ << MaxLoopNestDepth << "\n");
+ return false;
+ }
+
+ // For each block.
+ for (Loop::block_iterator BB = L->block_begin(), BE = L->block_end();
+ BB != BE; ++BB) {
+ // Scan the BB and collect legal loads and stores.
+ for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E;
+ ++I) {
+ Instruction *Ins = dyn_cast<Instruction>(I);
+ if (!Ins)
+ return false;
+ LoadInst *Ld = dyn_cast<LoadInst>(I);
+ StoreInst *St = dyn_cast<StoreInst>(I);
+ if (!St && !Ld)
+ continue;
+ if (Ld && !Ld->isSimple())
+ return false;
+ if (St && !St->isSimple())
+ return false;
+ MemInstr.push_back(I);
+ }
+ }
+
+ DEBUG(dbgs() << "Found " << MemInstr.size()
+ << " Loads and Stores to analyze\n");
+
+ ValueVector::iterator I, IE, J, JE;
+
+ for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
+ for (J = I, JE = MemInstr.end(); J != JE; ++J) {
+ std::vector<char> Dep;
+ Instruction *Src = dyn_cast<Instruction>(*I);
+ Instruction *Des = dyn_cast<Instruction>(*J);
+ if (Src == Des)
+ continue;
+ if (isa<LoadInst>(Src) && isa<LoadInst>(Des))
+ continue;
+ if (auto D = DA->depends(Src, Des, true)) {
+ DEBUG(dbgs() << "Found Dependency between Src=" << Src << " Des=" << Des
+ << "\n");
+ if (D->isFlow()) {
+          // TODO: Handle flow dependence. Check if it is sufficient to
+          // populate the dependence matrix with the direction reversed.
+ DEBUG(dbgs() << "Flow dependence not handled");
+ return false;
+ }
+ if (D->isAnti()) {
+ DEBUG(dbgs() << "Found Anti dependence \n");
+ unsigned Levels = D->getLevels();
+ char Direction;
+ for (unsigned II = 1; II <= Levels; ++II) {
+ const SCEV *Distance = D->getDistance(II);
+ const SCEVConstant *SCEVConst =
+ dyn_cast_or_null<SCEVConstant>(Distance);
+ if (SCEVConst) {
+ const ConstantInt *CI = SCEVConst->getValue();
+ if (CI->isNegative())
+ Direction = '<';
+ else if (CI->isZero())
+ Direction = '=';
+ else
+ Direction = '>';
+ Dep.push_back(Direction);
+ } else if (D->isScalar(II)) {
+ Direction = 'S';
+ Dep.push_back(Direction);
+ } else {
+ unsigned Dir = D->getDirection(II);
+ if (Dir == Dependence::DVEntry::LT ||
+ Dir == Dependence::DVEntry::LE)
+ Direction = '<';
+ else if (Dir == Dependence::DVEntry::GT ||
+ Dir == Dependence::DVEntry::GE)
+ Direction = '>';
+ else if (Dir == Dependence::DVEntry::EQ)
+ Direction = '=';
+ else
+ Direction = '*';
+ Dep.push_back(Direction);
+ }
+ }
+ while (Dep.size() != Level) {
+ Dep.push_back('I');
+ }
+
+ DepMatrix.push_back(Dep);
+ if (DepMatrix.size() > MaxMemInstrCount) {
+ DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
+ << " dependencies inside loop\n");
+ return false;
+ }
+ }
+ }
+ }
+ }
+
+  // If we don't have a DepMatrix to check legality against, return false.
+ if (DepMatrix.size() == 0)
+ return false;
+ return true;
+}
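+
+// To illustrate the encoding: each row of DepMatrix describes one dependence
+// and each column one loop level, outermost first. For instance, a row "<="
+// is a dependence carried by the outer loop ('<') that is loop-independent at
+// the inner level ('='); 'S' marks a scalar level, and 'I' pads levels the
+// dependence does not constrain.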
+
+// A loop is moved from index 'from' to an index 'to'. Update the Dependence
+// matrix by exchanging the two columns.
+void interChangeDepedencies(CharMatrix &DepMatrix, unsigned FromIndx,
+ unsigned ToIndx) {
+ unsigned numRows = DepMatrix.size();
+ for (unsigned i = 0; i < numRows; ++i) {
+ char TmpVal = DepMatrix[i][ToIndx];
+ DepMatrix[i][ToIndx] = DepMatrix[i][FromIndx];
+ DepMatrix[i][FromIndx] = TmpVal;
+ }
+}
+
+// Checks if the outermost non-'=', non-'S', non-'I' dependence in the
+// dependence matrix is '>'.
+bool isOuterMostDepPositive(CharMatrix &DepMatrix, unsigned Row,
+ unsigned Column) {
+ for (unsigned i = 0; i <= Column; ++i) {
+ if (DepMatrix[Row][i] == '<')
+ return false;
+ if (DepMatrix[Row][i] == '>')
+ return true;
+ }
+  // All dependencies were '=', 'S' or 'I'.
+ return false;
+}
+
+// Checks if no dependence exists in Row of the dependency matrix before Column.
+bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row,
+ unsigned Column) {
+ for (unsigned i = 0; i < Column; ++i) {
+    if (DepMatrix[Row][i] != '=' && DepMatrix[Row][i] != 'S' &&
+        DepMatrix[Row][i] != 'I')
+ return false;
+ }
+ return true;
+}
+
+bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row,
+ unsigned OuterLoopId, char InnerDep, char OuterDep) {
+
+ if (isOuterMostDepPositive(DepMatrix, Row, OuterLoopId))
+ return false;
+
+ if (InnerDep == OuterDep)
+ return true;
+
+ // It is legal to interchange if and only if after interchange no row has a
+ // '>' direction as the leftmost non-'='.
+
+ if (InnerDep == '=' || InnerDep == 'S' || InnerDep == 'I')
+ return true;
+
+ if (InnerDep == '<')
+ return true;
+
+ if (InnerDep == '>') {
+    // If OuterLoopId represents the outermost loop, then interchanging will
+    // make the first dependence '>'.
+ if (OuterLoopId == 0)
+ return false;
+
+    // If all dependencies before OuterLoopId are '=', 'S' or 'I', then
+    // interchanging will result in this row having an outermost non-'='
+    // dependence of '>'.
+ if (!containsNoDependence(DepMatrix, Row, OuterLoopId))
+ return true;
+ }
+
+ return false;
+}
+
+// Checks if it is legal to interchange 2 loops.
+// [Theorem] A permutation of the loops in a perfect nest is legal if and only if
+// the direction matrix, after the same permutation is applied to its columns,
+// has no ">" direction as the leftmost non-"=" direction in any row.
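+// For example, permuting a row ('=', '<') into ('<', '=') is still legal,
+// while permuting ('<', '>') into ('>', '<') leaves '>' as the leftmost
+// non-'=' direction in that row, so the interchange is illegal.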
+bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, unsigned InnerLoopId,
+ unsigned OuterLoopId) {
+
+ unsigned NumRows = DepMatrix.size();
+ // For each row check if it is valid to interchange.
+ for (unsigned Row = 0; Row < NumRows; ++Row) {
+ char InnerDep = DepMatrix[Row][InnerLoopId];
+ char OuterDep = DepMatrix[Row][OuterLoopId];
+ if (InnerDep == '*' || OuterDep == '*')
+ return false;
+ else if (!validDepInterchange(DepMatrix, Row, OuterLoopId, InnerDep,
+ OuterDep))
+ return false;
+ }
+ return true;
+}
+
+static void populateWorklist(Loop &L, SmallVector<LoopVector, 8> &V) {
+
+ DEBUG(dbgs() << "Calling populateWorklist called\n");
+ LoopVector LoopList;
+ Loop *CurrentLoop = &L;
+ std::vector<Loop *> vec = CurrentLoop->getSubLoopsVector();
+ while (vec.size() != 0) {
+    // The current loop has multiple subloops in it, hence it is not tightly
+    // nested. Discard the loops collected above it and add nothing to the
+    // worklist.
+ if (vec.size() != 1) {
+ LoopList.clear();
+ return;
+ }
+ LoopList.push_back(CurrentLoop);
+ CurrentLoop = *(vec.begin());
+ vec = CurrentLoop->getSubLoopsVector();
+ }
+ LoopList.push_back(CurrentLoop);
+ V.push_back(LoopList);
+}
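+
+// For example, for the tightly nested two-deep nest
+//   for (i ...) { for (j ...) { body } }
+// populateWorklist pushes the single list {i-loop, j-loop}, outermost first.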
+
+static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) {
+ PHINode *InnerIndexVar = L->getCanonicalInductionVariable();
+ if (InnerIndexVar)
+ return InnerIndexVar;
+ if (L->getLoopLatch() == nullptr || L->getLoopPredecessor() == nullptr)
+ return nullptr;
+ for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PhiVar = cast<PHINode>(I);
+ Type *PhiTy = PhiVar->getType();
+ if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
+ !PhiTy->isPointerTy())
+ return nullptr;
+ const SCEVAddRecExpr *AddRec =
+ dyn_cast<SCEVAddRecExpr>(SE->getSCEV(PhiVar));
+ if (!AddRec || !AddRec->isAffine())
+ continue;
+ const SCEV *Step = AddRec->getStepRecurrence(*SE);
+ const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
+ if (!C)
+ continue;
+ // Found the induction variable.
+ // FIXME: Handle loops with more than one induction variable. Note that,
+ // currently, legality makes sure we have only one induction variable.
+ return PhiVar;
+ }
+ return nullptr;
+}
+
+/// LoopInterchangeLegality checks if it is legal to interchange the loops.
+class LoopInterchangeLegality {
+public:
+ LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
+ LoopInterchange *Pass)
+ : OuterLoop(Outer), InnerLoop(Inner), SE(SE), CurrentPass(Pass) {}
+
+ /// Check if the loops can be interchanged.
+ bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
+ CharMatrix &DepMatrix);
+ /// Check if the loop structure is understood. We do not handle triangular
+ /// loops for now.
+ bool isLoopStructureUnderstood(PHINode *InnerInductionVar);
+
+ bool currentLimitations();
+
+private:
+ bool tightlyNested(Loop *Outer, Loop *Inner);
+
+ Loop *OuterLoop;
+ Loop *InnerLoop;
+
+ /// Scev analysis.
+ ScalarEvolution *SE;
+ LoopInterchange *CurrentPass;
+};
+
+/// LoopInterchangeProfitability checks if it is profitable to interchange the
+/// loops.
+class LoopInterchangeProfitability {
+public:
+ LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE)
+ : OuterLoop(Outer), InnerLoop(Inner), SE(SE) {}
+
+ /// Check if the loop interchange is profitable
+ bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId,
+ CharMatrix &DepMatrix);
+
+private:
+ int getInstrOrderCost();
+
+ Loop *OuterLoop;
+ Loop *InnerLoop;
+
+ /// Scev analysis.
+ ScalarEvolution *SE;
+};
+
+/// LoopInterchangeTransform interchanges the loops.
+class LoopInterchangeTransform {
+public:
+ LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
+ LoopInfo *LI, DominatorTree *DT,
+ LoopInterchange *Pass, BasicBlock *LoopNestExit)
+ : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
+ LoopExit(LoopNestExit) {}
+
+ /// Interchange OuterLoop and InnerLoop.
+ bool transform();
+ void restructureLoops(Loop *InnerLoop, Loop *OuterLoop);
+ void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
+
+private:
+ void splitInnerLoopLatch(Instruction *);
+ void splitOuterLoopLatch();
+ void splitInnerLoopHeader();
+ bool adjustLoopLinks();
+ void adjustLoopPreheaders();
+ void adjustOuterLoopPreheader();
+ void adjustInnerLoopPreheader();
+ bool adjustLoopBranches();
+
+ Loop *OuterLoop;
+ Loop *InnerLoop;
+
+ /// Scev analysis.
+ ScalarEvolution *SE;
+ LoopInfo *LI;
+ DominatorTree *DT;
+ BasicBlock *LoopExit;
+};
+
+// Main LoopInterchange Pass
+struct LoopInterchange : public FunctionPass {
+ static char ID;
+ ScalarEvolution *SE;
+ LoopInfo *LI;
+ DependenceAnalysis *DA;
+ DominatorTree *DT;
+ LoopInterchange()
+ : FunctionPass(ID), SE(nullptr), LI(nullptr), DA(nullptr), DT(nullptr) {
+ initializeLoopInterchangePass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ScalarEvolution>();
+ AU.addRequired<AliasAnalysis>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<DependenceAnalysis>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequiredID(LCSSAID);
+ }
+
+ bool runOnFunction(Function &F) override {
+ SE = &getAnalysis<ScalarEvolution>();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ DA = &getAnalysis<DependenceAnalysis>();
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ // Build up a worklist of loop pairs to analyze.
+ SmallVector<LoopVector, 8> Worklist;
+
+ for (Loop *L : *LI)
+ populateWorklist(*L, Worklist);
+
+ DEBUG(dbgs() << "Worklist size = " << Worklist.size() << "\n");
+    bool Changed = false;
+    while (!Worklist.empty()) {
+      LoopVector LoopList = Worklist.pop_back_val();
+      Changed |= processLoopList(LoopList);
+ }
+ return Changed;
+ }
+
+ bool isComputableLoopNest(LoopVector LoopList) {
+ for (auto I = LoopList.begin(), E = LoopList.end(); I != E; ++I) {
+ Loop *L = *I;
+ const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
+ if (ExitCountOuter == SE->getCouldNotCompute()) {
+ DEBUG(dbgs() << "Couldn't compute Backedge count\n");
+ return false;
+ }
+ if (L->getNumBackEdges() != 1) {
+ DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
+ return false;
+ }
+ if (!L->getExitingBlock()) {
+ DEBUG(dbgs() << "Loop Doesn't have unique exit block\n");
+ return false;
+ }
+ }
+ return true;
+ }
+
+ unsigned selectLoopForInterchange(LoopVector LoopList) {
+    // TODO: Add a better heuristic to select the loop to be interchanged based
+    // on the dependence matrix. Currently we select the innermost loop.
+ return LoopList.size() - 1;
+ }
+
+ bool processLoopList(LoopVector LoopList) {
+ bool Changed = false;
+ bool containsLCSSAPHI = false;
+ CharMatrix DependencyMatrix;
+ if (LoopList.size() < 2) {
+ DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");
+ return false;
+ }
+ if (!isComputableLoopNest(LoopList)) {
+ DEBUG(dbgs() << "Not vaild loop candidate for interchange\n");
+ return false;
+ }
+ Loop *OuterMostLoop = *(LoopList.begin());
+
+ DEBUG(dbgs() << "Processing LoopList of size = " << LoopList.size()
+ << "\n");
+
+ if (!populateDependencyMatrix(DependencyMatrix, LoopList.size(),
+ OuterMostLoop, DA)) {
+ DEBUG(dbgs() << "Populating Dependency matrix failed\n");
+ return false;
+ }
+#ifdef DUMP_DEP_MATRICIES
+ DEBUG(dbgs() << "Dependence before inter change \n");
+ printDepMatrix(DependencyMatrix);
+#endif
+
+ BasicBlock *OuterMostLoopLatch = OuterMostLoop->getLoopLatch();
+ BranchInst *OuterMostLoopLatchBI =
+ dyn_cast<BranchInst>(OuterMostLoopLatch->getTerminator());
+ if (!OuterMostLoopLatchBI)
+ return false;
+
+    // Since we currently do not handle LCSSA PHIs, any failure of the loop
+    // condition will now branch to LoopNestExit.
+ // TODO: This should be removed once we handle LCSSA PHI nodes.
+
+ // Get the Outermost loop exit.
+ BasicBlock *LoopNestExit;
+ if (OuterMostLoopLatchBI->getSuccessor(0) == OuterMostLoop->getHeader())
+ LoopNestExit = OuterMostLoopLatchBI->getSuccessor(1);
+ else
+ LoopNestExit = OuterMostLoopLatchBI->getSuccessor(0);
+
+ for (auto I = LoopList.begin(), E = LoopList.end(); I != E; ++I) {
+ Loop *L = *I;
+ BasicBlock *Latch = L->getLoopLatch();
+ BasicBlock *Header = L->getHeader();
+ if (Latch && Latch != Header && isa<PHINode>(Latch->begin())) {
+ containsLCSSAPHI = true;
+ break;
+ }
+ }
+
+ // TODO: Handle lcssa PHI's. Currently LCSSA PHI's are not handled. Handle
+ // the same by splitting the loop latch and adjusting loop links
+ // accordingly.
+ if (containsLCSSAPHI)
+ return false;
+
+ unsigned SelecLoopId = selectLoopForInterchange(LoopList);
+    // Move the selected loop outwards to the best possible position.
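+    // For example, with a three-deep nest and SelecLoopId == 2, the loop below
+    // first tries to interchange loops (2, 1) and, if that succeeds, loops
+    // (1, 0), bubbling the selected loop outwards one level at a time.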
+ for (unsigned i = SelecLoopId; i > 0; i--) {
+ bool Interchanged =
+ processLoop(LoopList, i, i - 1, LoopNestExit, DependencyMatrix);
+ if (!Interchanged)
+ return Changed;
+      // Loops were interchanged; reflect the same in LoopList.
+ std::swap(LoopList[i - 1], LoopList[i]);
+
+ // Update the DependencyMatrix
+ interChangeDepedencies(DependencyMatrix, i, i - 1);
+
+#ifdef DUMP_DEP_MATRICIES
+ DEBUG(dbgs() << "Dependence after inter change \n");
+ printDepMatrix(DependencyMatrix);
+#endif
+ Changed |= Interchanged;
+ }
+ return Changed;
+ }
+
+ bool processLoop(LoopVector LoopList, unsigned InnerLoopId,
+ unsigned OuterLoopId, BasicBlock *LoopNestExit,
+ std::vector<std::vector<char>> &DependencyMatrix) {
+
+ DEBUG(dbgs() << "Processing Innder Loop Id = " << InnerLoopId
+ << " and OuterLoopId = " << OuterLoopId << "\n");
+ Loop *InnerLoop = LoopList[InnerLoopId];
+ Loop *OuterLoop = LoopList[OuterLoopId];
+
+ LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, this);
+ if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
+ DEBUG(dbgs() << "Not interchanging Loops. Cannot prove legality\n");
+ return false;
+ }
+ DEBUG(dbgs() << "Loops are legal to interchange\n");
+ LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE);
+ if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
+ DEBUG(dbgs() << "Interchanging Loops not profitable\n");
+ return false;
+ }
+
+ LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, this,
+ LoopNestExit);
+ LIT.transform();
+ DEBUG(dbgs() << "Loops interchanged\n");
+ return true;
+ }
+};
+
+} // end of namespace
+
+static bool containsUnsafeInstructions(BasicBlock *BB) {
+ for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
+ if (I->mayHaveSideEffects() || I->mayReadFromMemory())
+ return true;
+ }
+ return false;
+}
+
+bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
+ BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
+
+ DEBUG(dbgs() << "Checking if Loops are Tightly Nested\n");
+
+  // A perfectly nested loop will not have any branch in between the outer and
+  // inner block, i.e. the outer header will branch to either the inner
+  // preheader or the outer loop latch.
+ BranchInst *outerLoopHeaderBI =
+ dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
+ if (!outerLoopHeaderBI)
+ return false;
+ unsigned num = outerLoopHeaderBI->getNumSuccessors();
+ for (unsigned i = 0; i < num; i++) {
+ if (outerLoopHeaderBI->getSuccessor(i) != InnerLoopPreHeader &&
+ outerLoopHeaderBI->getSuccessor(i) != OuterLoopLatch)
+ return false;
+ }
+
+ DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch \n");
+  // We do not have any basic block in between; now make sure the outer header
+  // and the outer loop latch don't contain any unsafe instructions.
+ if (containsUnsafeInstructions(OuterLoopHeader) ||
+ containsUnsafeInstructions(OuterLoopLatch))
+ return false;
+
+ DEBUG(dbgs() << "Loops are perfectly nested \n");
+ // We have a perfect loop nest.
+ return true;
+}
+
+static unsigned getPHICount(BasicBlock *BB) {
+ unsigned PhiCount = 0;
+ for (auto I = BB->begin(); isa<PHINode>(I); ++I)
+ PhiCount++;
+ return PhiCount;
+}
+
+bool LoopInterchangeLegality::isLoopStructureUnderstood(
+ PHINode *InnerInduction) {
+
+ unsigned Num = InnerInduction->getNumOperands();
+ BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
+ for (unsigned i = 0; i < Num; ++i) {
+ Value *Val = InnerInduction->getOperand(i);
+ if (isa<Constant>(Val))
+ continue;
+ Instruction *I = dyn_cast<Instruction>(Val);
+ if (!I)
+ return false;
+ // TODO: Handle triangular loops.
+ // e.g. for(int i=0;i<N;i++)
+ // for(int j=i;j<N;j++)
+ unsigned IncomingBlockIndex = PHINode::getIncomingValueNumForOperand(i);
+ if (InnerInduction->getIncomingBlock(IncomingBlockIndex) ==
+ InnerLoopPreheader &&
+ !OuterLoop->isLoopInvariant(I)) {
+ return false;
+ }
+ }
+ return true;
+}
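+// Illustrative sketch: the nest below is understood because the inner
+// induction's start value is invariant in the outer loop:
+//   for (int i = 0; i < N; ++i)
+//     for (int j = 0; j < M; ++j)   // j starts at a loop-invariant value
+//       ...
+// whereas the triangular form "for (int j = i; ...)" is rejected above.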
+
+// Returns true when the loop nest hits a limitation of the current
+// implementation, in which case we do not proceed with the interchange.
+bool LoopInterchangeLegality::currentLimitations() {
+
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+ BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+ BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
+ BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
+
+ PHINode *InnerInductionVar;
+ PHINode *OuterInductionVar;
+
+ // We currently handle only one induction variable per loop, and we do not
+ // yet handle reductions.
+ if (getPHICount(InnerLoopHeader) > 1)
+ return true;
+
+ if (getPHICount(OuterLoopHeader) > 1)
+ return true;
+
+ InnerInductionVar = getInductionVariable(InnerLoop, SE);
+ OuterInductionVar = getInductionVariable(OuterLoop, SE);
+
+ if (!OuterInductionVar || !InnerInductionVar) {
+ DEBUG(dbgs() << "Induction variable not found\n");
+ return true;
+ }
+
+ // TODO: Triangular loops are not handled for now.
+ if (!isLoopStructureUnderstood(InnerInductionVar)) {
+ DEBUG(dbgs() << "Loop structure not understood by pass\n");
+ return true;
+ }
+
+ // TODO: Loops with LCSSA PHIs are currently not handled.
+ if (isa<PHINode>(OuterLoopLatch->begin())) {
+ DEBUG(dbgs() << "Found and LCSSA PHI in outer loop latch\n");
+ return true;
+ }
+ if (InnerLoopLatch != InnerLoopHeader &&
+ isa<PHINode>(InnerLoopLatch->begin())) {
+ DEBUG(dbgs() << "Found and LCSSA PHI in inner loop latch\n");
+ return true;
+ }
+
+ // TODO: Current limitation: since we split the inner loop latch at the
+ // point where the induction variable is incremented (induction.next), we
+ // cannot have more than one user of induction.next, as extra users would be
+ // left with broken code after the split.
+ // e.g.
+ // for(i=0;i<N;i++) {
+ // for(j = 0;j<M;j++) {
+ // A[j+1][i+2] = A[j][i]+k;
+ // }
+ // }
+ bool FoundInduction = false;
+ Instruction *InnerIndexVarInc = nullptr;
+ if (InnerInductionVar->getIncomingBlock(0) == InnerLoopPreHeader)
+ InnerIndexVarInc =
+ dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(1));
+ else
+ InnerIndexVarInc =
+ dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0));
+
+ if (!InnerIndexVarInc)
+ return true;
+
+ // Since we split the inner loop latch on this induction variable, make sure
+ // there is no instruction between the induction variable increment and the
+ // branch instruction.
+
+ for (auto I = InnerLoopLatch->rbegin(), E = InnerLoopLatch->rend();
+ I != E && !FoundInduction; ++I) {
+ if (isa<BranchInst>(*I) || isa<CmpInst>(*I) || isa<TruncInst>(*I))
+ continue;
+ const Instruction &Ins = *I;
+ // We found an instruction. If it is not the induction variable increment,
+ // it is not safe to split this loop latch.
+ if (!Ins.isIdenticalTo(InnerIndexVarInc))
+ return true;
+ else
+ FoundInduction = true;
+ }
+ // We reached the end of the loop latch without finding the induction
+ // variable; treat this as a current limitation.
+ if (!FoundInduction)
+ return true;
+
+ return false;
+}
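+// Illustrative latch shape accepted by the check above (a sketch in LLVM IR;
+// the value names are hypothetical):
+//   %j.next = add nsw i32 %j, 1                 ; the induction increment
+//   %cmp = icmp slt i32 %j.next, %M             ; cmp/trunc/br are skipped
+//   br i1 %cmp, label %for.body, label %for.end
+// Any other instruction between the increment and the branch makes
+// currentLimitations() return true.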
+
+bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
+ unsigned OuterLoopId,
+ CharMatrix &DepMatrix) {
+
+ if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
+ DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
+ << "and OuterLoopId = " << OuterLoopId
+ << "due to dependence\n");
+ return false;
+ }
+
+ // Create unique preheaders if we do not already have them.
+ BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+
+ // Create a unique outer preheader when:
+ // 1) the outer loop has no preheader,
+ // 2) the outer preheader is the same as the outer header,
+ // 3) the outer preheader is the same as the previous loop's header, or
+ // 4) the outer preheader is the entry node.
+ if (!OuterLoopPreHeader || OuterLoopPreHeader == OuterLoop->getHeader() ||
+ isa<PHINode>(OuterLoopPreHeader->begin()) ||
+ !OuterLoopPreHeader->getUniquePredecessor()) {
+ OuterLoopPreHeader = InsertPreheaderForLoop(OuterLoop, CurrentPass);
+ }
+
+ if (!InnerLoopPreHeader || InnerLoopPreHeader == InnerLoop->getHeader() ||
+ InnerLoopPreHeader == OuterLoop->getHeader()) {
+ InnerLoopPreHeader = InsertPreheaderForLoop(InnerLoop, CurrentPass);
+ }
+
+ // Check if the loops are tightly nested.
+ if (!tightlyNested(OuterLoop, InnerLoop)) {
+ DEBUG(dbgs() << "Loops not tightly nested\n");
+ return false;
+ }
+
+ // The loops cannot be interchanged due to current limitations in the
+ // transform; see the TODOs in currentLimitations().
+ if (currentLimitations()) {
+ DEBUG(dbgs() << "Not legal because of current transform limitation\n");
+ return false;
+ }
+
+ return true;
+}
+
+int LoopInterchangeProfitability::getInstrOrderCost() {
+ unsigned GoodOrder = 0, BadOrder = 0;
+ for (auto BI = InnerLoop->block_begin(), BE = InnerLoop->block_end();
+ BI != BE; ++BI) {
+ for (auto I = (*BI)->begin(), E = (*BI)->end(); I != E; ++I) {
+ const Instruction &Ins = *I;
+ if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) {
+ unsigned NumOp = GEP->getNumOperands();
+ bool FoundInnerInduction = false;
+ bool FoundOuterInduction = false;
+ for (unsigned i = 0; i < NumOp; ++i) {
+ const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i));
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal);
+ if (!AR)
+ continue;
+
+ // If we find the inner induction after an outer induction e.g.
+ // for(int i=0;i<N;i++)
+ // for(int j=0;j<N;j++)
+ // A[i][j] = A[i-1][j-1]+k;
+ // then it is a good order.
+ if (AR->getLoop() == InnerLoop) {
+ // We found an InnerLoop induction after OuterLoop induction. It is
+ // a good order.
+ FoundInnerInduction = true;
+ if (FoundOuterInduction) {
+ GoodOrder++;
+ break;
+ }
+ }
+ // If we find the outer induction after an inner induction e.g.
+ // for(int i=0;i<N;i++)
+ // for(int j=0;j<N;j++)
+ // A[j][i] = A[j-1][i-1]+k;
+ // then it is a bad order.
+ if (AR->getLoop() == OuterLoop) {
+ // We found an OuterLoop induction after InnerLoop induction. It is
+ // a bad order.
+ FoundOuterInduction = true;
+ if (FoundInnerInduction) {
+ BadOrder++;
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+ return GoodOrder - BadOrder;
+}
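+// Worked example (sketch): for an (i,j) nest computing A[j][i] = A[j][i] + k,
+// each GEP visits the inner induction (j) before the outer one (i), so
+// BadOrder is incremented per access and the returned cost is negative,
+// which isProfitable() below treats as a reason to interchange.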
+
+static bool isProfitableForVectorization(unsigned InnerLoopId,
+ unsigned OuterLoopId,
+ CharMatrix &DepMatrix) {
+ // TODO: Improve this heuristic to catch more cases.
+ // If the inner loop carries no dependence (i.e. it is loop independent), it
+ // is profitable to move it to the outer position.
+ unsigned Row = DepMatrix.size();
+ for (unsigned i = 0; i < Row; ++i) {
+ if (DepMatrix[i][InnerLoopId] != 'S' && DepMatrix[i][InnerLoopId] != 'I')
+ return false;
+ // TODO: We need to improve this heuristic.
+ if (DepMatrix[i][OuterLoopId] != '=')
+ return false;
+ }
+ // If the outer loop carries a dependence and the inner loop is loop
+ // independent, interchanging them is profitable because it enables
+ // parallelism.
+ return true;
+}
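+// Illustrative dependency matrix (a sketch; rows are dependences, columns are
+// loop levels, assuming 'S'/'I' mark the scalar/independent directions used
+// by this pass):
+//   { {'=', 'S'},
+//     {'=', 'I'} }
+// Every inner-loop entry is 'S' or 'I' and every outer-loop entry is '=',
+// so the helper above reports the interchange as profitable.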
+
+bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
+ unsigned OuterLoopId,
+ CharMatrix &DepMatrix) {
+
+ // TODO: Add better profitability checks.
+ // e.g
+ // 1) Construct dependency matrix and move the one with no loop carried dep
+ // inside to enable vectorization.
+
+ // This is a rough cost-estimation algorithm. It counts the good and bad
+ // orderings of the induction variables in the instructions and allows
+ // interchange when the bad orderings outnumber the good ones.
+ int Cost = 0;
+ Cost += getInstrOrderCost();
+ DEBUG(dbgs() << "Cost = " << Cost << "\n");
+ if (Cost < 0)
+ return true;
+
+ // The interchange is not profitable under the current cache model, but
+ // check whether we can move this loop outward to improve parallelism.
+ bool ImprovesPar =
+ isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
+ return ImprovesPar;
+}
+
+void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
+ Loop *InnerLoop) {
+ for (Loop::iterator I = OuterLoop->begin(), E = OuterLoop->end(); I != E;
+ ++I) {
+ if (*I == InnerLoop) {
+ OuterLoop->removeChildLoop(I);
+ return;
+ }
+ }
+ llvm_unreachable("Couldn't find loop");
+}
+
+void LoopInterchangeTransform::restructureLoops(Loop *InnerLoop,
+ Loop *OuterLoop) {
+ Loop *OuterLoopParent = OuterLoop->getParentLoop();
+ if (OuterLoopParent) {
+ // Remove the loop from its parent loop.
+ removeChildLoop(OuterLoopParent, OuterLoop);
+ removeChildLoop(OuterLoop, InnerLoop);
+ OuterLoopParent->addChildLoop(InnerLoop);
+ } else {
+ removeChildLoop(OuterLoop, InnerLoop);
+ LI->changeTopLevelLoop(OuterLoop, InnerLoop);
+ }
+
+ for (Loop::iterator I = InnerLoop->begin(), E = InnerLoop->end(); I != E; ++I)
+ OuterLoop->addChildLoop(InnerLoop->removeChildLoop(I));
+
+ InnerLoop->addChildLoop(OuterLoop);
+}
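+// Illustrative sketch of the LoopInfo update above: a nesting of
+//   Parent { Outer { Inner { C1, C2 } } }
+// becomes
+//   Parent { Inner { Outer { C1, C2 } } }
+// i.e. Inner takes Outer's place under Parent (or as a top-level loop),
+// Inner's children are re-parented to Outer, and Outer becomes Inner's child.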
+
+bool LoopInterchangeTransform::transform() {
+
+ DEBUG(dbgs() << "transform\n");
+ bool Transformed = false;
+ Instruction *InnerIndexVar;
+
+ if (InnerLoop->getSubLoops().empty()) {
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ DEBUG(dbgs() << "Calling Split Inner Loop\n");
+ PHINode *InductionPHI = getInductionVariable(InnerLoop, SE);
+ if (!InductionPHI) {
+ DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
+ return false;
+ }
+
+ if (InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
+ InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(1));
+ else
+ InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0));
+
+ // Split at the point where the induction variable is
+ // incremented/decremented.
+ // TODO: This splitting logic may not always work. Fix this.
+ splitInnerLoopLatch(InnerIndexVar);
+ DEBUG(dbgs() << "splitInnerLoopLatch Done\n");
+
+ // Split the inner loop's PHI nodes out into a separate basic block.
+ splitInnerLoopHeader();
+ DEBUG(dbgs() << "splitInnerLoopHeader Done\n");
+ }
+
+ Transformed |= adjustLoopLinks();
+ if (!Transformed) {
+ DEBUG(dbgs() << "adjustLoopLinks Failed\n");
+ return false;
+ }
+
+ restructureLoops(InnerLoop, OuterLoop);
+ return true;
+}
+
+void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *Inc) {
+ BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
+ BasicBlock *InnerLoopLatchPred = InnerLoopLatch;
+ InnerLoopLatch = SplitBlock(InnerLoopLatchPred, Inc, DT, LI);
+}
+
+void LoopInterchangeTransform::splitOuterLoopLatch() {
+ BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
+ BasicBlock *OuterLatchLcssaPhiBlock = OuterLoopLatch;
+ OuterLoopLatch = SplitBlock(OuterLatchLcssaPhiBlock,
+ OuterLoopLatch->getFirstNonPHI(), DT, LI);
+}
+
+void LoopInterchangeTransform::splitInnerLoopHeader() {
+
+ // Split the inner loop header out.
+ BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+ SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);
+
+ DEBUG(dbgs() << "Output of splitInnerLoopHeader InnerLoopHeaderSucc & "
+ "InnerLoopHeader \n");
+}
+
+/// \brief Move all instructions except the terminator from FromBB to just
+/// before InsertBefore.
+static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
+ auto &ToList = InsertBefore->getParent()->getInstList();
+ auto &FromList = FromBB->getInstList();
+
+ ToList.splice(InsertBefore, FromList, FromList.begin(),
+ FromBB->getTerminator());
+}
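+// For example (a sketch): if FromBB is { i1; i2; br } and InsertBefore is the
+// terminator of a block { j1; br }, the destination becomes { j1; i1; i2; br }
+// and FromBB keeps only its terminator.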
+
+void LoopInterchangeTransform::adjustOuterLoopPreheader() {
+ BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+ BasicBlock *InnerPreHeader = InnerLoop->getLoopPreheader();
+
+ moveBBContents(OuterLoopPreHeader, InnerPreHeader->getTerminator());
+}
+
+void LoopInterchangeTransform::adjustInnerLoopPreheader() {
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ BasicBlock *OuterHeader = OuterLoop->getHeader();
+
+ moveBBContents(InnerLoopPreHeader, OuterHeader->getTerminator());
+}
+
+bool LoopInterchangeTransform::adjustLoopBranches() {
+
+ DEBUG(dbgs() << "adjustLoopBranches called\n");
+ // Gather the blocks whose branches need adjusting.
+ BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+ BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+ BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
+ BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
+ BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
+ BasicBlock *InnerLoopLatchPredecessor =
+ InnerLoopLatch->getUniquePredecessor();
+ BasicBlock *InnerLoopLatchSuccessor;
+ BasicBlock *OuterLoopLatchSuccessor;
+
+ BranchInst *OuterLoopLatchBI =
+ dyn_cast<BranchInst>(OuterLoopLatch->getTerminator());
+ BranchInst *InnerLoopLatchBI =
+ dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
+ BranchInst *OuterLoopHeaderBI =
+ dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
+ BranchInst *InnerLoopHeaderBI =
+ dyn_cast<BranchInst>(InnerLoopHeader->getTerminator());
+
+ if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
+ !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
+ !InnerLoopHeaderBI)
+ return false;
+
+ BranchInst *InnerLoopLatchPredecessorBI =
+ dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator());
+ BranchInst *OuterLoopPredecessorBI =
+ dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator());
+
+ if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
+ return false;
+ BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
+ if (!InnerLoopHeaderSuccessor)
+ return false;
+
+ // Adjust the loop preheaders and headers.
+
+ unsigned NumSucc = OuterLoopPredecessorBI->getNumSuccessors();
+ for (unsigned i = 0; i < NumSucc; ++i) {
+ if (OuterLoopPredecessorBI->getSuccessor(i) == OuterLoopPreHeader)
+ OuterLoopPredecessorBI->setSuccessor(i, InnerLoopPreHeader);
+ }
+
+ NumSucc = OuterLoopHeaderBI->getNumSuccessors();
+ for (unsigned i = 0; i < NumSucc; ++i) {
+ if (OuterLoopHeaderBI->getSuccessor(i) == OuterLoopLatch)
+ OuterLoopHeaderBI->setSuccessor(i, LoopExit);
+ else if (OuterLoopHeaderBI->getSuccessor(i) == InnerLoopPreHeader)
+ OuterLoopHeaderBI->setSuccessor(i, InnerLoopHeaderSuccessor);
+ }
+
+ BranchInst::Create(OuterLoopPreHeader, InnerLoopHeaderBI);
+ InnerLoopHeaderBI->eraseFromParent();
+
+ // -------------Adjust loop latches-----------
+ if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
+ InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
+ else
+ InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
+
+ NumSucc = InnerLoopLatchPredecessorBI->getNumSuccessors();
+ for (unsigned i = 0; i < NumSucc; ++i) {
+ if (InnerLoopLatchPredecessorBI->getSuccessor(i) == InnerLoopLatch)
+ InnerLoopLatchPredecessorBI->setSuccessor(i, InnerLoopLatchSuccessor);
+ }
+
+ if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
+ OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
+ else
+ OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
+
+ if (InnerLoopLatchBI->getSuccessor(1) == InnerLoopLatchSuccessor)
+ InnerLoopLatchBI->setSuccessor(1, OuterLoopLatchSuccessor);
+ else
+ InnerLoopLatchBI->setSuccessor(0, OuterLoopLatchSuccessor);
+
+ if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopLatchSuccessor) {
+ OuterLoopLatchBI->setSuccessor(0, InnerLoopLatch);
+ } else {
+ OuterLoopLatchBI->setSuccessor(1, InnerLoopLatch);
+ }
+
+ return true;
+}
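+// Simplified recap of adjustLoopBranches() above (a sketch, not a full CFG
+// listing): control now enters through the old inner preheader; the old outer
+// header branches to the loop exit and to the old inner header's successor;
+// the old inner header branches unconditionally to the old outer preheader;
+// the inner latch's exit successor becomes the outer latch's old exit
+// successor, and the outer latch is redirected to the inner latch.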
+
+void LoopInterchangeTransform::adjustLoopPreheaders() {
+
+ // We have interchanged the preheaders, so we need to interchange their
+ // contents as well: the contents of the inner preheader were previously
+ // executed inside the outer loop.
+ BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+ BranchInst *InnerTermBI =
+ cast<BranchInst>(InnerLoopPreHeader->getTerminator());
+
+ BasicBlock *HeaderSplit =
+ SplitBlock(OuterLoopHeader, OuterLoopHeader->getTerminator(), DT, LI);
+ Instruction *InsPoint = HeaderSplit->getFirstNonPHI();
+ // These instructions should now execute inside the loop, so move them into
+ // a new block after the outer header.
+ moveBBContents(InnerLoopPreHeader, InsPoint);
+ // These instructions were not previously executed inside the loop, so move
+ // them to the old inner loop preheader.
+ moveBBContents(OuterLoopPreHeader, InnerTermBI);
+}
+
+bool LoopInterchangeTransform::adjustLoopLinks() {
+
+ // Adjust all branches in the inner and outer loop.
+ bool Changed = adjustLoopBranches();
+ if (Changed)
+ adjustLoopPreheaders();
+ return Changed;
+}
+
+char LoopInterchange::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopInterchange, "loop-interchange",
+ "Interchanges loops for cache reuse", false, false)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DependenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+
+INITIALIZE_PASS_END(LoopInterchange, "loop-interchange",
+ "Interchanges loops for cache reuse", false, false)
+
+Pass *llvm::createLoopInterchangePass() { return new LoopInterchange(); }
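+// The pass is registered as "loop-interchange"; a quick way to exercise it is
+// e.g. "opt -loop-interchange -S input.ll" (usage sketch).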
diff --git a/lib/Transforms/Scalar/LoopRerollPass.cpp b/lib/Transforms/Scalar/LoopRerollPass.cpp
index fdf7e3b..ed103e6 100644
--- a/lib/Transforms/Scalar/LoopRerollPass.cpp
+++ b/lib/Transforms/Scalar/LoopRerollPass.cpp
@@ -23,6 +23,7 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
@@ -30,7 +31,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
@@ -160,7 +160,6 @@ namespace {
AliasAnalysis *AA;
LoopInfo *LI;
ScalarEvolution *SE;
- const DataLayout *DL;
TargetLibraryInfo *TLI;
DominatorTree *DT;
@@ -367,10 +366,8 @@ namespace {
struct DAGRootTracker {
DAGRootTracker(LoopReroll *Parent, Loop *L, Instruction *IV,
ScalarEvolution *SE, AliasAnalysis *AA,
- TargetLibraryInfo *TLI, const DataLayout *DL)
- : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI),
- DL(DL), IV(IV) {
- }
+ TargetLibraryInfo *TLI)
+ : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), IV(IV) {}
/// Stage 1: Find all the DAG roots for the induction variable.
bool findRoots();
@@ -416,7 +413,6 @@ namespace {
ScalarEvolution *SE;
AliasAnalysis *AA;
TargetLibraryInfo *TLI;
- const DataLayout *DL;
// The loop induction variable.
Instruction *IV;
@@ -1131,7 +1127,7 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
// needed because otherwise isSafeToSpeculativelyExecute returns
// false on PHI nodes.
if (!isa<PHINode>(I) && !isSimpleLoadStore(I) &&
- !isSafeToSpeculativelyExecute(I, DL))
+ !isSafeToSpeculativelyExecute(I))
// Intervening instructions cause side effects.
FutureSideEffects = true;
}
@@ -1161,11 +1157,10 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
// side effects, and this instruction might also, then we can't reorder
// them, and this matching fails. As an exception, we allow the alias
// set tracker to handle regular (simple) load/store dependencies.
- if (FutureSideEffects &&
- ((!isSimpleLoadStore(BaseInst) &&
- !isSafeToSpeculativelyExecute(BaseInst, DL)) ||
- (!isSimpleLoadStore(RootInst) &&
- !isSafeToSpeculativelyExecute(RootInst, DL)))) {
+ if (FutureSideEffects && ((!isSimpleLoadStore(BaseInst) &&
+ !isSafeToSpeculativelyExecute(BaseInst)) ||
+ (!isSimpleLoadStore(RootInst) &&
+ !isSafeToSpeculativelyExecute(RootInst)))) {
DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
" vs. " << *RootInst <<
" (side effects prevent reordering)\n");
@@ -1272,6 +1267,7 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) {
++J;
}
+ const DataLayout &DL = Header->getModule()->getDataLayout();
// We need to create a new induction variable for each different BaseInst.
for (auto &DRS : RootSets) {
@@ -1284,7 +1280,7 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) {
SE->getConstant(RealIVSCEV->getType(), 1),
L, SCEV::FlagAnyWrap));
{ // Limit the lifetime of SCEVExpander.
- SCEVExpander Expander(*SE, "reroll");
+ SCEVExpander Expander(*SE, DL, "reroll");
Value *NewIV = Expander.expandCodeFor(H, IV->getType(), Header->begin());
for (auto &KV : Uses) {
@@ -1324,7 +1320,7 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) {
}
}
- SimplifyInstructionsInBlock(Header, DL, TLI);
+ SimplifyInstructionsInBlock(Header, TLI);
DeleteDeadPHIs(Header, TLI);
}
@@ -1448,7 +1444,7 @@ void LoopReroll::ReductionTracker::replaceSelected() {
bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
const SCEV *IterCount,
ReductionTracker &Reductions) {
- DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DL);
+ DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI);
if (!DAGRoots.findRoots())
return false;
@@ -1477,8 +1473,6 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
SE = &getAnalysis<ScalarEvolution>();
TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- DL = DLP ? &DLP->getDataLayout() : nullptr;
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
BasicBlock *Header = L->getHeader();
diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp
index 4d12349..a675e12 100644
--- a/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/lib/Transforms/Scalar/LoopRotation.cpp
@@ -24,8 +24,10 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -412,6 +414,8 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
for (; PHINode *PN = dyn_cast<PHINode>(I); ++I)
ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader);
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+
// For the rest of the instructions, either hoist to the OrigPreheader if
// possible or create a clone in the OldPreHeader if not.
TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator();
@@ -442,8 +446,8 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
// With the operands remapped, see if the instruction constant folds or is
// otherwise simplifyable. This commonly occurs because the entry from PHI
// nodes allows icmps and other instructions to fold.
- // FIXME: Provide DL, TLI, DT, AC to SimplifyInstruction.
- Value *V = SimplifyInstruction(C);
+ // FIXME: Provide TLI, DT, AC to SimplifyInstruction.
+ Value *V = SimplifyInstruction(C, DL);
if (V && LI->replacementPreservesLCSSAForm(C, V)) {
// If so, then delete the temporary instruction and stick the folded value
// in the map.
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 318065e..8445d5f 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -68,6 +68,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -3825,7 +3826,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
if (C->getValue()->isNegative() !=
(NewF.BaseOffset < 0) &&
(C->getValue()->getValue().abs() * APInt(BitWidth, F.Scale))
- .ule(abs64(NewF.BaseOffset)))
+ .ule(std::abs(NewF.BaseOffset)))
continue;
// OK, looks good.
@@ -3856,7 +3857,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
J != JE; ++J)
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*J))
if ((C->getValue()->getValue() + NewF.BaseOffset).abs().slt(
- abs64(NewF.BaseOffset)) &&
+ std::abs(NewF.BaseOffset)) &&
(C->getValue()->getValue() +
NewF.BaseOffset).countTrailingZeros() >=
countTrailingZeros<uint64_t>(NewF.BaseOffset))
@@ -4823,7 +4824,8 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
// we can remove them after we are done working.
SmallVector<WeakVH, 16> DeadInsts;
- SCEVExpander Rewriter(SE, "lsr");
+ SCEVExpander Rewriter(SE, L->getHeader()->getModule()->getDataLayout(),
+ "lsr");
#ifndef NDEBUG
Rewriter.setDebugType(DEBUG_TYPE);
#endif
@@ -5093,7 +5095,8 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
Changed |= DeleteDeadPHIs(L->getHeader());
if (EnablePhiElim && L->isLoopSimplifyForm()) {
SmallVector<WeakVH, 16> DeadInsts;
- SCEVExpander Rewriter(getAnalysis<ScalarEvolution>(), "lsr");
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+ SCEVExpander Rewriter(getAnalysis<ScalarEvolution>(), DL, "lsr");
#ifndef NDEBUG
Rewriter.setDebugType(DEBUG_TYPE);
#endif
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 924be16..600cbde 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
@@ -23,14 +24,13 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/Analysis/InstructionSimplify.h"
#include <climits>
using namespace llvm;
@@ -259,6 +259,7 @@ static bool isLoadFromConstantInitializer(Value *V) {
return false;
}
+namespace {
struct FindConstantPointers {
bool LoadCanBeConstantFolded;
bool IndexIsConstant;
@@ -356,11 +357,12 @@ class UnrollAnalyzer : public InstVisitor<UnrollAnalyzer, bool> {
if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
RHS = SimpleRHS;
Value *SimpleV = nullptr;
+ const DataLayout &DL = I.getModule()->getDataLayout();
if (auto FI = dyn_cast<FPMathOperator>(&I))
SimpleV =
- SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags());
+ SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL);
else
- SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS);
+ SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);
if (SimpleV && CountedInstructions.insert(&I).second)
NumberOfOptimizedInstructions += TTI.getUserCost(&I);
@@ -540,6 +542,7 @@ public:
return NumberOfOptimizedInstructions;
}
};
+} // namespace
// Complete loop unrolling can make some loads constant, and we need to know if
// that would expose any further optimization opportunities.
@@ -619,6 +622,11 @@ static bool HasUnrollDisablePragma(const Loop *L) {
return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.disable");
}
+// Returns true if the loop has a runtime unroll(disable) pragma.
+static bool HasRuntimeUnrollDisablePragma(const Loop *L) {
+ return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable");
+}
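+// For example (a sketch; the exact metadata shape is an assumption), a loop
+// whose backedge branch carries
+//   br ... !llvm.loop !0
+//   !0 = distinct !{!0, !1}
+//   !1 = !{!"llvm.loop.unroll.runtime.disable"}
+// makes GetUnrollMetadataForLoop find the string above, so AllowRuntime is
+// forced to false below.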
+
// If loop has an unroll_count pragma return the (necessarily
// positive) value from the pragma. Otherwise return 0.
static unsigned UnrollCountPragmaValue(const Loop *L) {
@@ -807,6 +815,9 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
// Reduce count based on the type of unrolling and the threshold values.
unsigned OriginalCount = Count;
bool AllowRuntime = UserRuntime ? CurrentRuntime : UP.Runtime;
+ if (HasRuntimeUnrollDisablePragma(L)) {
+ AllowRuntime = false;
+ }
if (Unrolling == Partial) {
bool AllowPartial = UserAllowPartial ? CurrentAllowPartial : UP.Partial;
if (!AllowPartial && !CountSetExplicitly) {
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index 987dc96..988d2af 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -42,6 +42,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -1082,6 +1083,7 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
/// pass.
///
void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
while (!Worklist.empty()) {
Instruction *I = Worklist.back();
Worklist.pop_back();
@@ -1104,7 +1106,7 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
// See if instruction simplification can hack this up. This is common for
// things like "select false, X, Y" after unswitching made the condition be
// 'false'. TODO: update the domtree properly so we can pass it here.
- if (Value *V = SimplifyInstruction(I))
+ if (Value *V = SimplifyInstruction(I, DL))
if (LI->replacementPreservesLCSSAForm(I, V)) {
ReplaceUsesOfWith(I, V, Worklist, L, LPM);
continue;
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 006b885..2b5a078 100644
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -18,6 +18,7 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
@@ -28,7 +29,6 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/Local.h"
#include <list>
using namespace llvm;
@@ -41,7 +41,8 @@ STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy");
STATISTIC(NumCpyToSet, "Number of memcpys converted to memset");
static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx,
- bool &VariableIdxFound, const DataLayout &TD){
+ bool &VariableIdxFound,
+ const DataLayout &DL) {
// Skip over the first indices.
gep_type_iterator GTI = gep_type_begin(GEP);
for (unsigned i = 1; i != Idx; ++i, ++GTI)
@@ -57,13 +58,13 @@ static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx,
// Handle struct indices, which add their field offset to the pointer.
if (StructType *STy = dyn_cast<StructType>(*GTI)) {
- Offset += TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue());
+ Offset += DL.getStructLayout(STy)->getElementOffset(OpC->getZExtValue());
continue;
}
// Otherwise, we have a sequential type like an array or vector. Multiply
// the index by the ElementSize.
- uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType());
+ uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType());
Offset += Size*OpC->getSExtValue();
}
@@ -74,7 +75,7 @@ static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx,
/// constant offset, and return that constant offset. For example, Ptr1 might
/// be &A[42], and Ptr2 might be &A[40]. In this case offset would be -8.
static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
- const DataLayout &TD) {
+ const DataLayout &DL) {
Ptr1 = Ptr1->stripPointerCasts();
Ptr2 = Ptr2->stripPointerCasts();
@@ -92,12 +93,12 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
// If one pointer is a GEP and the other isn't, then see if the GEP is a
// constant offset from the base, as in "P" and "gep P, 1".
if (GEP1 && !GEP2 && GEP1->getOperand(0)->stripPointerCasts() == Ptr2) {
- Offset = -GetOffsetFromIndex(GEP1, 1, VariableIdxFound, TD);
+ Offset = -GetOffsetFromIndex(GEP1, 1, VariableIdxFound, DL);
return !VariableIdxFound;
}
if (GEP2 && !GEP1 && GEP2->getOperand(0)->stripPointerCasts() == Ptr1) {
- Offset = GetOffsetFromIndex(GEP2, 1, VariableIdxFound, TD);
+ Offset = GetOffsetFromIndex(GEP2, 1, VariableIdxFound, DL);
return !VariableIdxFound;
}
@@ -115,8 +116,8 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
if (GEP1->getOperand(Idx) != GEP2->getOperand(Idx))
break;
- int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, TD);
- int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, TD);
+ int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, DL);
+ int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, DL);
if (VariableIdxFound) return false;
Offset = Offset2-Offset1;
@@ -150,12 +151,11 @@ struct MemsetRange {
/// TheStores - The actual stores that make up this range.
SmallVector<Instruction*, 16> TheStores;
- bool isProfitableToUseMemset(const DataLayout &TD) const;
-
+ bool isProfitableToUseMemset(const DataLayout &DL) const;
};
} // end anon namespace
-bool MemsetRange::isProfitableToUseMemset(const DataLayout &TD) const {
+bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
// If we found more than 4 stores to merge or 16 bytes, use memset.
if (TheStores.size() >= 4 || End-Start >= 16) return true;
@@ -183,7 +183,7 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &TD) const {
// size. If so, check to see whether we will end up actually reducing the
// number of stores used.
unsigned Bytes = unsigned(End-Start);
- unsigned MaxIntSize = TD.getLargestLegalIntTypeSize();
+ unsigned MaxIntSize = DL.getLargestLegalIntTypeSize();
if (MaxIntSize == 0)
MaxIntSize = 1;
unsigned NumPointerStores = Bytes / MaxIntSize;
@@ -314,14 +314,12 @@ namespace {
class MemCpyOpt : public FunctionPass {
MemoryDependenceAnalysis *MD;
TargetLibraryInfo *TLI;
- const DataLayout *DL;
public:
static char ID; // Pass identification, replacement for typeid
MemCpyOpt() : FunctionPass(ID) {
initializeMemCpyOptPass(*PassRegistry::getPassRegistry());
MD = nullptr;
TLI = nullptr;
- DL = nullptr;
}
bool runOnFunction(Function &F) override;
@@ -377,13 +375,13 @@ INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization",
/// attempts to merge them together into a memcpy/memset.
Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
Value *StartPtr, Value *ByteVal) {
- if (!DL) return nullptr;
+ const DataLayout &DL = StartInst->getModule()->getDataLayout();
// Okay, so we now have a single store that can be splatable. Scan to find
// all subsequent stores of the same value to offset from the same pointer.
// Join these together into ranges, so we can decide whether contiguous blocks
// are stored.
- MemsetRanges Ranges(*DL);
+ MemsetRanges Ranges(DL);
BasicBlock::iterator BI = StartInst;
for (++BI; !isa<TerminatorInst>(BI); ++BI) {
@@ -406,8 +404,8 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
// Check to see if this store is to a constant offset from the start ptr.
int64_t Offset;
- if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(),
- Offset, *DL))
+ if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), Offset,
+ DL))
break;
Ranges.addStore(Offset, NextStore);
@@ -420,7 +418,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
// Check to see if this store is to a constant offset from the start ptr.
int64_t Offset;
- if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, *DL))
+ if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, DL))
break;
Ranges.addMemSet(Offset, MSI);
@@ -452,7 +450,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
if (Range.TheStores.size() == 1) continue;
// If it is profitable to lower this range to memset, do so now.
- if (!Range.isProfitableToUseMemset(*DL))
+ if (!Range.isProfitableToUseMemset(DL))
continue;
// Otherwise, we do want to transform this! Create a new memset.
@@ -464,7 +462,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
if (Alignment == 0) {
Type *EltType =
cast<PointerType>(StartPtr->getType())->getElementType();
- Alignment = DL->getABITypeAlignment(EltType);
+ Alignment = DL.getABITypeAlignment(EltType);
}
AMemSet =
@@ -494,8 +492,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
if (!SI->isSimple()) return false;
-
- if (!DL) return false;
+ const DataLayout &DL = SI->getModule()->getDataLayout();
// Detect cases where we're performing call slot forwarding, but
// happen to be using a load-store pair to implement it, rather than
@@ -525,16 +522,16 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
if (C) {
unsigned storeAlign = SI->getAlignment();
if (!storeAlign)
- storeAlign = DL->getABITypeAlignment(SI->getOperand(0)->getType());
+ storeAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType());
unsigned loadAlign = LI->getAlignment();
if (!loadAlign)
- loadAlign = DL->getABITypeAlignment(LI->getType());
+ loadAlign = DL.getABITypeAlignment(LI->getType());
- bool changed = performCallSlotOptzn(LI,
- SI->getPointerOperand()->stripPointerCasts(),
- LI->getPointerOperand()->stripPointerCasts(),
- DL->getTypeStoreSize(SI->getOperand(0)->getType()),
- std::min(storeAlign, loadAlign), C);
+ bool changed = performCallSlotOptzn(
+ LI, SI->getPointerOperand()->stripPointerCasts(),
+ LI->getPointerOperand()->stripPointerCasts(),
+ DL.getTypeStoreSize(SI->getOperand(0)->getType()),
+ std::min(storeAlign, loadAlign), C);
if (changed) {
MD->removeInstruction(SI);
SI->eraseFromParent();
@@ -606,15 +603,13 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
if (!srcAlloca)
return false;
- // Check that all of src is copied to dest.
- if (!DL) return false;
-
ConstantInt *srcArraySize = dyn_cast<ConstantInt>(srcAlloca->getArraySize());
if (!srcArraySize)
return false;
- uint64_t srcSize = DL->getTypeAllocSize(srcAlloca->getAllocatedType()) *
- srcArraySize->getZExtValue();
+ const DataLayout &DL = cpy->getModule()->getDataLayout();
+ uint64_t srcSize = DL.getTypeAllocSize(srcAlloca->getAllocatedType()) *
+ srcArraySize->getZExtValue();
if (cpyLen < srcSize)
return false;
@@ -628,8 +623,8 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
if (!destArraySize)
return false;
- uint64_t destSize = DL->getTypeAllocSize(A->getAllocatedType()) *
- destArraySize->getZExtValue();
+ uint64_t destSize = DL.getTypeAllocSize(A->getAllocatedType()) *
+ destArraySize->getZExtValue();
if (destSize < srcSize)
return false;
@@ -648,7 +643,7 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
return false;
}
- uint64_t destSize = DL->getTypeAllocSize(StructTy);
+ uint64_t destSize = DL.getTypeAllocSize(StructTy);
if (destSize < srcSize)
return false;
}
@@ -659,7 +654,7 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
// Check that dest points to memory that is at least as aligned as src.
unsigned srcAlign = srcAlloca->getAlignment();
if (!srcAlign)
- srcAlign = DL->getABITypeAlignment(srcAlloca->getAllocatedType());
+ srcAlign = DL.getABITypeAlignment(srcAlloca->getAllocatedType());
bool isDestSufficientlyAligned = srcAlign <= cpyAlign;
// If dest is not aligned enough and we can't increase its alignment then
// bail out.
@@ -959,12 +954,11 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) {
/// processByValArgument - This is called on every byval argument in call sites.
bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) {
- if (!DL) return false;
-
+ const DataLayout &DL = CS.getCaller()->getParent()->getDataLayout();
// Find out what feeds this byval argument.
Value *ByValArg = CS.getArgument(ArgNo);
Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType();
- uint64_t ByValSize = DL->getTypeAllocSize(ByValTy);
+ uint64_t ByValSize = DL.getTypeAllocSize(ByValTy);
MemDepResult DepInfo =
MD->getPointerDependencyFrom(AliasAnalysis::Location(ByValArg, ByValSize),
true, CS.getInstruction(),
@@ -997,8 +991,8 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) {
*CS->getParent()->getParent());
DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
if (MDep->getAlignment() < ByValAlign &&
- getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL, &AC,
- CS.getInstruction(), &DT) < ByValAlign)
+ getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL,
+ CS.getInstruction(), &AC, &DT) < ByValAlign)
return false;
// Verify that the copied-from memory doesn't change in between the memcpy and
@@ -1077,8 +1071,6 @@ bool MemCpyOpt::runOnFunction(Function &F) {
bool MadeChange = false;
MD = &getAnalysis<MemoryDependenceAnalysis>();
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- DL = DLP ? &DLP->getDataLayout() : nullptr;
TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
// If we don't have at least memset and memcpy, there is little point of doing
diff --git a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
index 8fad63f..73f4296 100644
--- a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
+++ b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -81,12 +81,13 @@
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include <vector>
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index 98016b4..307cc73 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -321,10 +321,8 @@ unsigned Reassociate::getRank(Value *V) {
// If this is a not or neg instruction, do not count it for rank. This
// assures us that X and ~X will have the same rank.
- Type *Ty = V->getType();
- if ((!Ty->isIntegerTy() && !Ty->isFloatingPointTy()) ||
- (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I) &&
- !BinaryOperator::isFNeg(I)))
+ if (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I) &&
+ !BinaryOperator::isFNeg(I))
++Rank;
DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " << Rank << "\n");
@@ -351,7 +349,7 @@ void Reassociate::canonicalizeOperands(Instruction *I) {
static BinaryOperator *CreateAdd(Value *S1, Value *S2, const Twine &Name,
Instruction *InsertBefore, Value *FlagsOp) {
- if (S1->getType()->isIntegerTy())
+ if (S1->getType()->isIntOrIntVectorTy())
return BinaryOperator::CreateAdd(S1, S2, Name, InsertBefore);
else {
BinaryOperator *Res =
@@ -363,7 +361,7 @@ static BinaryOperator *CreateAdd(Value *S1, Value *S2, const Twine &Name,
static BinaryOperator *CreateMul(Value *S1, Value *S2, const Twine &Name,
Instruction *InsertBefore, Value *FlagsOp) {
- if (S1->getType()->isIntegerTy())
+ if (S1->getType()->isIntOrIntVectorTy())
return BinaryOperator::CreateMul(S1, S2, Name, InsertBefore);
else {
BinaryOperator *Res =
@@ -375,7 +373,7 @@ static BinaryOperator *CreateMul(Value *S1, Value *S2, const Twine &Name,
static BinaryOperator *CreateNeg(Value *S1, const Twine &Name,
Instruction *InsertBefore, Value *FlagsOp) {
- if (S1->getType()->isIntegerTy())
+ if (S1->getType()->isIntOrIntVectorTy())
return BinaryOperator::CreateNeg(S1, Name, InsertBefore);
else {
BinaryOperator *Res = BinaryOperator::CreateFNeg(S1, Name, InsertBefore);
@@ -388,8 +386,8 @@ static BinaryOperator *CreateNeg(Value *S1, const Twine &Name,
///
static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) {
Type *Ty = Neg->getType();
- Constant *NegOne = Ty->isIntegerTy() ? ConstantInt::getAllOnesValue(Ty)
- : ConstantFP::get(Ty, -1.0);
+ Constant *NegOne = Ty->isIntOrIntVectorTy() ?
+ ConstantInt::getAllOnesValue(Ty) : ConstantFP::get(Ty, -1.0);
BinaryOperator *Res = CreateMul(Neg->getOperand(1), NegOne, "", Neg, Neg);
Neg->setOperand(1, Constant::getNullValue(Ty)); // Drop use of op.
@@ -872,7 +870,7 @@ void Reassociate::RewriteExprTree(BinaryOperator *I,
Constant *Undef = UndefValue::get(I->getType());
NewOp = BinaryOperator::Create(Instruction::BinaryOps(Opcode),
Undef, Undef, "", I);
- if (NewOp->getType()->isFloatingPointTy())
+ if (NewOp->getType()->isFPOrFPVectorTy())
NewOp->setFastMathFlags(I->getFastMathFlags());
} else {
NewOp = NodesToRewrite.pop_back_val();
@@ -1520,8 +1518,8 @@ Value *Reassociate::OptimizeAdd(Instruction *I,
// Insert a new multiply.
Type *Ty = TheOp->getType();
- Constant *C = Ty->isIntegerTy() ? ConstantInt::get(Ty, NumFound)
- : ConstantFP::get(Ty, NumFound);
+ Constant *C = Ty->isIntOrIntVectorTy() ?
+ ConstantInt::get(Ty, NumFound) : ConstantFP::get(Ty, NumFound);
Instruction *Mul = CreateMul(TheOp, C, "factor", I, I);
// Now that we have inserted a multiply, optimize it. This allows us to
@@ -1661,7 +1659,7 @@ Value *Reassociate::OptimizeAdd(Instruction *I,
// from an expression will drop a use of maxocc, and this can cause
// RemoveFactorFromExpression on successive values to behave differently.
Instruction *DummyInst =
- I->getType()->isIntegerTy()
+ I->getType()->isIntOrIntVectorTy()
? BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal)
: BinaryOperator::CreateFAdd(MaxOccVal, MaxOccVal);
@@ -1792,7 +1790,7 @@ static Value *buildMultiplyTree(IRBuilder<> &Builder,
Value *LHS = Ops.pop_back_val();
do {
- if (LHS->getType()->isIntegerTy())
+ if (LHS->getType()->isIntOrIntVectorTy())
LHS = Builder.CreateMul(LHS, Ops.pop_back_val());
else
LHS = Builder.CreateFMul(LHS, Ops.pop_back_val());
@@ -2090,8 +2088,9 @@ void Reassociate::OptimizeInst(Instruction *I) {
if (I->isCommutative())
canonicalizeOperands(I);
- // Don't optimize vector instructions.
- if (I->getType()->isVectorTy())
+ // TODO: We should optimize vector Xor instructions, but they are
+ // currently unsupported.
+ if (I->getType()->isVectorTy() && I->getOpcode() == Instruction::Xor)
return;
// Don't optimize floating point instructions that don't have unsafe algebra.
@@ -2170,9 +2169,6 @@ void Reassociate::OptimizeInst(Instruction *I) {
}
void Reassociate::ReassociateExpression(BinaryOperator *I) {
- assert(!I->getType()->isVectorTy() &&
- "Reassociation of vector instructions is not supported.");
-
// First, walk the expression tree, linearizing the tree, collecting the
// operand information.
SmallVector<RepeatedValue, 8> Tree;
diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index ca9ab54..f5d21ff 100644
--- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -548,9 +548,6 @@ public:
}
PhiState(Value *b) : status(Base), base(b) {}
PhiState() : status(Unknown), base(nullptr) {}
- PhiState(const PhiState &other) : status(other.status), base(other.base) {
- assert(status != Base || base);
- }
Status getStatus() const { return status; }
Value *getBase() const { return base; }
@@ -684,12 +681,19 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache,
states[def] = PhiState();
// Recursively fill in all phis & selects reachable from the initial one
// for which we don't already know a definite base value for
- // PERF: Yes, this is as horribly inefficient as it looks.
+ // TODO: This should be rewritten with a worklist
bool done = false;
while (!done) {
done = true;
+ // Since we're adding elements to 'states' as we run, we can't keep
+ // iterators into the set.
+ SmallVector<Value*, 16> Keys;
+ Keys.reserve(states.size());
for (auto Pair : states) {
- Value *v = Pair.first;
+ Value *V = Pair.first;
+ Keys.push_back(V);
+ }
+ for (Value *v : Keys) {
assert(!isKnownBaseResult(v) && "why did it get added?");
if (PHINode *phi = dyn_cast<PHINode>(v)) {
assert(phi->getNumIncomingValues() > 0 &&
@@ -730,10 +734,12 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache,
// have reached conflict state. The current version seems too conservative.
bool progress = true;
- size_t oldSize = 0;
while (progress) {
- oldSize = states.size();
+#ifndef NDEBUG
+ size_t oldSize = states.size();
+#endif
progress = false;
+ // We're only changing values in this loop, so it is safe to keep iterators.
for (auto Pair : states) {
MeetPhiStates calculateMeet(states);
Value *v = Pair.first;
@@ -768,46 +774,58 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache,
}
// Insert Phis for all conflicts
+ // We want to keep naming deterministic in the loop that follows, so
+ // sort the keys before iteration. This is useful in allowing us to
+ // write stable tests. Note that there is no invalidation issue here.
+ SmallVector<Value*, 16> Keys;
+ Keys.reserve(states.size());
for (auto Pair : states) {
- Instruction *v = cast<Instruction>(Pair.first);
- PhiState state = Pair.second;
+ Value *V = Pair.first;
+ Keys.push_back(V);
+ }
+ std::sort(Keys.begin(), Keys.end(), order_by_name);
+ // TODO: adjust naming patterns to avoid this order of iteration dependency
+ for (Value *V : Keys) {
+ Instruction *v = cast<Instruction>(V);
+ PhiState state = states[V];
assert(!isKnownBaseResult(v) && "why did it get added?");
assert(!state.isUnknown() && "Optimistic algorithm didn't complete!");
- if (state.isConflict()) {
- if (isa<PHINode>(v)) {
- int num_preds =
- std::distance(pred_begin(v->getParent()), pred_end(v->getParent()));
- assert(num_preds > 0 && "how did we reach here");
- PHINode *phi = PHINode::Create(v->getType(), num_preds, "base_phi", v);
- NewInsertedDefs.insert(phi);
- // Add metadata marking this as a base value
- auto *const_1 = ConstantInt::get(
- Type::getInt32Ty(
- v->getParent()->getParent()->getParent()->getContext()),
- 1);
- auto MDConst = ConstantAsMetadata::get(const_1);
- MDNode *md = MDNode::get(
- v->getParent()->getParent()->getParent()->getContext(), MDConst);
- phi->setMetadata("is_base_value", md);
- states[v] = PhiState(PhiState::Conflict, phi);
- } else if (SelectInst *sel = dyn_cast<SelectInst>(v)) {
- // The undef will be replaced later
- UndefValue *undef = UndefValue::get(sel->getType());
- SelectInst *basesel = SelectInst::Create(sel->getCondition(), undef,
- undef, "base_select", sel);
- NewInsertedDefs.insert(basesel);
- // Add metadata marking this as a base value
- auto *const_1 = ConstantInt::get(
- Type::getInt32Ty(
- v->getParent()->getParent()->getParent()->getContext()),
- 1);
- auto MDConst = ConstantAsMetadata::get(const_1);
- MDNode *md = MDNode::get(
- v->getParent()->getParent()->getParent()->getContext(), MDConst);
- basesel->setMetadata("is_base_value", md);
- states[v] = PhiState(PhiState::Conflict, basesel);
- } else
- llvm_unreachable("unknown conflict type");
+ if (!state.isConflict())
+ continue;
+
+ if (isa<PHINode>(v)) {
+ int num_preds =
+ std::distance(pred_begin(v->getParent()), pred_end(v->getParent()));
+ assert(num_preds > 0 && "how did we reach here");
+ PHINode *phi = PHINode::Create(v->getType(), num_preds, "base_phi", v);
+ NewInsertedDefs.insert(phi);
+ // Add metadata marking this as a base value
+ auto *const_1 = ConstantInt::get(
+ Type::getInt32Ty(
+ v->getParent()->getParent()->getParent()->getContext()),
+ 1);
+ auto MDConst = ConstantAsMetadata::get(const_1);
+ MDNode *md = MDNode::get(
+ v->getParent()->getParent()->getParent()->getContext(), MDConst);
+ phi->setMetadata("is_base_value", md);
+ states[v] = PhiState(PhiState::Conflict, phi);
+ } else {
+ SelectInst *sel = cast<SelectInst>(v);
+ // The undef will be replaced later
+ UndefValue *undef = UndefValue::get(sel->getType());
+ SelectInst *basesel = SelectInst::Create(sel->getCondition(), undef,
+ undef, "base_select", sel);
+ NewInsertedDefs.insert(basesel);
+ // Add metadata marking this as a base value
+ auto *const_1 = ConstantInt::get(
+ Type::getInt32Ty(
+ v->getParent()->getParent()->getParent()->getContext()),
+ 1);
+ auto MDConst = ConstantAsMetadata::get(const_1);
+ MDNode *md = MDNode::get(
+ v->getParent()->getParent()->getParent()->getContext(), MDConst);
+ basesel->setMetadata("is_base_value", md);
+ states[v] = PhiState(PhiState::Conflict, basesel);
}
}
@@ -818,97 +836,98 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache,
assert(!isKnownBaseResult(v) && "why did it get added?");
assert(!state.isUnknown() && "Optimistic algorithm didn't complete!");
- if (state.isConflict()) {
- if (PHINode *basephi = dyn_cast<PHINode>(state.getBase())) {
- PHINode *phi = cast<PHINode>(v);
- unsigned NumPHIValues = phi->getNumIncomingValues();
- for (unsigned i = 0; i < NumPHIValues; i++) {
- Value *InVal = phi->getIncomingValue(i);
- BasicBlock *InBB = phi->getIncomingBlock(i);
-
- // If we've already seen InBB, add the same incoming value
- // we added for it earlier. The IR verifier requires phi
- // nodes with multiple entries from the same basic block
- // to have the same incoming value for each of those
- // entries. If we don't do this check here and basephi
- // has a different type than base, we'll end up adding two
- // bitcasts (and hence two distinct values) as incoming
- // values for the same basic block.
-
- int blockIndex = basephi->getBasicBlockIndex(InBB);
- if (blockIndex != -1) {
- Value *oldBase = basephi->getIncomingValue(blockIndex);
- basephi->addIncoming(oldBase, InBB);
+ if (!state.isConflict())
+ continue;
+
+ if (PHINode *basephi = dyn_cast<PHINode>(state.getBase())) {
+ PHINode *phi = cast<PHINode>(v);
+ unsigned NumPHIValues = phi->getNumIncomingValues();
+ for (unsigned i = 0; i < NumPHIValues; i++) {
+ Value *InVal = phi->getIncomingValue(i);
+ BasicBlock *InBB = phi->getIncomingBlock(i);
+
+ // If we've already seen InBB, add the same incoming value
+ // we added for it earlier. The IR verifier requires phi
+ // nodes with multiple entries from the same basic block
+ // to have the same incoming value for each of those
+ // entries. If we don't do this check here and basephi
+ // has a different type than base, we'll end up adding two
+ // bitcasts (and hence two distinct values) as incoming
+ // values for the same basic block.
+
+ int blockIndex = basephi->getBasicBlockIndex(InBB);
+ if (blockIndex != -1) {
+ Value *oldBase = basephi->getIncomingValue(blockIndex);
+ basephi->addIncoming(oldBase, InBB);
#ifndef NDEBUG
- Value *base = findBaseOrBDV(InVal, cache);
- if (!isKnownBaseResult(base)) {
- // Either conflict or base.
- assert(states.count(base));
- base = states[base].getBase();
- assert(base != nullptr && "unknown PhiState!");
- assert(NewInsertedDefs.count(base) &&
- "should have already added this in a prev. iteration!");
- }
-
- // In essense this assert states: the only way two
- // values incoming from the same basic block may be
- // different is by being different bitcasts of the same
- // value. A cleanup that remains TODO is changing
- // findBaseOrBDV to return an llvm::Value of the correct
- // type (and still remain pure). This will remove the
- // need to add bitcasts.
- assert(base->stripPointerCasts() == oldBase->stripPointerCasts() &&
- "sanity -- findBaseOrBDV should be pure!");
-#endif
- continue;
- }
-
- // Find either the defining value for the PHI or the normal base for
- // a non-phi node
Value *base = findBaseOrBDV(InVal, cache);
if (!isKnownBaseResult(base)) {
// Either conflict or base.
assert(states.count(base));
base = states[base].getBase();
assert(base != nullptr && "unknown PhiState!");
+ assert(NewInsertedDefs.count(base) &&
+ "should have already added this in a prev. iteration!");
}
- assert(base && "can't be null");
- // Must use original input BB since base may not be Instruction
- // The cast is needed since base traversal may strip away bitcasts
- if (base->getType() != basephi->getType()) {
- base = new BitCastInst(base, basephi->getType(), "cast",
- InBB->getTerminator());
- NewInsertedDefs.insert(base);
- }
- basephi->addIncoming(base, InBB);
+
+      // In essence this assert states: the only way two
+ // values incoming from the same basic block may be
+ // different is by being different bitcasts of the same
+ // value. A cleanup that remains TODO is changing
+ // findBaseOrBDV to return an llvm::Value of the correct
+ // type (and still remain pure). This will remove the
+ // need to add bitcasts.
+ assert(base->stripPointerCasts() == oldBase->stripPointerCasts() &&
+ "sanity -- findBaseOrBDV should be pure!");
+#endif
+ continue;
}
- assert(basephi->getNumIncomingValues() == NumPHIValues);
- } else if (SelectInst *basesel = dyn_cast<SelectInst>(state.getBase())) {
- SelectInst *sel = cast<SelectInst>(v);
- // Operand 1 & 2 are true, false path respectively. TODO: refactor to
- // something more safe and less hacky.
- for (int i = 1; i <= 2; i++) {
- Value *InVal = sel->getOperand(i);
- // Find either the defining value for the PHI or the normal base for
- // a non-phi node
- Value *base = findBaseOrBDV(InVal, cache);
- if (!isKnownBaseResult(base)) {
- // Either conflict or base.
- assert(states.count(base));
- base = states[base].getBase();
- assert(base != nullptr && "unknown PhiState!");
- }
- assert(base && "can't be null");
- // Must use original input BB since base may not be Instruction
- // The cast is needed since base traversal may strip away bitcasts
- if (base->getType() != basesel->getType()) {
- base = new BitCastInst(base, basesel->getType(), "cast", basesel);
- NewInsertedDefs.insert(base);
- }
- basesel->setOperand(i, base);
+
+ // Find either the defining value for the PHI or the normal base for
+ // a non-phi node
+ Value *base = findBaseOrBDV(InVal, cache);
+ if (!isKnownBaseResult(base)) {
+ // Either conflict or base.
+ assert(states.count(base));
+ base = states[base].getBase();
+ assert(base != nullptr && "unknown PhiState!");
}
- } else
- llvm_unreachable("unexpected conflict type");
+ assert(base && "can't be null");
+ // Must use original input BB since base may not be Instruction
+ // The cast is needed since base traversal may strip away bitcasts
+ if (base->getType() != basephi->getType()) {
+ base = new BitCastInst(base, basephi->getType(), "cast",
+ InBB->getTerminator());
+ NewInsertedDefs.insert(base);
+ }
+ basephi->addIncoming(base, InBB);
+ }
+ assert(basephi->getNumIncomingValues() == NumPHIValues);
+ } else {
+ SelectInst *basesel = cast<SelectInst>(state.getBase());
+ SelectInst *sel = cast<SelectInst>(v);
+      // Operands 1 and 2 are the true and false paths, respectively.
+      // TODO: refactor into something safer and less hacky.
+ for (int i = 1; i <= 2; i++) {
+ Value *InVal = sel->getOperand(i);
+ // Find either the defining value for the PHI or the normal base for
+ // a non-phi node
+ Value *base = findBaseOrBDV(InVal, cache);
+ if (!isKnownBaseResult(base)) {
+ // Either conflict or base.
+ assert(states.count(base));
+ base = states[base].getBase();
+ assert(base != nullptr && "unknown PhiState!");
+ }
+ assert(base && "can't be null");
+ // Must use original input BB since base may not be Instruction
+ // The cast is needed since base traversal may strip away bitcasts
+ if (base->getType() != basesel->getType()) {
+ base = new BitCastInst(base, basesel->getType(), "cast", basesel);
+ NewInsertedDefs.insert(base);
+ }
+ basesel->setOperand(i, base);
+ }
}
}
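Aside: a minimal sketch, not part of the patch, of the verifier rule the comments in the hunk above work around. When a predecessor block reaches a phi through more than one edge, every entry for that block must carry the same incoming value, so the code reuses the value it already recorded rather than materializing a second bitcast. The helper name addIncomingOnce is hypothetical; only the LLVM calls already visible in the hunk are assumed.

#include "llvm/IR/Instructions.h"
using namespace llvm;

static void addIncomingOnce(PHINode *BasePhi, Value *Base, BasicBlock *Pred) {
  int Idx = BasePhi->getBasicBlockIndex(Pred);
  if (Idx != -1) {
    // Pred already has an entry; repeat the exact same incoming value.
    BasePhi->addIncoming(BasePhi->getIncomingValue(Idx), Pred);
    return;
  }
  if (Base->getType() != BasePhi->getType())
    Base = new BitCastInst(Base, BasePhi->getType(), "cast",
                           Pred->getTerminator());
  BasePhi->addIncoming(Base, Pred);
}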
@@ -964,7 +983,13 @@ static void findBasePointers(const StatepointLiveSetTy &live,
DenseMap<llvm::Value *, llvm::Value *> &PointerToBase,
DominatorTree *DT, DefiningValueMapTy &DVCache,
DenseSet<llvm::Value *> &NewInsertedDefs) {
- for (Value *ptr : live) {
+  // So that the names of inserted values are deterministic - which makes
+  // for much cleaner and more stable tests - we need to assign an order to
+  // the live values. DenseSets do not provide a deterministic order across runs.
+ SmallVector<Value*, 64> Temp;
+ Temp.insert(Temp.end(), live.begin(), live.end());
+ std::sort(Temp.begin(), Temp.end(), order_by_name);
+ for (Value *ptr : Temp) {
Value *base = findBasePointer(ptr, DVCache, NewInsertedDefs);
assert(base && "failed to find base pointer");
PointerToBase[ptr] = base;
@@ -993,10 +1018,19 @@ static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache,
findBasePointers(result.liveset, PointerToBase, &DT, DVCache, NewInsertedDefs);
if (PrintBasePointers) {
+ // Note: Need to print these in a stable order since this is checked in
+ // some tests.
errs() << "Base Pairs (w/o Relocation):\n";
+ SmallVector<Value*, 64> Temp;
+ Temp.reserve(PointerToBase.size());
for (auto Pair : PointerToBase) {
- errs() << " derived %" << Pair.first->getName() << " base %"
- << Pair.second->getName() << "\n";
+ Temp.push_back(Pair.first);
+ }
+ std::sort(Temp.begin(), Temp.end(), order_by_name);
+ for (Value *Ptr : Temp) {
+ Value *Base = PointerToBase[Ptr];
+ errs() << " derived %" << Ptr->getName() << " base %"
+ << Base->getName() << "\n";
}
}
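Aside: both hunks above apply the same determinism idiom - copy the hash-ordered container into a vector and sort by name before iterating. A standalone sketch, with std::set standing in for llvm::DenseSet and a comparator that mirrors what order_by_name presumably does (its body is not shown in this diff):

#include <algorithm>
#include <set>
#include <string>
#include <vector>

struct NamedValue { std::string Name; };

static bool orderByName(const NamedValue *A, const NamedValue *B) {
  return A->Name < B->Name;
}

static void visitDeterministically(const std::set<NamedValue *> &Live) {
  std::vector<NamedValue *> Temp(Live.begin(), Live.end());
  std::sort(Temp.begin(), Temp.end(), orderByName);
  // Iteration order, and hence the naming of any values inserted while
  // visiting, is now stable across runs.
}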
@@ -1131,11 +1165,11 @@ static AttributeSet legalizeCallAttributes(AttributeSet AS) {
/// statepointToken - statepoint instruction to which relocates should be
/// bound.
/// Builder - LLVM IR builder to be used to construct new calls.
-void CreateGCRelocates(ArrayRef<llvm::Value *> liveVariables,
- const int liveStart,
- ArrayRef<llvm::Value *> basePtrs,
- Instruction *statepointToken, IRBuilder<> Builder) {
-
+static void CreateGCRelocates(ArrayRef<llvm::Value *> liveVariables,
+ const int liveStart,
+ ArrayRef<llvm::Value *> basePtrs,
+ Instruction *statepointToken,
+ IRBuilder<> Builder) {
SmallVector<Instruction *, 64> NewDefs;
NewDefs.reserve(liveVariables.size());
@@ -1559,8 +1593,18 @@ static void relocationViaAlloca(
// store must be inserted after load, otherwise store will be in alloca's
// use list and an extra load will be inserted before it
StoreInst *store = new StoreInst(def, alloca);
- if (isa<Instruction>(def)) {
- store->insertAfter(cast<Instruction>(def));
+ if (Instruction *inst = dyn_cast<Instruction>(def)) {
+ if (InvokeInst *invoke = dyn_cast<InvokeInst>(inst)) {
+        // InvokeInst is a TerminatorInst, so the store needs to be inserted
+        // into its normal destination block.
+ BasicBlock *normalDest = invoke->getNormalDest();
+ store->insertBefore(normalDest->getFirstNonPHI());
+ } else {
+ assert(!inst->isTerminator() &&
+ "The only TerminatorInst that can produce a value is "
+ "InvokeInst which is handled above.");
+ store->insertAfter(inst);
+ }
} else {
assert((isa<Argument>(def) || isa<GlobalVariable>(def) ||
(isa<Constant>(def) && cast<Constant>(def)->isNullValue())) &&
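Aside: the placement rule from the hunk above, in isolation, as a hedged sketch. An invoke is a terminator, so its result only becomes available at the top of the normal destination; any other instruction can simply be followed directly by the store. The helper name insertStoreAfterDef is hypothetical.

#include "llvm/IR/Instructions.h"
using namespace llvm;

static void insertStoreAfterDef(StoreInst *Store, Instruction *Def) {
  if (InvokeInst *Invoke = dyn_cast<InvokeInst>(Def))
    Store->insertBefore(Invoke->getNormalDest()->getFirstNonPHI());
  else
    Store->insertAfter(Def); // valid only because Def is not a terminator
}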
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index 05b9608..875a007 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -25,6 +25,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -35,7 +36,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
@@ -154,7 +154,7 @@ namespace {
/// Constant Propagation.
///
class SCCPSolver : public InstVisitor<SCCPSolver> {
- const DataLayout *DL;
+ const DataLayout &DL;
const TargetLibraryInfo *TLI;
SmallPtrSet<BasicBlock*, 8> BBExecutable; // The BBs that are executable.
DenseMap<Value*, LatticeVal> ValueState; // The state each value is in.
@@ -206,8 +206,8 @@ class SCCPSolver : public InstVisitor<SCCPSolver> {
typedef std::pair<BasicBlock*, BasicBlock*> Edge;
DenseSet<Edge> KnownFeasibleEdges;
public:
- SCCPSolver(const DataLayout *DL, const TargetLibraryInfo *tli)
- : DL(DL), TLI(tli) {}
+ SCCPSolver(const DataLayout &DL, const TargetLibraryInfo *tli)
+ : DL(DL), TLI(tli) {}
/// MarkBlockExecutable - This method can be used by clients to mark all of
/// the blocks that are known to be intrinsically live in the processed unit.
@@ -1561,8 +1561,7 @@ bool SCCP::runOnFunction(Function &F) {
return false;
DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
- const DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
+ const DataLayout &DL = F.getParent()->getDataLayout();
const TargetLibraryInfo *TLI =
&getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
SCCPSolver Solver(DL, TLI);
@@ -1691,8 +1690,7 @@ static bool AddressIsTaken(const GlobalValue *GV) {
}
bool IPSCCP::runOnModule(Module &M) {
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
+ const DataLayout &DL = M.getDataLayout();
const TargetLibraryInfo *TLI =
&getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
SCCPSolver Solver(DL, TLI);
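Aside: the SCCP change is one instance of a pattern repeated throughout this commit (SROA, ScalarReplAggregates, and Scalarizer below): these hunks assume the DataLayout is now an unconditional property of the Module, so passes take a reference instead of probing for an optional DataLayoutPass. A minimal sketch using only the Module API the hunks themselves rely on:

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
using namespace llvm;

static unsigned pointerSizeInBits(Function &F) {
  const DataLayout &DL = F.getParent()->getDataLayout();
  return DL.getPointerSizeInBits(); // no nullptr check needed any more
}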
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index f69c750..06b000f 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -247,7 +247,7 @@ public:
/// hold.
void insert(ArrayRef<Slice> NewSlices) {
int OldSize = Slices.size();
- std::move(NewSlices.begin(), NewSlices.end(), std::back_inserter(Slices));
+ Slices.append(NewSlices.begin(), NewSlices.end());
auto SliceI = Slices.begin() + OldSize;
std::sort(SliceI, Slices.end());
std::inplace_merge(Slices.begin(), SliceI, Slices.end());
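Aside: the replaced line keeps the vector sorted without re-sorting the whole thing. A standalone sketch of the append/sort-tail/inplace_merge idiom, using int as a stand-in for Slice:

#include <algorithm>
#include <cstddef>
#include <vector>

static void sortedInsert(std::vector<int> &Slices,
                         const std::vector<int> &NewSlices) {
  std::size_t OldSize = Slices.size();
  Slices.insert(Slices.end(), NewSlices.begin(), NewSlices.end());
  auto SliceI = Slices.begin() + OldSize;
  std::sort(SliceI, Slices.end());            // sort only the new tail
  std::inplace_merge(Slices.begin(), SliceI, Slices.end());
}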
@@ -701,6 +701,7 @@ private:
// by writing out the code here where we have the underlying allocation
// size readily available.
APInt GEPOffset = Offset;
+ const DataLayout &DL = GEPI.getModule()->getDataLayout();
for (gep_type_iterator GTI = gep_type_begin(GEPI),
GTE = gep_type_end(GEPI);
GTI != GTE; ++GTI) {
@@ -750,6 +751,7 @@ private:
if (!IsOffsetKnown)
return PI.setAborted(&LI);
+ const DataLayout &DL = LI.getModule()->getDataLayout();
uint64_t Size = DL.getTypeStoreSize(LI.getType());
return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile());
}
@@ -761,6 +763,7 @@ private:
if (!IsOffsetKnown)
return PI.setAborted(&SI);
+ const DataLayout &DL = SI.getModule()->getDataLayout();
uint64_t Size = DL.getTypeStoreSize(ValOp->getType());
// If this memory access can be shown to *statically* extend outside the
@@ -898,6 +901,7 @@ private:
SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
Visited.insert(Root);
Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
+ const DataLayout &DL = Root->getModule()->getDataLayout();
// If there are no loads or stores, the access is dead. We mark that as
// a size zero access.
Size = 0;
@@ -1194,7 +1198,6 @@ class SROA : public FunctionPass {
const bool RequiresDomTree;
LLVMContext *C;
- const DataLayout *DL;
DominatorTree *DT;
AssumptionCache *AC;
@@ -1243,7 +1246,7 @@ class SROA : public FunctionPass {
public:
SROA(bool RequiresDomTree = true)
: FunctionPass(ID), RequiresDomTree(RequiresDomTree), C(nullptr),
- DL(nullptr), DT(nullptr) {
+ DT(nullptr) {
initializeSROAPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override;
@@ -1349,7 +1352,7 @@ static Type *findCommonType(AllocaSlices::const_iterator B,
///
/// FIXME: This should be hoisted into a generic utility, likely in
/// Transforms/Util/Local.h
-static bool isSafePHIToSpeculate(PHINode &PN, const DataLayout *DL = nullptr) {
+static bool isSafePHIToSpeculate(PHINode &PN) {
// For now, we can only do this promotion if the load is in the same block
// as the PHI, and if there are no stores between the phi and load.
// TODO: Allow recursive phi users.
@@ -1381,6 +1384,8 @@ static bool isSafePHIToSpeculate(PHINode &PN, const DataLayout *DL = nullptr) {
if (!HaveLoad)
return false;
+ const DataLayout &DL = PN.getModule()->getDataLayout();
+
// We can only transform this if it is safe to push the loads into the
// predecessor blocks. The only thing to watch out for is that we can't put
// a possibly trapping load in the predecessor if it is a critical edge.
@@ -1403,7 +1408,7 @@ static bool isSafePHIToSpeculate(PHINode &PN, const DataLayout *DL = nullptr) {
// is already a load in the block, then we can move the load to the pred
// block.
if (InVal->isDereferenceablePointer(DL) ||
- isSafeToLoadUnconditionally(InVal, TI, MaxAlign, DL))
+ isSafeToLoadUnconditionally(InVal, TI, MaxAlign))
continue;
return false;
@@ -1468,10 +1473,10 @@ static void speculatePHINodeLoads(PHINode &PN) {
///
/// We can do this to a select if its only uses are loads and if the operand
/// to the select can be loaded unconditionally.
-static bool isSafeSelectToSpeculate(SelectInst &SI,
- const DataLayout *DL = nullptr) {
+static bool isSafeSelectToSpeculate(SelectInst &SI) {
Value *TValue = SI.getTrueValue();
Value *FValue = SI.getFalseValue();
+ const DataLayout &DL = SI.getModule()->getDataLayout();
bool TDerefable = TValue->isDereferenceablePointer(DL);
bool FDerefable = FValue->isDereferenceablePointer(DL);
@@ -1484,10 +1489,10 @@ static bool isSafeSelectToSpeculate(SelectInst &SI,
// absolutely (e.g. allocas) or at this point because we can see other
// accesses to it.
if (!TDerefable &&
- !isSafeToLoadUnconditionally(TValue, LI, LI->getAlignment(), DL))
+ !isSafeToLoadUnconditionally(TValue, LI, LI->getAlignment()))
return false;
if (!FDerefable &&
- !isSafeToLoadUnconditionally(FValue, LI, LI->getAlignment(), DL))
+ !isSafeToLoadUnconditionally(FValue, LI, LI->getAlignment()))
return false;
}
@@ -3699,6 +3704,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
// them to the alloca slices.
SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
std::vector<LoadInst *> SplitLoads;
+ const DataLayout &DL = AI.getModule()->getDataLayout();
for (LoadInst *LI : Loads) {
SplitLoads.clear();
@@ -3724,10 +3730,10 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
auto *PartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
LoadInst *PLoad = IRB.CreateAlignedLoad(
- getAdjustedPtr(IRB, *DL, BasePtr,
- APInt(DL->getPointerSizeInBits(), PartOffset),
+ getAdjustedPtr(IRB, DL, BasePtr,
+ APInt(DL.getPointerSizeInBits(), PartOffset),
PartPtrTy, BasePtr->getName() + "."),
- getAdjustedAlignment(LI, PartOffset, *DL), /*IsVolatile*/ false,
+ getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
LI->getName());
// Append this load onto the list of split loads so we can find it later
@@ -3777,10 +3783,10 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
PLoad->getType()->getPointerTo(SI->getPointerAddressSpace());
StoreInst *PStore = IRB.CreateAlignedStore(
- PLoad, getAdjustedPtr(IRB, *DL, StoreBasePtr,
- APInt(DL->getPointerSizeInBits(), PartOffset),
+ PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr,
+ APInt(DL.getPointerSizeInBits(), PartOffset),
PartPtrTy, StoreBasePtr->getName() + "."),
- getAdjustedAlignment(SI, PartOffset, *DL), /*IsVolatile*/ false);
+ getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
(void)PStore;
DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
}
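Aside: getAdjustedAlignment's body is not shown in this diff; the sketch below assumes it follows the usual MinAlign rule (MinAlign itself appears later in this file). A part at a BaseAlign-aligned base plus Offset can only rely on the largest power of two dividing both quantities:

#include <cstdint>

static uint64_t minAlign(uint64_t A, uint64_t B) {
  uint64_t C = A | B;
  return C & (~C + 1); // lowest set bit, i.e. largest common power of two
}
// e.g. minAlign(16, 4) == 4: a 16-byte-aligned base offset by 4 bytes
// leaves only 4-byte alignment for the split part.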
@@ -3857,20 +3863,20 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
} else {
IRB.SetInsertPoint(BasicBlock::iterator(LI));
PLoad = IRB.CreateAlignedLoad(
- getAdjustedPtr(IRB, *DL, LoadBasePtr,
- APInt(DL->getPointerSizeInBits(), PartOffset),
+ getAdjustedPtr(IRB, DL, LoadBasePtr,
+ APInt(DL.getPointerSizeInBits(), PartOffset),
PartPtrTy, LoadBasePtr->getName() + "."),
- getAdjustedAlignment(LI, PartOffset, *DL), /*IsVolatile*/ false,
+ getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
LI->getName());
}
// And store this partition.
IRB.SetInsertPoint(BasicBlock::iterator(SI));
StoreInst *PStore = IRB.CreateAlignedStore(
- PLoad, getAdjustedPtr(IRB, *DL, StoreBasePtr,
- APInt(DL->getPointerSizeInBits(), PartOffset),
+ PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr,
+ APInt(DL.getPointerSizeInBits(), PartOffset),
PartPtrTy, StoreBasePtr->getName() + "."),
- getAdjustedAlignment(SI, PartOffset, *DL), /*IsVolatile*/ false);
+ getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
// Now build a new slice for the alloca.
NewSlices.push_back(
@@ -3970,25 +3976,26 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
// won't always succeed, in which case we fall back to a legal integer type
// or an i8 array of an appropriate size.
Type *SliceTy = nullptr;
+ const DataLayout &DL = AI.getModule()->getDataLayout();
if (Type *CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset()))
- if (DL->getTypeAllocSize(CommonUseTy) >= P.size())
+ if (DL.getTypeAllocSize(CommonUseTy) >= P.size())
SliceTy = CommonUseTy;
if (!SliceTy)
- if (Type *TypePartitionTy = getTypePartition(*DL, AI.getAllocatedType(),
+ if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
P.beginOffset(), P.size()))
SliceTy = TypePartitionTy;
if ((!SliceTy || (SliceTy->isArrayTy() &&
SliceTy->getArrayElementType()->isIntegerTy())) &&
- DL->isLegalInteger(P.size() * 8))
+ DL.isLegalInteger(P.size() * 8))
SliceTy = Type::getIntNTy(*C, P.size() * 8);
if (!SliceTy)
SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
- assert(DL->getTypeAllocSize(SliceTy) >= P.size());
+ assert(DL.getTypeAllocSize(SliceTy) >= P.size());
- bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, *DL);
+ bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
VectorType *VecTy =
- IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, *DL);
+ IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL);
if (VecTy)
SliceTy = VecTy;
@@ -4010,12 +4017,12 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
// The minimum alignment which users can rely on when the explicit
// alignment is omitted or zero is that required by the ABI for this
// type.
- Alignment = DL->getABITypeAlignment(AI.getAllocatedType());
+ Alignment = DL.getABITypeAlignment(AI.getAllocatedType());
}
Alignment = MinAlign(Alignment, P.beginOffset());
// If we will get at least this much alignment from the type alone, leave
// the alloca's alignment unconstrained.
- if (Alignment <= DL->getABITypeAlignment(SliceTy))
+ if (Alignment <= DL.getABITypeAlignment(SliceTy))
Alignment = 0;
NewAI = new AllocaInst(
SliceTy, nullptr, Alignment,
@@ -4035,7 +4042,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
SmallPtrSet<PHINode *, 8> PHIUsers;
SmallPtrSet<SelectInst *, 8> SelectUsers;
- AllocaSliceRewriter Rewriter(*DL, AS, *this, AI, *NewAI, P.beginOffset(),
+ AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
P.endOffset(), IsIntegerPromotable, VecTy,
PHIUsers, SelectUsers);
bool Promotable = true;
@@ -4057,7 +4064,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
for (SmallPtrSetImpl<PHINode *>::iterator I = PHIUsers.begin(),
E = PHIUsers.end();
I != E; ++I)
- if (!isSafePHIToSpeculate(**I, DL)) {
+ if (!isSafePHIToSpeculate(**I)) {
Promotable = false;
PHIUsers.clear();
SelectUsers.clear();
@@ -4066,7 +4073,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
for (SmallPtrSetImpl<SelectInst *>::iterator I = SelectUsers.begin(),
E = SelectUsers.end();
I != E; ++I)
- if (!isSafeSelectToSpeculate(**I, DL)) {
+ if (!isSafeSelectToSpeculate(**I)) {
Promotable = false;
PHIUsers.clear();
SelectUsers.clear();
@@ -4110,6 +4117,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
unsigned NumPartitions = 0;
bool Changed = false;
+ const DataLayout &DL = AI.getModule()->getDataLayout();
// First try to pre-split loads and stores.
Changed |= presplitLoadsAndStores(AI, AS);
@@ -4127,7 +4135,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
// confident that the above handling of splittable loads and stores is
// completely sufficient before we forcibly disable the remaining handling.
if (S.beginOffset() == 0 &&
- S.endOffset() >= DL->getTypeAllocSize(AI.getAllocatedType()))
+ S.endOffset() >= DL.getTypeAllocSize(AI.getAllocatedType()))
continue;
if (isa<LoadInst>(S.getUse()->getUser()) ||
isa<StoreInst>(S.getUse()->getUser())) {
@@ -4155,7 +4163,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
Changed = true;
if (NewAI != &AI) {
uint64_t SizeOfByte = 8;
- uint64_t AllocaSize = DL->getTypeSizeInBits(NewAI->getAllocatedType());
+ uint64_t AllocaSize = DL.getTypeSizeInBits(NewAI->getAllocatedType());
// Don't include any padding.
uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
Pieces.push_back(Piece(NewAI, P.beginOffset() * SizeOfByte, Size));
@@ -4236,21 +4244,22 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
AI.eraseFromParent();
return true;
}
+ const DataLayout &DL = AI.getModule()->getDataLayout();
// Skip alloca forms that this analysis can't handle.
if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() ||
- DL->getTypeAllocSize(AI.getAllocatedType()) == 0)
+ DL.getTypeAllocSize(AI.getAllocatedType()) == 0)
return false;
bool Changed = false;
// First, split any FCA loads and stores touching this alloca to promote
// better splitting and promotion opportunities.
- AggLoadStoreRewriter AggRewriter(*DL);
+ AggLoadStoreRewriter AggRewriter(DL);
Changed |= AggRewriter.rewrite(AI);
// Build the slices using a recursive instruction-visiting builder.
- AllocaSlices AS(*DL, AI);
+ AllocaSlices AS(DL, AI);
DEBUG(AS.print(dbgs()));
if (AS.isEscaped())
return Changed;
@@ -4423,12 +4432,6 @@ bool SROA::runOnFunction(Function &F) {
DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
C = &F.getContext();
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- if (!DLP) {
- DEBUG(dbgs() << " Skipping SROA -- no target data!\n");
- return false;
- }
- DL = &DLP->getDataLayout();
DominatorTreeWrapperPass *DTWP =
getAnalysisIfAvailable<DominatorTreeWrapperPass>();
DT = DTWP ? &DTWP->getDomTree() : nullptr;
diff --git a/lib/Transforms/Scalar/SampleProfile.cpp b/lib/Transforms/Scalar/SampleProfile.cpp
index c7232a9..3e7cf04 100644
--- a/lib/Transforms/Scalar/SampleProfile.cpp
+++ b/lib/Transforms/Scalar/SampleProfile.cpp
@@ -217,6 +217,9 @@ void SampleProfileLoader::printBlockWeight(raw_ostream &OS, BasicBlock *BB) {
/// \returns The profiled weight of I.
unsigned SampleProfileLoader::getInstWeight(Instruction &Inst) {
DebugLoc DLoc = Inst.getDebugLoc();
+ if (DLoc.isUnknown())
+ return 0;
+
unsigned Lineno = DLoc.getLine();
if (Lineno < HeaderLineno)
return 0;
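Aside: the guard in isolation. Instructions without debug info now yield a zero weight instead of reading a line number out of an unknown DebugLoc. A hedged sketch; the final subtraction is a placeholder for the real sample-table lookup:

#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

static unsigned instWeightSketch(Instruction &Inst, unsigned HeaderLineno) {
  DebugLoc DLoc = Inst.getDebugLoc();
  if (DLoc.isUnknown())
    return 0;
  unsigned Lineno = DLoc.getLine();
  if (Lineno < HeaderLineno)
    return 0;
  return Lineno - HeaderLineno; // placeholder for the profile lookup
}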
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index 621633b..6cc8411 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -48,6 +48,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeLoopDeletionPass(Registry);
initializeLoopAccessAnalysisPass(Registry);
initializeLoopInstSimplifyPass(Registry);
+ initializeLoopInterchangePass(Registry);
initializeLoopRotatePass(Registry);
initializeLoopStrengthReducePass(Registry);
initializeLoopRerollPass(Registry);
@@ -209,7 +210,6 @@ void LLVMAddDemoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
void LLVMAddVerifierPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createVerifierPass());
- // FIXME: should this also add createDebugInfoVerifierPass()?
}
void LLVMAddCorrelatedValuePropagationPass(LLVMPassManagerRef PM) {
diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
index 5c49a55..acd8585 100644
--- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp
+++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -89,7 +89,6 @@ namespace {
private:
bool HasDomTree;
- const DataLayout *DL;
/// DeadInsts - Keep track of instructions we have made dead, so that
/// we can remove them after we are done working.
@@ -159,9 +158,10 @@ namespace {
void isSafeMemAccess(uint64_t Offset, uint64_t MemSize,
Type *MemOpType, bool isStore, AllocaInfo &Info,
Instruction *TheAccess, bool AllowWholeAccess);
- bool TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size);
- uint64_t FindElementAndOffset(Type *&T, uint64_t &Offset,
- Type *&IdxTy);
+ bool TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size,
+ const DataLayout &DL);
+ uint64_t FindElementAndOffset(Type *&T, uint64_t &Offset, Type *&IdxTy,
+ const DataLayout &DL);
void DoScalarReplacement(AllocaInst *AI,
std::vector<AllocaInst*> &WorkList);
@@ -699,9 +699,9 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
// If the source and destination are both to the same alloca, then this is
// a noop copy-to-self, just delete it. Otherwise, emit a load and store
// as appropriate.
- AllocaInst *OrigAI = cast<AllocaInst>(GetUnderlyingObject(Ptr, &DL, 0));
+ AllocaInst *OrigAI = cast<AllocaInst>(GetUnderlyingObject(Ptr, DL, 0));
- if (GetUnderlyingObject(MTI->getSource(), &DL, 0) != OrigAI) {
+ if (GetUnderlyingObject(MTI->getSource(), DL, 0) != OrigAI) {
// Dest must be OrigAI, change this to be a load from the original
// pointer (bitcasted), then a store to our new alloca.
assert(MTI->getRawDest() == Ptr && "Neither use is of pointer?");
@@ -717,7 +717,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
LoadInst *SrcVal = Builder.CreateLoad(SrcPtr, "srcval");
SrcVal->setAlignment(MTI->getAlignment());
Builder.CreateStore(SrcVal, NewAI);
- } else if (GetUnderlyingObject(MTI->getDest(), &DL, 0) != OrigAI) {
+ } else if (GetUnderlyingObject(MTI->getDest(), DL, 0) != OrigAI) {
// Src must be OrigAI, change this to be a load from NewAI then a store
// through the original dest pointer (bitcasted).
assert(MTI->getRawSource() == Ptr && "Neither use is of pointer?");
@@ -1032,17 +1032,8 @@ bool SROA::runOnFunction(Function &F) {
if (skipOptnoneFunction(F))
return false;
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- DL = DLP ? &DLP->getDataLayout() : nullptr;
-
bool Changed = performPromotion(F);
- // FIXME: ScalarRepl currently depends on DataLayout more than it
- // theoretically needs to. It should be refactored in order to support
- // target-independent IR. Until this is done, just skip the actual
- // scalar-replacement portion of this pass.
- if (!DL) return Changed;
-
while (1) {
bool LocalChange = performScalarRepl(F);
if (!LocalChange) break; // No need to repromote if no scalarrepl
@@ -1148,7 +1139,8 @@ public:
///
/// We can do this to a select if its only uses are loads and if the operand to
/// the select can be loaded unconditionally.
-static bool isSafeSelectToSpeculate(SelectInst *SI, const DataLayout *DL) {
+static bool isSafeSelectToSpeculate(SelectInst *SI) {
+ const DataLayout &DL = SI->getModule()->getDataLayout();
bool TDerefable = SI->getTrueValue()->isDereferenceablePointer(DL);
bool FDerefable = SI->getFalseValue()->isDereferenceablePointer(DL);
@@ -1158,11 +1150,13 @@ static bool isSafeSelectToSpeculate(SelectInst *SI, const DataLayout *DL) {
// Both operands to the select need to be dereferenceable, either absolutely
// (e.g. allocas) or at this point because we can see other accesses to it.
- if (!TDerefable && !isSafeToLoadUnconditionally(SI->getTrueValue(), LI,
- LI->getAlignment(), DL))
+ if (!TDerefable &&
+ !isSafeToLoadUnconditionally(SI->getTrueValue(), LI,
+ LI->getAlignment()))
return false;
- if (!FDerefable && !isSafeToLoadUnconditionally(SI->getFalseValue(), LI,
- LI->getAlignment(), DL))
+ if (!FDerefable &&
+ !isSafeToLoadUnconditionally(SI->getFalseValue(), LI,
+ LI->getAlignment()))
return false;
}
@@ -1185,7 +1179,7 @@ static bool isSafeSelectToSpeculate(SelectInst *SI, const DataLayout *DL) {
///
/// We can do this to a PHI if its only uses are loads and if the operands to
/// the PHI can be loaded unconditionally.
-static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *DL) {
+static bool isSafePHIToSpeculate(PHINode *PN) {
// For now, we can only do this promotion if the load is in the same block as
// the PHI, and if there are no stores between the phi and load.
// TODO: Allow recursive phi users.
@@ -1209,6 +1203,8 @@ static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *DL) {
MaxAlign = std::max(MaxAlign, LI->getAlignment());
}
+ const DataLayout &DL = PN->getModule()->getDataLayout();
+
// Okay, we know that we have one or more loads in the same block as the PHI.
// We can transform this if it is safe to push the loads into the predecessor
// blocks. The only thing to watch out for is that we can't put a possibly
@@ -1234,7 +1230,7 @@ static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *DL) {
// If this pointer is always safe to load, or if we can prove that there is
// already a load in the block, then we can move the load to the pred block.
if (InVal->isDereferenceablePointer(DL) ||
- isSafeToLoadUnconditionally(InVal, Pred->getTerminator(), MaxAlign, DL))
+ isSafeToLoadUnconditionally(InVal, Pred->getTerminator(), MaxAlign))
continue;
return false;
@@ -1248,7 +1244,7 @@ static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *DL) {
/// direct (non-volatile) loads and stores to it. If the alloca is close but
/// not quite there, this will transform the code to allow promotion. As such,
/// it is a non-pure predicate.
-static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *DL) {
+static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout &DL) {
SetVector<Instruction*, SmallVector<Instruction*, 4>,
SmallPtrSet<Instruction*, 4> > InstsToRewrite;
for (User *U : AI->users()) {
@@ -1279,7 +1275,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *DL) {
// If it is safe to turn "load (select c, AI, ptr)" into a select of two
// loads, then we can transform this by rewriting the select.
- if (!isSafeSelectToSpeculate(SI, DL))
+ if (!isSafeSelectToSpeculate(SI))
return false;
InstsToRewrite.insert(SI);
@@ -1294,7 +1290,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *DL) {
// If it is safe to turn "load (phi [AI, ptr, ...])" into a PHI of loads
// in the pred blocks, then we can transform this by rewriting the PHI.
- if (!isSafePHIToSpeculate(PN, DL))
+ if (!isSafePHIToSpeculate(PN))
return false;
InstsToRewrite.insert(PN);
@@ -1416,6 +1412,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *DL) {
bool SROA::performPromotion(Function &F) {
std::vector<AllocaInst*> Allocas;
+ const DataLayout &DL = F.getParent()->getDataLayout();
DominatorTree *DT = nullptr;
if (HasDomTree)
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
@@ -1479,6 +1476,7 @@ bool SROA::ShouldAttemptScalarRepl(AllocaInst *AI) {
//
bool SROA::performScalarRepl(Function &F) {
std::vector<AllocaInst*> WorkList;
+ const DataLayout &DL = F.getParent()->getDataLayout();
// Scan the entry basic block, adding allocas to the worklist.
BasicBlock &BB = F.getEntryBlock();
@@ -1508,7 +1506,7 @@ bool SROA::performScalarRepl(Function &F) {
// transform the allocation instruction if it is an array allocation
// (allocations OF arrays are ok though), and an allocation of a scalar
// value cannot be decomposed at all.
- uint64_t AllocaSize = DL->getTypeAllocSize(AI->getAllocatedType());
+ uint64_t AllocaSize = DL.getTypeAllocSize(AI->getAllocatedType());
// Do not promote [0 x %struct].
if (AllocaSize == 0) continue;
@@ -1531,8 +1529,9 @@ bool SROA::performScalarRepl(Function &F) {
// promoted itself. If so, we don't want to transform it needlessly. Note
// that we can't just check based on the type: the alloca may be of an i32
// but that has pointer arithmetic to set byte 3 of it or something.
- if (AllocaInst *NewAI = ConvertToScalarInfo(
- (unsigned)AllocaSize, *DL, ScalarLoadThreshold).TryConvert(AI)) {
+ if (AllocaInst *NewAI =
+ ConvertToScalarInfo((unsigned)AllocaSize, DL, ScalarLoadThreshold)
+ .TryConvert(AI)) {
NewAI->takeName(AI);
AI->eraseFromParent();
++NumConverted;
@@ -1610,6 +1609,7 @@ void SROA::DeleteDeadInstructions() {
/// referenced by this instruction.
void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset,
AllocaInfo &Info) {
+ const DataLayout &DL = I->getModule()->getDataLayout();
for (Use &U : I->uses()) {
Instruction *User = cast<Instruction>(U.getUser());
@@ -1632,8 +1632,8 @@ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset,
if (!LI->isSimple())
return MarkUnsafe(Info, User);
Type *LIType = LI->getType();
- isSafeMemAccess(Offset, DL->getTypeAllocSize(LIType),
- LIType, false, Info, LI, true /*AllowWholeAccess*/);
+ isSafeMemAccess(Offset, DL.getTypeAllocSize(LIType), LIType, false, Info,
+ LI, true /*AllowWholeAccess*/);
Info.hasALoadOrStore = true;
} else if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
@@ -1642,8 +1642,8 @@ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset,
return MarkUnsafe(Info, User);
Type *SIType = SI->getOperand(0)->getType();
- isSafeMemAccess(Offset, DL->getTypeAllocSize(SIType),
- SIType, true, Info, SI, true /*AllowWholeAccess*/);
+ isSafeMemAccess(Offset, DL.getTypeAllocSize(SIType), SIType, true, Info,
+ SI, true /*AllowWholeAccess*/);
Info.hasALoadOrStore = true;
} else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) {
if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
@@ -1675,6 +1675,7 @@ void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset,
if (!Info.CheckedPHIs.insert(PN).second)
return;
+ const DataLayout &DL = I->getModule()->getDataLayout();
for (User *U : I->users()) {
Instruction *UI = cast<Instruction>(U);
@@ -1691,8 +1692,8 @@ void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset,
if (!LI->isSimple())
return MarkUnsafe(Info, UI);
Type *LIType = LI->getType();
- isSafeMemAccess(Offset, DL->getTypeAllocSize(LIType),
- LIType, false, Info, LI, false /*AllowWholeAccess*/);
+ isSafeMemAccess(Offset, DL.getTypeAllocSize(LIType), LIType, false, Info,
+ LI, false /*AllowWholeAccess*/);
Info.hasALoadOrStore = true;
} else if (StoreInst *SI = dyn_cast<StoreInst>(UI)) {
@@ -1701,8 +1702,8 @@ void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset,
return MarkUnsafe(Info, UI);
Type *SIType = SI->getOperand(0)->getType();
- isSafeMemAccess(Offset, DL->getTypeAllocSize(SIType),
- SIType, true, Info, SI, false /*AllowWholeAccess*/);
+ isSafeMemAccess(Offset, DL.getTypeAllocSize(SIType), SIType, true, Info,
+ SI, false /*AllowWholeAccess*/);
Info.hasALoadOrStore = true;
} else if (isa<PHINode>(UI) || isa<SelectInst>(UI)) {
isSafePHISelectUseForScalarRepl(UI, Offset, Info);
@@ -1746,9 +1747,11 @@ void SROA::isSafeGEP(GetElementPtrInst *GEPI,
// constant part of the offset.
if (NonConstant)
Indices.pop_back();
- Offset += DL->getIndexedOffset(GEPI->getPointerOperandType(), Indices);
- if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset,
- NonConstantIdxSize))
+
+ const DataLayout &DL = GEPI->getModule()->getDataLayout();
+ Offset += DL.getIndexedOffset(GEPI->getPointerOperandType(), Indices);
+ if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset, NonConstantIdxSize,
+ DL))
MarkUnsafe(Info, GEPI);
}
@@ -1803,9 +1806,10 @@ void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize,
Type *MemOpType, bool isStore,
AllocaInfo &Info, Instruction *TheAccess,
bool AllowWholeAccess) {
+ const DataLayout &DL = TheAccess->getModule()->getDataLayout();
// Check if this is a load/store of the entire alloca.
if (Offset == 0 && AllowWholeAccess &&
- MemSize == DL->getTypeAllocSize(Info.AI->getAllocatedType())) {
+ MemSize == DL.getTypeAllocSize(Info.AI->getAllocatedType())) {
// This can be safe for MemIntrinsics (where MemOpType is 0) and integer
// loads/stores (which are essentially the same as the MemIntrinsics with
// regard to copying padding between elements). But, if an alloca is
@@ -1828,7 +1832,7 @@ void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize,
}
// Check if the offset/size correspond to a component within the alloca type.
Type *T = Info.AI->getAllocatedType();
- if (TypeHasComponent(T, Offset, MemSize)) {
+ if (TypeHasComponent(T, Offset, MemSize, DL)) {
Info.hasSubelementAccess = true;
return;
}
@@ -1838,24 +1842,25 @@ void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize,
/// TypeHasComponent - Return true if T has a component type with the
/// specified offset and size. If Size is zero, do not check the size.
-bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size) {
+bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size,
+ const DataLayout &DL) {
Type *EltTy;
uint64_t EltSize;
if (StructType *ST = dyn_cast<StructType>(T)) {
- const StructLayout *Layout = DL->getStructLayout(ST);
+ const StructLayout *Layout = DL.getStructLayout(ST);
unsigned EltIdx = Layout->getElementContainingOffset(Offset);
EltTy = ST->getContainedType(EltIdx);
- EltSize = DL->getTypeAllocSize(EltTy);
+ EltSize = DL.getTypeAllocSize(EltTy);
Offset -= Layout->getElementOffset(EltIdx);
} else if (ArrayType *AT = dyn_cast<ArrayType>(T)) {
EltTy = AT->getElementType();
- EltSize = DL->getTypeAllocSize(EltTy);
+ EltSize = DL.getTypeAllocSize(EltTy);
if (Offset >= AT->getNumElements() * EltSize)
return false;
Offset %= EltSize;
} else if (VectorType *VT = dyn_cast<VectorType>(T)) {
EltTy = VT->getElementType();
- EltSize = DL->getTypeAllocSize(EltTy);
+ EltSize = DL.getTypeAllocSize(EltTy);
if (Offset >= VT->getNumElements() * EltSize)
return false;
Offset %= EltSize;
@@ -1867,7 +1872,7 @@ bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size) {
// Check if the component spans multiple elements.
if (Offset + Size > EltSize)
return false;
- return TypeHasComponent(EltTy, Offset, Size);
+ return TypeHasComponent(EltTy, Offset, Size, DL);
}
/// RewriteForScalarRepl - Alloca AI is being split into NewElts, so rewrite
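Aside: the array case of TypeHasComponent in the hunk above, reduced to arithmetic. The access must start inside the array, and after normalizing the offset into a single element it must not straddle an element boundary. A standalone sketch (the real code then recurses into the element type):

#include <cstdint>

static bool fitsInElement(uint64_t Offset, uint64_t Size, uint64_t EltSize,
                          uint64_t NumElts) {
  if (Offset >= NumElts * EltSize)
    return false;                    // starts past the end of the array
  Offset %= EltSize;                 // offset within its element
  return Offset + Size <= EltSize;   // must not span two elements
}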
@@ -1876,6 +1881,7 @@ bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size) {
/// instruction.
void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
SmallVectorImpl<AllocaInst *> &NewElts) {
+ const DataLayout &DL = I->getModule()->getDataLayout();
for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E;) {
Use &TheUse = *UI++;
Instruction *User = cast<Instruction>(TheUse.getUser());
@@ -1893,8 +1899,7 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) {
ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength());
uint64_t MemSize = Length->getZExtValue();
- if (Offset == 0 &&
- MemSize == DL->getTypeAllocSize(AI->getAllocatedType()))
+ if (Offset == 0 && MemSize == DL.getTypeAllocSize(AI->getAllocatedType()))
RewriteMemIntrinUserOfAlloca(MI, I, AI, NewElts);
// Otherwise the intrinsic can only touch a single element and the
// address operand will be updated, so nothing else needs to be done.
@@ -1930,8 +1935,8 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
LI->replaceAllUsesWith(Insert);
DeadInsts.push_back(LI);
} else if (LIType->isIntegerTy() &&
- DL->getTypeAllocSize(LIType) ==
- DL->getTypeAllocSize(AI->getAllocatedType())) {
+ DL.getTypeAllocSize(LIType) ==
+ DL.getTypeAllocSize(AI->getAllocatedType())) {
// If this is a load of the entire alloca to an integer, rewrite it.
RewriteLoadUserOfWholeAlloca(LI, AI, NewElts);
}
@@ -1957,8 +1962,8 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
}
DeadInsts.push_back(SI);
} else if (SIType->isIntegerTy() &&
- DL->getTypeAllocSize(SIType) ==
- DL->getTypeAllocSize(AI->getAllocatedType())) {
+ DL.getTypeAllocSize(SIType) ==
+ DL.getTypeAllocSize(AI->getAllocatedType())) {
// If this is a store of the entire alloca from an integer, rewrite it.
RewriteStoreUserOfWholeAlloca(SI, AI, NewElts);
}
@@ -2001,7 +2006,8 @@ void SROA::RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset,
Type *T = AI->getAllocatedType();
uint64_t EltOffset = 0;
Type *IdxTy;
- uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy);
+ uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy,
+ BC->getModule()->getDataLayout());
Instruction *Val = NewElts[Idx];
if (Val->getType() != BC->getDestTy()) {
Val = new BitCastInst(Val, BC->getDestTy(), "", BC);
@@ -2016,11 +2022,12 @@ void SROA::RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset,
/// Sets T to the type of the element and Offset to the offset within that
/// element. IdxTy is set to the type of the index result to be used in a
/// GEP instruction.
-uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset,
- Type *&IdxTy) {
+uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset, Type *&IdxTy,
+ const DataLayout &DL) {
uint64_t Idx = 0;
+
if (StructType *ST = dyn_cast<StructType>(T)) {
- const StructLayout *Layout = DL->getStructLayout(ST);
+ const StructLayout *Layout = DL.getStructLayout(ST);
Idx = Layout->getElementContainingOffset(Offset);
T = ST->getContainedType(Idx);
Offset -= Layout->getElementOffset(Idx);
@@ -2028,7 +2035,7 @@ uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset,
return Idx;
} else if (ArrayType *AT = dyn_cast<ArrayType>(T)) {
T = AT->getElementType();
- uint64_t EltSize = DL->getTypeAllocSize(T);
+ uint64_t EltSize = DL.getTypeAllocSize(T);
Idx = Offset / EltSize;
Offset -= Idx * EltSize;
IdxTy = Type::getInt64Ty(T->getContext());
@@ -2036,7 +2043,7 @@ uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset,
}
VectorType *VT = cast<VectorType>(T);
T = VT->getElementType();
- uint64_t EltSize = DL->getTypeAllocSize(T);
+ uint64_t EltSize = DL.getTypeAllocSize(T);
Idx = Offset / EltSize;
Offset -= Idx * EltSize;
IdxTy = Type::getInt64Ty(T->getContext());
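Aside: the array/vector case of FindElementAndOffset in isolation - split a byte offset into an element index plus the remaining offset inside that element. A standalone sketch:

#include <cstdint>

static uint64_t findElement(uint64_t EltSize, uint64_t &Offset) {
  uint64_t Idx = Offset / EltSize;
  Offset -= Idx * EltSize; // Offset is now relative to element Idx
  return Idx;
}
// e.g. with EltSize = 8 and Offset = 20: returns 2, Offset becomes 4.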
@@ -2049,6 +2056,7 @@ uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset,
void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
SmallVectorImpl<AllocaInst *> &NewElts) {
uint64_t OldOffset = Offset;
+ const DataLayout &DL = GEPI->getModule()->getDataLayout();
SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end());
// If the GEP was dynamic then it must have been a dynamic vector lookup.
// In this case, it must be the last GEP operand which is dynamic so keep that
@@ -2057,19 +2065,19 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
Value* NonConstantIdx = nullptr;
if (!GEPI->hasAllConstantIndices())
NonConstantIdx = Indices.pop_back_val();
- Offset += DL->getIndexedOffset(GEPI->getPointerOperandType(), Indices);
+ Offset += DL.getIndexedOffset(GEPI->getPointerOperandType(), Indices);
RewriteForScalarRepl(GEPI, AI, Offset, NewElts);
Type *T = AI->getAllocatedType();
Type *IdxTy;
- uint64_t OldIdx = FindElementAndOffset(T, OldOffset, IdxTy);
+ uint64_t OldIdx = FindElementAndOffset(T, OldOffset, IdxTy, DL);
if (GEPI->getOperand(0) == AI)
OldIdx = ~0ULL; // Force the GEP to be rewritten.
T = AI->getAllocatedType();
uint64_t EltOffset = Offset;
- uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy);
+ uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy, DL);
// If this GEP does not move the pointer across elements of the alloca
// being split, then it does not need to be rewritten.
@@ -2080,7 +2088,7 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
SmallVector<Value*, 8> NewArgs;
NewArgs.push_back(Constant::getNullValue(i32Ty));
while (EltOffset != 0) {
- uint64_t EltIdx = FindElementAndOffset(T, EltOffset, IdxTy);
+ uint64_t EltIdx = FindElementAndOffset(T, EltOffset, IdxTy, DL);
NewArgs.push_back(ConstantInt::get(IdxTy, EltIdx));
}
if (NonConstantIdx) {
@@ -2114,9 +2122,10 @@ void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI,
// Put matching lifetime markers on everything from Offset up to
// Offset+OldSize.
Type *AIType = AI->getAllocatedType();
+ const DataLayout &DL = II->getModule()->getDataLayout();
uint64_t NewOffset = Offset;
Type *IdxTy;
- uint64_t Idx = FindElementAndOffset(AIType, NewOffset, IdxTy);
+ uint64_t Idx = FindElementAndOffset(AIType, NewOffset, IdxTy, DL);
IRBuilder<> Builder(II);
uint64_t Size = OldSize->getLimitedValue();
@@ -2129,7 +2138,7 @@ void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI,
V = Builder.CreateGEP(V, Builder.getInt64(NewOffset));
IdxTy = NewElts[Idx]->getAllocatedType();
- uint64_t EltSize = DL->getTypeAllocSize(IdxTy) - NewOffset;
+ uint64_t EltSize = DL.getTypeAllocSize(IdxTy) - NewOffset;
if (EltSize > Size) {
EltSize = Size;
Size = 0;
@@ -2145,7 +2154,7 @@ void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI,
for (; Idx != NewElts.size() && Size; ++Idx) {
IdxTy = NewElts[Idx]->getAllocatedType();
- uint64_t EltSize = DL->getTypeAllocSize(IdxTy);
+ uint64_t EltSize = DL.getTypeAllocSize(IdxTy);
if (EltSize > Size) {
EltSize = Size;
Size = 0;
@@ -2221,6 +2230,7 @@ SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
bool SROADest = MI->getRawDest() == Inst;
Constant *Zero = Constant::getNullValue(Type::getInt32Ty(MI->getContext()));
+ const DataLayout &DL = MI->getModule()->getDataLayout();
for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
// If this is a memcpy/memmove, emit a GEP of the other element address.
@@ -2237,10 +2247,10 @@ SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
PointerType *OtherPtrTy = cast<PointerType>(OtherPtr->getType());
Type *OtherTy = OtherPtrTy->getElementType();
if (StructType *ST = dyn_cast<StructType>(OtherTy)) {
- EltOffset = DL->getStructLayout(ST)->getElementOffset(i);
+ EltOffset = DL.getStructLayout(ST)->getElementOffset(i);
} else {
Type *EltTy = cast<SequentialType>(OtherTy)->getElementType();
- EltOffset = DL->getTypeAllocSize(EltTy)*i;
+ EltOffset = DL.getTypeAllocSize(EltTy) * i;
}
// The alignment of the other pointer is the guaranteed alignment of the
@@ -2281,7 +2291,7 @@ SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
Type *ValTy = EltTy->getScalarType();
// Construct an integer with the right value.
- unsigned EltSize = DL->getTypeSizeInBits(ValTy);
+ unsigned EltSize = DL.getTypeSizeInBits(ValTy);
APInt OneVal(EltSize, CI->getZExtValue());
APInt TotalVal(OneVal);
// Set each byte.
@@ -2311,7 +2321,7 @@ SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
// this element.
}
- unsigned EltSize = DL->getTypeAllocSize(EltTy);
+ unsigned EltSize = DL.getTypeAllocSize(EltTy);
if (!EltSize)
continue;
@@ -2345,12 +2355,13 @@ SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
// and store the element value to the individual alloca.
Value *SrcVal = SI->getOperand(0);
Type *AllocaEltTy = AI->getAllocatedType();
- uint64_t AllocaSizeBits = DL->getTypeAllocSizeInBits(AllocaEltTy);
+ const DataLayout &DL = SI->getModule()->getDataLayout();
+ uint64_t AllocaSizeBits = DL.getTypeAllocSizeInBits(AllocaEltTy);
IRBuilder<> Builder(SI);
// Handle tail padding by extending the operand
- if (DL->getTypeSizeInBits(SrcVal->getType()) != AllocaSizeBits)
+ if (DL.getTypeSizeInBits(SrcVal->getType()) != AllocaSizeBits)
SrcVal = Builder.CreateZExt(SrcVal,
IntegerType::get(SI->getContext(), AllocaSizeBits));
@@ -2360,15 +2371,15 @@ SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
// There are two forms here: AI could be an array or struct. Both cases
// have different ways to compute the element offset.
if (StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) {
- const StructLayout *Layout = DL->getStructLayout(EltSTy);
+ const StructLayout *Layout = DL.getStructLayout(EltSTy);
for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
// Get the number of bits to shift SrcVal to get the value.
Type *FieldTy = EltSTy->getElementType(i);
uint64_t Shift = Layout->getElementOffsetInBits(i);
- if (DL->isBigEndian())
- Shift = AllocaSizeBits-Shift-DL->getTypeAllocSizeInBits(FieldTy);
+ if (DL.isBigEndian())
+ Shift = AllocaSizeBits - Shift - DL.getTypeAllocSizeInBits(FieldTy);
Value *EltVal = SrcVal;
if (Shift) {
@@ -2377,7 +2388,7 @@ SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
}
// Truncate down to an integer of the right size.
- uint64_t FieldSizeBits = DL->getTypeSizeInBits(FieldTy);
+ uint64_t FieldSizeBits = DL.getTypeSizeInBits(FieldTy);
// Ignore zero sized fields like {}, they obviously contain no data.
if (FieldSizeBits == 0) continue;
@@ -2402,12 +2413,12 @@ SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
} else {
ArrayType *ATy = cast<ArrayType>(AllocaEltTy);
Type *ArrayEltTy = ATy->getElementType();
- uint64_t ElementOffset = DL->getTypeAllocSizeInBits(ArrayEltTy);
- uint64_t ElementSizeBits = DL->getTypeSizeInBits(ArrayEltTy);
+ uint64_t ElementOffset = DL.getTypeAllocSizeInBits(ArrayEltTy);
+ uint64_t ElementSizeBits = DL.getTypeSizeInBits(ArrayEltTy);
uint64_t Shift;
- if (DL->isBigEndian())
+ if (DL.isBigEndian())
Shift = AllocaSizeBits-ElementOffset;
else
Shift = 0;
@@ -2441,7 +2452,7 @@ SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
}
new StoreInst(EltVal, DestField, SI);
- if (DL->isBigEndian())
+ if (DL.isBigEndian())
Shift -= ElementOffset;
else
Shift += ElementOffset;
@@ -2459,7 +2470,8 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
// Extract each element out of the NewElts according to its structure offset
// and form the result value.
Type *AllocaEltTy = AI->getAllocatedType();
- uint64_t AllocaSizeBits = DL->getTypeAllocSizeInBits(AllocaEltTy);
+ const DataLayout &DL = LI->getModule()->getDataLayout();
+ uint64_t AllocaSizeBits = DL.getTypeAllocSizeInBits(AllocaEltTy);
DEBUG(dbgs() << "PROMOTING LOAD OF WHOLE ALLOCA: " << *AI << '\n' << *LI
<< '\n');
@@ -2469,10 +2481,10 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
const StructLayout *Layout = nullptr;
uint64_t ArrayEltBitOffset = 0;
if (StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) {
- Layout = DL->getStructLayout(EltSTy);
+ Layout = DL.getStructLayout(EltSTy);
} else {
Type *ArrayEltTy = cast<ArrayType>(AllocaEltTy)->getElementType();
- ArrayEltBitOffset = DL->getTypeAllocSizeInBits(ArrayEltTy);
+ ArrayEltBitOffset = DL.getTypeAllocSizeInBits(ArrayEltTy);
}
Value *ResultVal =
@@ -2484,7 +2496,7 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
Value *SrcField = NewElts[i];
Type *FieldTy =
cast<PointerType>(SrcField->getType())->getElementType();
- uint64_t FieldSizeBits = DL->getTypeSizeInBits(FieldTy);
+ uint64_t FieldSizeBits = DL.getTypeSizeInBits(FieldTy);
// Ignore zero sized fields like {}, they obviously contain no data.
if (FieldSizeBits == 0) continue;
@@ -2515,7 +2527,7 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
else // Array case.
Shift = i*ArrayEltBitOffset;
- if (DL->isBigEndian())
+ if (DL.isBigEndian())
Shift = AllocaSizeBits-Shift-FieldIntTy->getBitWidth();
if (Shift) {
@@ -2532,7 +2544,7 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
}
// Handle tail padding by truncating the result
- if (DL->getTypeSizeInBits(LI->getType()) != AllocaSizeBits)
+ if (DL.getTypeSizeInBits(LI->getType()) != AllocaSizeBits)
ResultVal = new TruncInst(ResultVal, LI->getType(), "", LI);
LI->replaceAllUsesWith(ResultVal);
@@ -2589,13 +2601,15 @@ bool SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) {
return false;
}
+ const DataLayout &DL = AI->getModule()->getDataLayout();
+
// Okay, we know all the users are promotable. If the aggregate is a memcpy
// source and destination, we have to be careful. In particular, the memcpy
// could be moving around elements that live in structure padding of the LLVM
// types, but may actually be used. In these cases, we refuse to promote the
// struct.
if (Info.isMemCpySrc && Info.isMemCpyDst &&
- HasPadding(AI->getAllocatedType(), *DL))
+ HasPadding(AI->getAllocatedType(), DL))
return false;
// If the alloca never has an access to just *part* of it, but is accessed
diff --git a/lib/Transforms/Scalar/Scalarizer.cpp b/lib/Transforms/Scalar/Scalarizer.cpp
index 6036c09..a457cba 100644
--- a/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/lib/Transforms/Scalar/Scalarizer.cpp
@@ -165,7 +165,7 @@ private:
void gather(Instruction *, const ValueVector &);
bool canTransferMetadata(unsigned Kind);
void transferMetadata(Instruction *, const ValueVector &);
- bool getVectorLayout(Type *, unsigned, VectorLayout &);
+ bool getVectorLayout(Type *, unsigned, VectorLayout &, const DataLayout &);
bool finish();
template<typename T> bool splitBinary(Instruction &, const T &);
@@ -173,7 +173,6 @@ private:
ScatterMap Scattered;
GatherList Gathered;
unsigned ParallelLoopAccessMDKind;
- const DataLayout *DL;
bool ScalarizeLoadStore;
};
@@ -248,8 +247,6 @@ bool Scalarizer::doInitialization(Module &M) {
}
bool Scalarizer::runOnFunction(Function &F) {
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- DL = DLP ? &DLP->getDataLayout() : nullptr;
for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE; ++BBI) {
BasicBlock *BB = BBI;
for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;) {
@@ -345,10 +342,7 @@ void Scalarizer::transferMetadata(Instruction *Op, const ValueVector &CV) {
// Try to fill in Layout from Ty, returning true on success. Alignment is
// the alignment of the vector, or 0 if the ABI default should be used.
bool Scalarizer::getVectorLayout(Type *Ty, unsigned Alignment,
- VectorLayout &Layout) {
- if (!DL)
- return false;
-
+ VectorLayout &Layout, const DataLayout &DL) {
// Make sure we're dealing with a vector.
Layout.VecTy = dyn_cast<VectorType>(Ty);
if (!Layout.VecTy)
@@ -356,15 +350,15 @@ bool Scalarizer::getVectorLayout(Type *Ty, unsigned Alignment,
// Check that we're dealing with full-byte elements.
Layout.ElemTy = Layout.VecTy->getElementType();
- if (DL->getTypeSizeInBits(Layout.ElemTy) !=
- DL->getTypeStoreSizeInBits(Layout.ElemTy))
+ if (DL.getTypeSizeInBits(Layout.ElemTy) !=
+ DL.getTypeStoreSizeInBits(Layout.ElemTy))
return false;
if (Alignment)
Layout.VecAlign = Alignment;
else
- Layout.VecAlign = DL->getABITypeAlignment(Layout.VecTy);
- Layout.ElemSize = DL->getTypeStoreSize(Layout.ElemTy);
+ Layout.VecAlign = DL.getABITypeAlignment(Layout.VecTy);
+ Layout.ElemSize = DL.getTypeStoreSize(Layout.ElemTy);
return true;
}
@@ -456,7 +450,7 @@ bool Scalarizer::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
Indices.resize(NumIndices);
for (unsigned J = 0; J < NumIndices; ++J)
Indices[J] = Ops[J][I];
- Res[I] = Builder.CreateGEP(Base[I], Indices,
+ Res[I] = Builder.CreateGEP(GEPI.getSourceElementType(), Base[I], Indices,
GEPI.getName() + ".i" + Twine(I));
if (GEPI.isInBounds())
if (GetElementPtrInst *NewGEPI = dyn_cast<GetElementPtrInst>(Res[I]))
@@ -595,7 +589,8 @@ bool Scalarizer::visitLoadInst(LoadInst &LI) {
return false;
VectorLayout Layout;
- if (!getVectorLayout(LI.getType(), LI.getAlignment(), Layout))
+ if (!getVectorLayout(LI.getType(), LI.getAlignment(), Layout,
+ LI.getModule()->getDataLayout()))
return false;
unsigned NumElems = Layout.VecTy->getNumElements();
@@ -619,7 +614,8 @@ bool Scalarizer::visitStoreInst(StoreInst &SI) {
VectorLayout Layout;
Value *FullValue = SI.getValueOperand();
- if (!getVectorLayout(FullValue->getType(), SI.getAlignment(), Layout))
+ if (!getVectorLayout(FullValue->getType(), SI.getAlignment(), Layout,
+ SI.getModule()->getDataLayout()))
return false;
unsigned NumElems = Layout.VecTy->getNumElements();
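Aside: the core of the getVectorLayout check, as a hedged sketch. An element type qualifies here only when it is "full-byte", i.e. its bit size equals its store size in bits (true for i32, false for i1), which is what makes per-element loads and stores byte-addressable:

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

static bool hasFullByteElements(VectorType *VecTy, const DataLayout &DL) {
  Type *ElemTy = VecTy->getElementType();
  return DL.getTypeSizeInBits(ElemTy) == DL.getTypeStoreSizeInBits(ElemTy);
}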
diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index bffe8df..1a04d74 100644
--- a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -199,18 +199,15 @@ class ConstantOffsetExtractor {
/// new index representing the remainder (equal to the original index minus
/// the constant offset), or nullptr if we cannot extract a constant offset.
/// \p Idx The given GEP index
- /// \p DL The datalayout of the module
/// \p GEP The given GEP
- static Value *Extract(Value *Idx, const DataLayout *DL,
- GetElementPtrInst *GEP);
+ static Value *Extract(Value *Idx, GetElementPtrInst *GEP);
/// Looks for a constant offset from the given GEP index without extracting
/// it. It returns the numeric value of the extracted constant offset (0 if
/// failed). The meaning of the arguments are the same as Extract.
- static int64_t Find(Value *Idx, const DataLayout *DL, GetElementPtrInst *GEP);
+ static int64_t Find(Value *Idx, GetElementPtrInst *GEP);
private:
- ConstantOffsetExtractor(const DataLayout *Layout, Instruction *InsertionPt)
- : DL(Layout), IP(InsertionPt) {}
+ ConstantOffsetExtractor(Instruction *InsertionPt) : IP(InsertionPt) {}
/// Searches the expression that computes V for a non-zero constant C s.t.
/// V can be reassociated into the form V' + C. If the searching is
/// successful, returns C and update UserChain as a def-use chain from C to V;
@@ -294,8 +291,6 @@ class ConstantOffsetExtractor {
/// A data structure used in rebuildWithoutConstOffset. Contains all
/// sext/zext instructions along UserChain.
SmallVector<CastInst *, 16> ExtInsts;
- /// The data layout of the module. Used in ComputeKnownBits.
- const DataLayout *DL;
Instruction *IP; /// Insertion position of cloned instructions.
};
@@ -312,19 +307,10 @@ class SeparateConstOffsetFromGEP : public FunctionPass {
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<DataLayoutPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.setPreservesCFG();
}
- bool doInitialization(Module &M) override {
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- if (DLP == nullptr)
- report_fatal_error("data layout missing");
- DL = &DLP->getDataLayout();
- return false;
- }
-
bool runOnFunction(Function &F) override;
private:
@@ -372,7 +358,6 @@ class SeparateConstOffsetFromGEP : public FunctionPass {
/// Verified in @i32_add in split-gep.ll
bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP);
- const DataLayout *DL;
const TargetMachine *TM;
/// Whether to lower a GEP with multiple indices into arithmetic operations or
/// multiple GEPs with a single index.
@@ -386,7 +371,6 @@ INITIALIZE_PASS_BEGIN(
"Split GEPs to a variadic base and a constant offset for better CSE", false,
false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DataLayoutPass)
INITIALIZE_PASS_END(
SeparateConstOffsetFromGEP, "separate-const-offset-from-gep",
"Split GEPs to a variadic base and a constant offset for better CSE", false,
@@ -647,9 +631,8 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
return BO;
}
-Value *ConstantOffsetExtractor::Extract(Value *Idx, const DataLayout *DL,
- GetElementPtrInst *GEP) {
- ConstantOffsetExtractor Extractor(DL, GEP);
+Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP) {
+ ConstantOffsetExtractor Extractor(GEP);
// Find a non-zero constant offset first.
APInt ConstantOffset =
Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
@@ -660,10 +643,9 @@ Value *ConstantOffsetExtractor::Extract(Value *Idx, const DataLayout *DL,
return Extractor.rebuildWithoutConstOffset();
}
-int64_t ConstantOffsetExtractor::Find(Value *Idx, const DataLayout *DL,
- GetElementPtrInst *GEP) {
+int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP) {
// If Idx is an index of an inbound GEP, Idx is guaranteed to be non-negative.
- return ConstantOffsetExtractor(DL, GEP)
+ return ConstantOffsetExtractor(GEP)
.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
GEP->isInBounds())
.getSExtValue();
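A worked example of the Extract/Find contract (hypothetical IR, invented
names):

  %j   = add nsw i64 %i, 12
  %gep = getelementptr inbounds float, float* %p, i64 %j

Find(%j, %gep) returns 12, which accumulateByteOffset scales to
12 * sizeof(float) = 48 bytes; Extract(%j, %gep) rewrites the index to %i so
that splitGEP can apply those 48 bytes in a trailing GEP.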
@@ -674,6 +656,7 @@ void ConstantOffsetExtractor::ComputeKnownBits(Value *V, APInt &KnownOne,
IntegerType *IT = cast<IntegerType>(V->getType());
KnownOne = APInt(IT->getBitWidth(), 0);
KnownZero = APInt(IT->getBitWidth(), 0);
+ const DataLayout &DL = IP->getModule()->getDataLayout();
llvm::computeKnownBits(V, KnownZero, KnownOne, DL, 0);
}
@@ -689,7 +672,8 @@ bool ConstantOffsetExtractor::NoCommonBits(Value *LHS, Value *RHS) const {
bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToPointerSize(
GetElementPtrInst *GEP) {
bool Changed = false;
- Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
+ const DataLayout &DL = GEP->getModule()->getDataLayout();
+ Type *IntPtrTy = DL.getIntPtrType(GEP->getType());
gep_type_iterator GTI = gep_type_begin(*GEP);
for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end();
I != E; ++I, ++GTI) {
@@ -710,18 +694,19 @@ SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
NeedsExtraction = false;
int64_t AccumulativeByteOffset = 0;
gep_type_iterator GTI = gep_type_begin(*GEP);
+ const DataLayout &DL = GEP->getModule()->getDataLayout();
for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
if (isa<SequentialType>(*GTI)) {
// Tries to extract a constant offset from this GEP index.
int64_t ConstantOffset =
- ConstantOffsetExtractor::Find(GEP->getOperand(I), DL, GEP);
+ ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP);
if (ConstantOffset != 0) {
NeedsExtraction = true;
// A GEP may have multiple indices. We accumulate the extracted
// constant offset to a byte offset, and later offset the remainder of
// the original GEP with this byte offset.
AccumulativeByteOffset +=
- ConstantOffset * DL->getTypeAllocSize(GTI.getIndexedType());
+ ConstantOffset * DL.getTypeAllocSize(GTI.getIndexedType());
}
} else if (LowerGEP) {
StructType *StTy = cast<StructType>(*GTI);
@@ -730,7 +715,7 @@ SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
if (Field != 0) {
NeedsExtraction = true;
AccumulativeByteOffset +=
- DL->getStructLayout(StTy)->getElementOffset(Field);
+ DL.getStructLayout(StTy)->getElementOffset(Field);
}
}
}
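A short worked example of this accumulation (hypothetical type, assuming
4-byte int and no padding): for a GEP computing &s->b[i + 3] over
struct S { int a; int b[10]; }, the sequential index contributes
3 * sizeof(int) = 12 bytes, and (when LowerGEP is set) the field b
contributes its struct offset of 4 bytes, so AccumulativeByteOffset = 16.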
@@ -740,7 +725,8 @@ SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
GetElementPtrInst *Variadic, int64_t AccumulativeByteOffset) {
IRBuilder<> Builder(Variadic);
- Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
+ const DataLayout &DL = Variadic->getModule()->getDataLayout();
+ Type *IntPtrTy = DL.getIntPtrType(Variadic->getType());
Type *I8PtrTy =
Builder.getInt8PtrTy(Variadic->getType()->getPointerAddressSpace());
@@ -760,7 +746,7 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
continue;
APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
- DL->getTypeAllocSize(GTI.getIndexedType()));
+ DL.getTypeAllocSize(GTI.getIndexedType()));
// Scale the index by element size.
if (ElementSize != 1) {
if (ElementSize.isPowerOf2()) {
@@ -791,7 +777,8 @@ void
SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
int64_t AccumulativeByteOffset) {
IRBuilder<> Builder(Variadic);
- Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
+ const DataLayout &DL = Variadic->getModule()->getDataLayout();
+ Type *IntPtrTy = DL.getIntPtrType(Variadic->getType());
Value *ResultPtr = Builder.CreatePtrToInt(Variadic->getOperand(0), IntPtrTy);
gep_type_iterator GTI = gep_type_begin(*Variadic);
@@ -807,7 +794,7 @@ SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
continue;
APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
- DL->getTypeAllocSize(GTI.getIndexedType()));
+ DL.getTypeAllocSize(GTI.getIndexedType()));
// Scale the index by element size.
if (ElementSize != 1) {
if (ElementSize.isPowerOf2()) {
@@ -880,8 +867,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
if (isa<SequentialType>(*GTI)) {
// Splits this GEP index into a variadic part and a constant offset, and
// uses the variadic part as the new index.
- Value *NewIdx =
- ConstantOffsetExtractor::Extract(GEP->getOperand(I), DL, GEP);
+ Value *NewIdx = ConstantOffsetExtractor::Extract(GEP->getOperand(I), GEP);
if (NewIdx != nullptr) {
GEP->setOperand(I, NewIdx);
}
@@ -958,15 +944,17 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
// Per ANSI C standard, signed / unsigned = unsigned and signed % unsigned =
// unsigned. Therefore, we cast ElementTypeSizeOfGEP to signed because it is
// used together with the signed AccumulativeByteOffset below.
+ const DataLayout &DL = GEP->getModule()->getDataLayout();
int64_t ElementTypeSizeOfGEP = static_cast<int64_t>(
- DL->getTypeAllocSize(GEP->getType()->getElementType()));
- Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
+ DL.getTypeAllocSize(GEP->getType()->getElementType()));
+ Type *IntPtrTy = DL.getIntPtrType(GEP->getType());
if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) {
// Very likely. As long as %gep is naturally aligned, the byte offset we
// extracted should be a multiple of sizeof(*%gep).
int64_t Index = AccumulativeByteOffset / ElementTypeSizeOfGEP;
- NewGEP = GetElementPtrInst::Create(
- NewGEP, ConstantInt::get(IntPtrTy, Index, true), GEP->getName(), GEP);
+ NewGEP = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP,
+ ConstantInt::get(IntPtrTy, Index, true),
+ GEP->getName(), GEP);
} else {
// Unlikely but possible. For example,
// #pragma pack(1)
@@ -986,8 +974,9 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
GEP->getPointerAddressSpace());
NewGEP = new BitCastInst(NewGEP, I8PtrTy, "", GEP);
NewGEP = GetElementPtrInst::Create(
- NewGEP, ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true),
- "uglygep", GEP);
+ Type::getInt8Ty(GEP->getContext()), NewGEP,
+ ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), "uglygep",
+ GEP);
if (GEP->getType() != I8PtrTy)
NewGEP = new BitCastInst(NewGEP, GEP->getType(), GEP->getName(), GEP);
}
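Worked instances of the two cases (illustrative numbers): with
sizeof(*%gep) = 8 and AccumulativeByteOffset = 24, the offset becomes an
index of 24 / 8 = 3 typed elements past NewGEP; with
AccumulativeByteOffset = 20 (possible under #pragma pack(1)), 20 % 8 != 0,
so the pass bitcasts to i8* and emits the byte-indexed "uglygep" instead.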
diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index fb8fe38..8566cd9 100644
--- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -127,7 +127,7 @@ static bool mergeEmptyReturnBlocks(Function &F) {
/// iterativelySimplifyCFG - Call SimplifyCFG on all the blocks in the function,
/// iterating until no more changes are made.
static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
- const DataLayout *DL, AssumptionCache *AC,
+ AssumptionCache *AC,
unsigned BonusInstThreshold) {
bool Changed = false;
bool LocalChange = true;
@@ -137,7 +137,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
// Loop over all of the basic blocks and remove them if they are unneeded...
//
for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) {
- if (SimplifyCFG(BBIt++, TTI, BonusInstThreshold, DL, AC)) {
+ if (SimplifyCFG(BBIt++, TTI, BonusInstThreshold, AC)) {
LocalChange = true;
++NumSimpl;
}
@@ -148,11 +148,10 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
}
static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
- const DataLayout *DL, AssumptionCache *AC,
- int BonusInstThreshold) {
+ AssumptionCache *AC, int BonusInstThreshold) {
bool EverChanged = removeUnreachableBlocks(F);
EverChanged |= mergeEmptyReturnBlocks(F);
- EverChanged |= iterativelySimplifyCFG(F, TTI, DL, AC, BonusInstThreshold);
+ EverChanged |= iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold);
// If neither pass changed anything, we're done.
if (!EverChanged) return false;
@@ -166,7 +165,7 @@ static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
return true;
do {
- EverChanged = iterativelySimplifyCFG(F, TTI, DL, AC, BonusInstThreshold);
+ EverChanged = iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold);
EverChanged |= removeUnreachableBlocks(F);
} while (EverChanged);
@@ -181,11 +180,10 @@ SimplifyCFGPass::SimplifyCFGPass(int BonusInstThreshold)
PreservedAnalyses SimplifyCFGPass::run(Function &F,
AnalysisManager<Function> *AM) {
- auto *DL = F.getParent()->getDataLayout();
auto &TTI = AM->getResult<TargetIRAnalysis>(F);
auto &AC = AM->getResult<AssumptionAnalysis>(F);
- if (!simplifyFunctionCFG(F, TTI, DL, &AC, BonusInstThreshold))
+ if (!simplifyFunctionCFG(F, TTI, &AC, BonusInstThreshold))
return PreservedAnalyses::none();
return PreservedAnalyses::all();
@@ -207,9 +205,7 @@ struct CFGSimplifyPass : public FunctionPass {
&getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
const TargetTransformInfo &TTI =
getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
- return simplifyFunctionCFG(F, TTI, DL, AC, BonusInstThreshold);
+ return simplifyFunctionCFG(F, TTI, AC, BonusInstThreshold);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp
index d0ee0a6..b169d56 100644
--- a/lib/Transforms/Scalar/Sink.cpp
+++ b/lib/Transforms/Scalar/Sink.cpp
@@ -21,6 +21,7 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -35,7 +36,6 @@ namespace {
DominatorTree *DT;
LoopInfo *LI;
AliasAnalysis *AA;
- const DataLayout *DL;
public:
static char ID; // Pass identification
@@ -100,8 +100,6 @@ bool Sinking::runOnFunction(Function &F) {
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
AA = &getAnalysis<AliasAnalysis>();
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- DL = DLP ? &DLP->getDataLayout() : nullptr;
bool MadeChange, EverMadeChange = false;
@@ -196,7 +194,7 @@ bool Sinking::IsAcceptableTarget(Instruction *Inst,
if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) {
// We cannot sink a load across a critical edge - there may be stores in
// other code paths.
- if (!isSafeToSpeculativelyExecute(Inst, DL))
+ if (!isSafeToSpeculativelyExecute(Inst))
return false;
// We don't want to sink across a critical edge if we don't dominate the
diff --git a/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index 4edc86c..e71031c 100644
--- a/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -15,19 +15,30 @@
//
// There are many optimizations we can perform in the domain of SLSR. This file
// for now contains only an initial step. Specifically, we look for strength
-// reduction candidate in the form of
+// reduction candidates in two forms:
//
-// (B + i) * S
+// Form 1: (B + i) * S
+// Form 2: &B[i * S]
//
-// where B and S are integer constants or variables, and i is a constant
-// integer. If we found two such candidates
+// where S is an integer variable, and i is a constant integer. If we find two
+// candidates
//
-// S1: X = (B + i) * S S2: Y = (B + i') * S
+// S1: X = (B + i) * S
+// S2: Y = (B + i') * S
+//
+// or
+//
+// S1: X = &B[i * S]
+// S2: Y = &B[i' * S]
//
// and S1 dominates S2, we call S1 a basis of S2, and can replace S2 with
//
// Y = X + (i' - i) * S
//
+// or
+//
+// Y = &X[(i' - i) * S]
+//
// where (i' - i) * S is folded to the extent possible. When S2 has multiple
// bases, we pick the one that is closest to S2, or S2's "immediate" basis.
//
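A concrete C-level illustration of both forms (assumed 64-bit longs, invented
names):

  long x = (b + 2) * s;          // S1: i  = 2
  long y = (b + 4) * s;          // S2: i' = 4
  // With S1 dominating S2, SLSR rewrites S2 as:
  long y2 = x + 2 * s;           // (i' - i) * S = (4 - 2) * s

  long *p1 = &q[2 * s];          // Form 2 basis
  long *p2 = &q[4 * s];          // rewritten to &p1[2 * s]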
@@ -35,8 +46,6 @@
//
// - Handle candidates in the form of B + i * S
//
-// - Handle candidates in the form of pointer arithmetics. e.g., B[i * S]
-//
// - Floating point arithmetic when fast math is enabled.
//
// - SLSR may decrease ILP at the architecture level. Targets that are very
@@ -45,6 +54,10 @@
#include <vector>
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
@@ -58,14 +71,30 @@ using namespace PatternMatch;
namespace {
class StraightLineStrengthReduce : public FunctionPass {
- public:
+public:
// SLSR candidate. Such a candidate must be in the form of
// (Base + Index) * Stride
+ // or
+ // Base[..][Index * Stride][..]
struct Candidate : public ilist_node<Candidate> {
- Candidate(Value *B = nullptr, ConstantInt *Idx = nullptr,
- Value *S = nullptr, Instruction *I = nullptr)
- : Base(B), Index(Idx), Stride(S), Ins(I), Basis(nullptr) {}
- Value *Base;
+ enum Kind {
+ Invalid, // reserved for the default constructor
+ Mul, // (B + i) * S
+ GEP, // &B[..][i * S][..]
+ };
+
+ Candidate()
+ : CandidateKind(Invalid), Base(nullptr), Index(nullptr),
+ Stride(nullptr), Ins(nullptr), Basis(nullptr) {}
+ Candidate(Kind CT, const SCEV *B, ConstantInt *Idx, Value *S,
+ Instruction *I)
+ : CandidateKind(CT), Base(B), Index(Idx), Stride(S), Ins(I),
+ Basis(nullptr) {}
+ Kind CandidateKind;
+ const SCEV *Base;
+ // Note that Index and Stride of a GEP candidate may not have the same
+ // integer type. In that case, during rewriting, Stride will be
+ // sign-extended or truncated to Index's type.
ConstantInt *Index;
Value *Stride;
// The instruction this candidate corresponds to. It helps us to rewrite a
@@ -90,33 +119,70 @@ class StraightLineStrengthReduce : public FunctionPass {
static char ID;
- StraightLineStrengthReduce() : FunctionPass(ID), DT(nullptr) {
+ StraightLineStrengthReduce()
+ : FunctionPass(ID), DL(nullptr), DT(nullptr), TTI(nullptr) {
initializeStraightLineStrengthReducePass(*PassRegistry::getPassRegistry());
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolution>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
// We do not modify the shape of the CFG.
AU.setPreservesCFG();
}
+ bool doInitialization(Module &M) override {
+ DL = &M.getDataLayout();
+ return false;
+ }
+
bool runOnFunction(Function &F) override;
- private:
+private:
// Returns true if Basis is a basis for C, i.e., Basis dominates C and they
// share the same base and stride.
bool isBasisFor(const Candidate &Basis, const Candidate &C);
// Checks whether I is in a candidate form. If so, adds all the matching forms
// to Candidates, and tries to find the immediate basis for each of them.
void allocateCandidateAndFindBasis(Instruction *I);
- // Given that I is in the form of "(B + Idx) * S", adds this form to
- // Candidates, and finds its immediate basis.
- void allocateCandidateAndFindBasis(Value *B, ConstantInt *Idx, Value *S,
+ // Allocate candidates and find bases for Mul instructions.
+ void allocateCandidateAndFindBasisForMul(Instruction *I);
+ // Splits LHS into Base + Index and, if that succeeds, calls
+ // allocateCandidateAndFindBasis.
+ void allocateCandidateAndFindBasisForMul(Value *LHS, Value *RHS,
+ Instruction *I);
+ // Allocate candidates and find bases for GetElementPtr instructions.
+ void allocateCandidateAndFindBasisForGEP(GetElementPtrInst *GEP);
+ // A helper function that scales Idx by ElementSize before invoking
+ // allocateCandidateAndFindBasis.
+ void allocateCandidateAndFindBasisForGEP(const SCEV *B, ConstantInt *Idx,
+ Value *S, uint64_t ElementSize,
+ Instruction *I);
+ // Adds the given form <CT, B, Idx, S> to Candidates, and finds its immediate
+ // basis.
+ void allocateCandidateAndFindBasis(Candidate::Kind CT, const SCEV *B,
+ ConstantInt *Idx, Value *S,
Instruction *I);
// Rewrites candidate C with respect to Basis.
void rewriteCandidateWithBasis(const Candidate &C, const Candidate &Basis);
+ // A helper function that factors ArrayIdx into a product of a stride and a
+ // constant index, and invokes allocateCandidateAndFindBasis with the
+ // factorings.
+ void factorArrayIndex(Value *ArrayIdx, const SCEV *Base, uint64_t ElementSize,
+ GetElementPtrInst *GEP);
+ // Emit code that computes the "bump" from Basis to C. If the candidate is a
+ // GEP and the bump is not divisible by the element size of the GEP, this
+ // function sets the BumpWithUglyGEP flag to notify its caller to bump the
+ // basis using an ugly GEP.
+ static Value *emitBump(const Candidate &Basis, const Candidate &C,
+ IRBuilder<> &Builder, const DataLayout *DL,
+ bool &BumpWithUglyGEP);
+ const DataLayout *DL;
DominatorTree *DT;
+ ScalarEvolution *SE;
+ TargetTransformInfo *TTI;
ilist<Candidate> Candidates;
// Temporarily holds all instructions that are unlinked (but not deleted) by
// rewriteCandidateWithBasis. These instructions will be actually removed
@@ -129,6 +195,8 @@ char StraightLineStrengthReduce::ID = 0;
INITIALIZE_PASS_BEGIN(StraightLineStrengthReduce, "slsr",
"Straight line strength reduction", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(StraightLineStrengthReduce, "slsr",
"Straight line strength reduction", false, false)
@@ -141,9 +209,47 @@ bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis,
return (Basis.Ins != C.Ins && // skip the same instruction
// Basis must dominate C in order to rewrite C with respect to Basis.
DT->dominates(Basis.Ins->getParent(), C.Ins->getParent()) &&
- // They share the same base and stride.
+ // They share the same base, stride, and candidate kind.
Basis.Base == C.Base &&
- Basis.Stride == C.Stride);
+ Basis.Stride == C.Stride &&
+ Basis.CandidateKind == C.CandidateKind);
+}
+
+static bool isCompletelyFoldable(GetElementPtrInst *GEP,
+ const TargetTransformInfo *TTI,
+ const DataLayout *DL) {
+ GlobalVariable *BaseGV = nullptr;
+ int64_t BaseOffset = 0;
+ bool HasBaseReg = false;
+ int64_t Scale = 0;
+
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getPointerOperand()))
+ BaseGV = GV;
+ else
+ HasBaseReg = true;
+
+ gep_type_iterator GTI = gep_type_begin(GEP);
+ for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I, ++GTI) {
+ if (isa<SequentialType>(*GTI)) {
+ int64_t ElementSize = DL->getTypeAllocSize(GTI.getIndexedType());
+ if (ConstantInt *ConstIdx = dyn_cast<ConstantInt>(*I)) {
+ BaseOffset += ConstIdx->getSExtValue() * ElementSize;
+ } else {
+ // Needs a scale register.
+ if (Scale != 0) {
+ // No addressing mode takes two scale registers.
+ return false;
+ }
+ Scale = ElementSize;
+ }
+ } else {
+ StructType *STy = cast<StructType>(*GTI);
+ uint64_t Field = cast<ConstantInt>(*I)->getZExtValue();
+ BaseOffset += DL->getStructLayout(STy)->getElementOffset(Field);
+ }
+ }
+ return TTI->isLegalAddressingMode(GEP->getType()->getElementType(), BaseGV,
+ BaseOffset, HasBaseReg, Scale);
}
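// A sketch of what the walk above reports for a hypothetical GEP over a
// global (invented layout, 4-byte int assumed): for &g.b[i] with
//   struct S { int a; int b[10]; };  extern struct S g;
// it yields BaseGV = @g, BaseOffset = 4 (field b), Scale = 4 (the variable
// index i over 4-byte elements) and HasBaseReg = false; TTI then decides
// whether "g + 4 + 4*i" is a legal addressing mode, in which case the GEP
// is deliberately left alone.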
// TODO: We currently implement an algorithm whose time complexity is linear in
@@ -153,11 +259,17 @@ bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis,
// table is indexed by the base and the stride of a candidate. Therefore,
// finding the immediate basis of a candidate boils down to one hash-table look
// up.
-void StraightLineStrengthReduce::allocateCandidateAndFindBasis(Value *B,
- ConstantInt *Idx,
- Value *S,
- Instruction *I) {
- Candidate C(B, Idx, S, I);
+void StraightLineStrengthReduce::allocateCandidateAndFindBasis(
+ Candidate::Kind CT, const SCEV *B, ConstantInt *Idx, Value *S,
+ Instruction *I) {
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ // If &B[Idx * S] fits into an addressing mode, do not turn it into
+ // non-free computation.
+ if (isCompletelyFoldable(GEP, TTI, DL))
+ return;
+ }
+
+ Candidate C(CT, B, Idx, S, I);
// Try to compute the immediate basis of C.
unsigned NumIterations = 0;
// Limit the scan radius to avoid running forever.
@@ -176,60 +288,209 @@ void StraightLineStrengthReduce::allocateCandidateAndFindBasis(Value *B,
}
void StraightLineStrengthReduce::allocateCandidateAndFindBasis(Instruction *I) {
+ switch (I->getOpcode()) {
+ case Instruction::Mul:
+ allocateCandidateAndFindBasisForMul(I);
+ break;
+ case Instruction::GetElementPtr:
+ allocateCandidateAndFindBasisForGEP(cast<GetElementPtrInst>(I));
+ break;
+ }
+}
+
+void StraightLineStrengthReduce::allocateCandidateAndFindBasisForMul(
+ Value *LHS, Value *RHS, Instruction *I) {
Value *B = nullptr;
ConstantInt *Idx = nullptr;
- // "(Base + Index) * Stride" must be a Mul instruction at the first hand.
- if (I->getOpcode() == Instruction::Mul) {
- if (IntegerType *ITy = dyn_cast<IntegerType>(I->getType())) {
- Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
- for (unsigned Swapped = 0; Swapped < 2; ++Swapped) {
- // Only handle the canonical operand ordering.
- if (match(LHS, m_Add(m_Value(B), m_ConstantInt(Idx)))) {
- // If LHS is in the form of "Base + Index", then I is in the form of
- // "(Base + Index) * RHS".
- allocateCandidateAndFindBasis(B, Idx, RHS, I);
- } else {
- // Otherwise, at least try the form (LHS + 0) * RHS.
- allocateCandidateAndFindBasis(LHS, ConstantInt::get(ITy, 0), RHS, I);
- }
- // Swap LHS and RHS so that we also cover the cases where LHS is the
- // stride.
- if (LHS == RHS)
- break;
- std::swap(LHS, RHS);
- }
- }
+ // Only handle the canonical operand ordering.
+ if (match(LHS, m_Add(m_Value(B), m_ConstantInt(Idx)))) {
+ // If LHS is in the form of "Base + Index", then I is in the form of
+ // "(Base + Index) * RHS".
+ allocateCandidateAndFindBasis(Candidate::Mul, SE->getSCEV(B), Idx, RHS, I);
+ } else {
+ // Otherwise, at least try the form (LHS + 0) * RHS.
+ ConstantInt *Zero = ConstantInt::get(cast<IntegerType>(I->getType()), 0);
+ allocateCandidateAndFindBasis(Candidate::Mul, SE->getSCEV(LHS), Zero, RHS,
+ I);
+ }
+}
+
+void StraightLineStrengthReduce::allocateCandidateAndFindBasisForMul(
+ Instruction *I) {
+ // Try matching (B + i) * S.
+ // TODO: we could extend SLSR to float and vector types.
+ if (!isa<IntegerType>(I->getType()))
+ return;
+
+ Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
+ allocateCandidateAndFindBasisForMul(LHS, RHS, I);
+ if (LHS != RHS) {
+ // Symmetrically, try to split RHS into Base + Index.
+ allocateCandidateAndFindBasisForMul(RHS, LHS, I);
+ }
+}
+
+void StraightLineStrengthReduce::allocateCandidateAndFindBasisForGEP(
+ const SCEV *B, ConstantInt *Idx, Value *S, uint64_t ElementSize,
+ Instruction *I) {
+ // I = B + sext(Idx *nsw S) *nsw ElementSize
+ // = B + (sext(Idx) * ElementSize) * sext(S)
+ // Casting to IntegerType is safe because we skipped vector GEPs.
+ IntegerType *IntPtrTy = cast<IntegerType>(DL->getIntPtrType(I->getType()));
+ ConstantInt *ScaledIdx = ConstantInt::get(
+ IntPtrTy, Idx->getSExtValue() * (int64_t)ElementSize, true);
+ allocateCandidateAndFindBasis(Candidate::GEP, B, ScaledIdx, S, I);
+}
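// Illustrative numbers for the scaling above: with Idx = 3, stride S and
// ElementSize = 8, the address is B + (3 * S) * 8 bytes, so the candidate
// is recorded with ScaledIdx = 3 * 8 = 24, i.e. as B + 24 * S in byte
// units, keeping GEP candidates with different element sizes comparable.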
+
+void StraightLineStrengthReduce::factorArrayIndex(Value *ArrayIdx,
+ const SCEV *Base,
+ uint64_t ElementSize,
+ GetElementPtrInst *GEP) {
+ // At least, ArrayIdx = ArrayIdx *s 1.
+ allocateCandidateAndFindBasisForGEP(
+ Base, ConstantInt::get(cast<IntegerType>(ArrayIdx->getType()), 1),
+ ArrayIdx, ElementSize, GEP);
+ Value *LHS = nullptr;
+ ConstantInt *RHS = nullptr;
+ // TODO: handle shl. e.g., we could treat (S << 2) as (S * 4).
+ //
+ // One alternative is matching the SCEV of ArrayIdx instead of ArrayIdx
+ // itself. This would allow us to handle the shl case for free. However,
+ // matching SCEVs has two issues:
+ //
+ // 1. This would complicate rewriting because the rewriting procedure
+ // would have to translate SCEVs back to IR instructions. This translation
+ // is difficult when LHS is further evaluated to a composite SCEV.
+ //
+ // 2. ScalarEvolution is designed to be control-flow oblivious. It tends
+ // to strip nsw/nuw flags which are critical for SLSR to trace into
+ // sext'ed multiplication.
+ if (match(ArrayIdx, m_NSWMul(m_Value(LHS), m_ConstantInt(RHS)))) {
+ // SLSR is currently unsafe if i * S may overflow.
+ // GEP = Base + sext(LHS *nsw RHS) *nsw ElementSize
+ allocateCandidateAndFindBasisForGEP(Base, RHS, LHS, ElementSize, GEP);
+ }
+}
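// For example (hypothetical values), given
//   %idx = mul nsw i64 %k, 5
// this registers both factorings:
//   (Base, Idx = 1, S = %idx)   // the trivial ArrayIdx = ArrayIdx * 1
//   (Base, Idx = 5, S = %k)     // from matching LHS *nsw RHS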
+
+void StraightLineStrengthReduce::allocateCandidateAndFindBasisForGEP(
+ GetElementPtrInst *GEP) {
+ // TODO: handle vector GEPs
+ if (GEP->getType()->isVectorTy())
+ return;
+
+ const SCEV *GEPExpr = SE->getSCEV(GEP);
+ Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
+
+ gep_type_iterator GTI = gep_type_begin(GEP);
+ for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I) {
+ if (!isa<SequentialType>(*GTI++))
+ continue;
+ Value *ArrayIdx = *I;
+ // Compute the byte offset of this index.
+ uint64_t ElementSize = DL->getTypeAllocSize(*GTI);
+ const SCEV *ElementSizeExpr = SE->getSizeOfExpr(IntPtrTy, *GTI);
+ const SCEV *ArrayIdxExpr = SE->getSCEV(ArrayIdx);
+ ArrayIdxExpr = SE->getTruncateOrSignExtend(ArrayIdxExpr, IntPtrTy);
+ const SCEV *LocalOffset =
+ SE->getMulExpr(ArrayIdxExpr, ElementSizeExpr, SCEV::FlagNSW);
+ // The base of this candidate equals GEPExpr less the byte offset of this
+ // index.
+ const SCEV *Base = SE->getMinusSCEV(GEPExpr, LocalOffset);
+ factorArrayIndex(ArrayIdx, Base, ElementSize, GEP);
+ // When ArrayIdx is the sext of a value, we try to factor that value as
+ // well. Handling this case is important because array indices are
+ // typically sign-extended to the pointer size.
+ Value *TruncatedArrayIdx = nullptr;
+ if (match(ArrayIdx, m_SExt(m_Value(TruncatedArrayIdx))))
+ factorArrayIndex(TruncatedArrayIdx, Base, ElementSize, GEP);
}
}
+// A helper function that unifies the bitwidth of A and B.
+static void unifyBitWidth(APInt &A, APInt &B) {
+ if (A.getBitWidth() < B.getBitWidth())
+ A = A.sext(B.getBitWidth());
+ else if (A.getBitWidth() > B.getBitWidth())
+ B = B.sext(A.getBitWidth());
+}
+
+Value *StraightLineStrengthReduce::emitBump(const Candidate &Basis,
+ const Candidate &C,
+ IRBuilder<> &Builder,
+ const DataLayout *DL,
+ bool &BumpWithUglyGEP) {
+ APInt Idx = C.Index->getValue(), BasisIdx = Basis.Index->getValue();
+ unifyBitWidth(Idx, BasisIdx);
+ APInt IndexOffset = Idx - BasisIdx;
+
+ BumpWithUglyGEP = false;
+ if (Basis.CandidateKind == Candidate::GEP) {
+ APInt ElementSize(
+ IndexOffset.getBitWidth(),
+ DL->getTypeAllocSize(
+ cast<GetElementPtrInst>(Basis.Ins)->getType()->getElementType()));
+ APInt Q, R;
+ APInt::sdivrem(IndexOffset, ElementSize, Q, R);
+ if (R.getSExtValue() == 0)
+ IndexOffset = Q;
+ else
+ BumpWithUglyGEP = true;
+ }
+ // Compute Bump = C - Basis = (i' - i) * S.
+ // Common case 1: if (i' - i) is 1, Bump = S.
+ if (IndexOffset.getSExtValue() == 1)
+ return C.Stride;
+ // Common case 2: if (i' - i) is -1, Bump = -S.
+ if (IndexOffset.getSExtValue() == -1)
+ return Builder.CreateNeg(C.Stride);
+ // Otherwise, Bump = (i' - i) * sext/trunc(S).
+ ConstantInt *Delta = ConstantInt::get(Basis.Ins->getContext(), IndexOffset);
+ Value *ExtendedStride = Builder.CreateSExtOrTrunc(C.Stride, Delta->getType());
+ return Builder.CreateMul(ExtendedStride, Delta);
+}
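// Illustrative numbers: for a GEP basis over 4-byte elements with
// Basis.Index = 2 and C.Index = 14 (GEP indices are already in byte units),
// IndexOffset = 12 and sdivrem(12, 4) has remainder 0, so the bump is
// emitted as 3 typed elements; IndexOffset = 10 would leave a remainder
// and set BumpWithUglyGEP, forcing the byte-level i8* path in the caller.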
+
void StraightLineStrengthReduce::rewriteCandidateWithBasis(
const Candidate &C, const Candidate &Basis) {
+ assert(C.CandidateKind == Basis.CandidateKind && C.Base == Basis.Base &&
+ C.Stride == Basis.Stride);
+
// An instruction can correspond to multiple candidates. Therefore, instead of
// simply deleting an instruction when we rewrite it, we mark its parent as
// nullptr (i.e. unlink it) so that we can skip the candidates whose
// instruction is already rewritten.
if (!C.Ins->getParent())
return;
- assert(C.Base == Basis.Base && C.Stride == Basis.Stride);
- // Basis = (B + i) * S
- // C = (B + i') * S
- // ==>
- // C = Basis + (i' - i) * S
+
IRBuilder<> Builder(C.Ins);
- ConstantInt *IndexOffset = ConstantInt::get(
- C.Ins->getContext(), C.Index->getValue() - Basis.Index->getValue());
- Value *Reduced;
- // TODO: preserve nsw/nuw in some cases.
- if (IndexOffset->isOne()) {
- // If (i' - i) is 1, fold C into Basis + S.
- Reduced = Builder.CreateAdd(Basis.Ins, C.Stride);
- } else if (IndexOffset->isMinusOne()) {
- // If (i' - i) is -1, fold C into Basis - S.
- Reduced = Builder.CreateSub(Basis.Ins, C.Stride);
- } else {
- Value *Bump = Builder.CreateMul(C.Stride, IndexOffset);
+ bool BumpWithUglyGEP;
+ Value *Bump = emitBump(Basis, C, Builder, DL, BumpWithUglyGEP);
+ Value *Reduced = nullptr; // equivalent to but weaker than C.Ins
+ switch (C.CandidateKind) {
+ case Candidate::Mul:
Reduced = Builder.CreateAdd(Basis.Ins, Bump);
- }
+ break;
+ case Candidate::GEP:
+ {
+ Type *IntPtrTy = DL->getIntPtrType(C.Ins->getType());
+ if (BumpWithUglyGEP) {
+ // C = (char *)Basis + Bump
+ unsigned AS = Basis.Ins->getType()->getPointerAddressSpace();
+ Type *CharTy = Type::getInt8PtrTy(Basis.Ins->getContext(), AS);
+ Reduced = Builder.CreateBitCast(Basis.Ins, CharTy);
+ // We only considered inbounds GEPs as candidates.
+ Reduced = Builder.CreateInBoundsGEP(Reduced, Bump);
+ Reduced = Builder.CreateBitCast(Reduced, C.Ins->getType());
+ } else {
+ // C = gep Basis, Bump
+ // Canonicalize bump to pointer size.
+ Bump = Builder.CreateSExtOrTrunc(Bump, IntPtrTy);
+ Reduced = Builder.CreateInBoundsGEP(Basis.Ins, Bump);
+ }
+ }
+ break;
+ default:
+ llvm_unreachable("C.CandidateKind is invalid");
+ }
Reduced->takeName(C.Ins);
C.Ins->replaceAllUsesWith(Reduced);
C.Ins->dropAllReferences();
@@ -243,15 +504,15 @@ bool StraightLineStrengthReduce::runOnFunction(Function &F) {
if (skipOptnoneFunction(F))
return false;
+ TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ SE = &getAnalysis<ScalarEvolution>();
// Traverse the dominator tree in the depth-first order. This order makes sure
// all bases of a candidate are in Candidates when we process it.
for (auto node = GraphTraits<DominatorTree *>::nodes_begin(DT);
node != GraphTraits<DominatorTree *>::nodes_end(DT); ++node) {
- BasicBlock *B = node->getBlock();
- for (auto I = B->begin(); I != B->end(); ++I) {
- allocateCandidateAndFindBasis(I);
- }
+ for (auto &I : *node->getBlock())
+ allocateCandidateAndFindBasis(&I);
}
// Rewrite candidates in the reverse depth-first order. This order makes sure
diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp
index aaf6f9a..6c3ce58 100644
--- a/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -9,8 +9,8 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SCCIterator.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/RegionInfo.h"
#include "llvm/Analysis/RegionIterator.h"
@@ -18,6 +18,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
using namespace llvm;
diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 715ddeb..9eef132 100644
--- a/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -54,8 +54,8 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/Loads.h"
@@ -87,7 +87,6 @@ STATISTIC(NumAccumAdded, "Number of accumulators introduced");
namespace {
struct TailCallElim : public FunctionPass {
const TargetTransformInfo *TTI;
- const DataLayout *DL;
static char ID; // Pass identification, replacement for typeid
TailCallElim() : FunctionPass(ID) {
@@ -159,8 +158,6 @@ bool TailCallElim::runOnFunction(Function &F) {
if (skipOptnoneFunction(F))
return false;
- DL = F.getParent()->getDataLayout();
-
bool AllCallsAreTailCalls = false;
bool Modified = markTails(F, AllCallsAreTailCalls);
if (AllCallsAreTailCalls)
@@ -392,10 +389,9 @@ bool TailCallElim::runTRE(Function &F) {
SmallVector<PHINode*, 8> ArgumentPHIs;
bool MadeChange = false;
- // CanTRETailMarkedCall - If false, we cannot perform TRE on tail calls
- // marked with the 'tail' attribute, because doing so would cause the stack
- // size to increase (real TRE would deallocate variable sized allocas, TRE
- // doesn't).
+ // If false, we cannot perform TRE on tail calls marked with the 'tail'
+ // attribute, because doing so would cause the stack size to increase (real
+ // TRE would deallocate variable sized allocas, TRE doesn't).
bool CanTRETailMarkedCall = CanTRE(F);
// Change any tail recursive calls to loops.
@@ -404,28 +400,19 @@ bool TailCallElim::runTRE(Function &F) {
// alloca' is changed from being a static alloca to being a dynamic alloca.
// Until this is resolved, disable this transformation if that would ever
// happen. This bug is PR962.
- SmallVector<BasicBlock*, 8> BBToErase;
- for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+ for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; /*in loop*/) {
+ BasicBlock *BB = BBI++; // FoldReturnAndProcessPred may delete BB.
if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) {
bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
ArgumentPHIs, !CanTRETailMarkedCall);
- if (!Change && BB->getFirstNonPHIOrDbg() == Ret) {
+ if (!Change && BB->getFirstNonPHIOrDbg() == Ret)
Change = FoldReturnAndProcessPred(BB, Ret, OldEntry,
TailCallsAreMarkedTail, ArgumentPHIs,
!CanTRETailMarkedCall);
- // FoldReturnAndProcessPred may have emptied some BB. Remember to
- // erase them.
- if (Change && BB->empty())
- BBToErase.push_back(BB);
-
- }
MadeChange |= Change;
}
}
- for (auto BB: BBToErase)
- BB->eraseFromParent();
-
// If we eliminated any tail recursions, it's possible that we inserted some
// silly PHI nodes which just merge an initial value (the incoming operand)
// with themselves. Check to see if we did and clean up our mess if so. This
@@ -435,7 +422,7 @@ bool TailCallElim::runTRE(Function &F) {
PHINode *PN = ArgumentPHIs[i];
// If the PHI Node is a dynamic constant, replace it with the value it is.
- if (Value *PNV = SimplifyInstruction(PN)) {
+ if (Value *PNV = SimplifyInstruction(PN, F.getParent()->getDataLayout())) {
PN->replaceAllUsesWith(PNV);
PN->eraseFromParent();
}
@@ -445,7 +432,7 @@ bool TailCallElim::runTRE(Function &F) {
}
-/// CanMoveAboveCall - Return true if it is safe to move the specified
+/// Return true if it is safe to move the specified
/// instruction from after the call to before the call, assuming that all
/// instructions between the call and this instruction are movable.
///
@@ -464,7 +451,7 @@ bool TailCallElim::CanMoveAboveCall(Instruction *I, CallInst *CI) {
// being loaded from.
if (CI->mayWriteToMemory() ||
!isSafeToLoadUnconditionally(L->getPointerOperand(), L,
- L->getAlignment(), DL))
+ L->getAlignment()))
return false;
}
}
@@ -480,13 +467,11 @@ bool TailCallElim::CanMoveAboveCall(Instruction *I, CallInst *CI) {
return true;
}
-// isDynamicConstant - Return true if the specified value is the same when the
-// return would exit as it was when the initial iteration of the recursive
-// function was executed.
-//
-// We currently handle static constants and arguments that are not modified as
-// part of the recursion.
-//
+/// Return true if the specified value is the same when the return would exit
+/// as it was when the initial iteration of the recursive function was executed.
+///
+/// We currently handle static constants and arguments that are not modified as
+/// part of the recursion.
static bool isDynamicConstant(Value *V, CallInst *CI, ReturnInst *RI) {
if (isa<Constant>(V)) return true; // Static constants are always dyn consts
@@ -518,10 +503,9 @@ static bool isDynamicConstant(Value *V, CallInst *CI, ReturnInst *RI) {
return false;
}
-// getCommonReturnValue - Check to see if the function containing the specified
-// tail call consistently returns the same runtime-constant value at all exit
-// points except for IgnoreRI. If so, return the returned value.
-//
+/// Check to see if the function containing the specified tail call consistently
+/// returns the same runtime-constant value at all exit points except for
+/// IgnoreRI. If so, return the returned value.
static Value *getCommonReturnValue(ReturnInst *IgnoreRI, CallInst *CI) {
Function *F = CI->getParent()->getParent();
Value *ReturnedValue = nullptr;
@@ -545,10 +529,9 @@ static Value *getCommonReturnValue(ReturnInst *IgnoreRI, CallInst *CI) {
return ReturnedValue;
}
-/// CanTransformAccumulatorRecursion - If the specified instruction can be
-/// transformed using accumulator recursion elimination, return the constant
-/// which is the start of the accumulator value. Otherwise return null.
-///
+/// If the specified instruction can be transformed using accumulator recursion
+/// elimination, return the constant which is the start of the accumulator
+/// value. Otherwise return null.
Value *TailCallElim::CanTransformAccumulatorRecursion(Instruction *I,
CallInst *CI) {
if (!I->isAssociative() || !I->isCommutative()) return nullptr;
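A classic shape this transformation handles (illustrative C), since integer
multiplication is associative and commutative:

  long fact(long n) {
    if (n <= 1) return 1;
    return n * fact(n - 1);      // the mul becomes the accumulator op
  }

conceptually becomes

  long fact(long n) {
    long acc = 1;                // the start value this function returns
    for (; n > 1; --n) acc *= n;
    return acc;
  }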
@@ -836,14 +819,11 @@ bool TailCallElim::FoldReturnAndProcessPred(BasicBlock *BB,
ReturnInst *RI = FoldReturnIntoUncondBranch(Ret, BB, Pred);
// Cleanup: if all predecessors of BB have been eliminated by
- // FoldReturnIntoUncondBranch, we would like to delete it, but we
- // can not just nuke it as it is being used as an iterator by our caller.
- // Just empty it, and the caller will erase it when it is safe to do so.
- // It is important to empty it, because the ret instruction in there is
- // still using a value which EliminateRecursiveTailCall will attempt
- // to remove.
+ // FoldReturnIntoUncondBranch, delete it. It is important to do this now,
+ // because the ret instruction in there is still using a value which
+ // EliminateRecursiveTailCall will attempt to remove.
if (!BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB))
- BB->getInstList().clear();
+ BB->eraseFromParent();
EliminateRecursiveTailCall(CI, RI, OldEntry, TailCallsAreMarkedTail,
ArgumentPHIs,
diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp
index 762a83f..671cbfe 100644
--- a/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -33,7 +33,7 @@ Value *llvm::CastToCStr(Value *V, IRBuilder<> &B) {
/// EmitStrLen - Emit a call to the strlen function to the builder, for the
/// specified pointer. This always returns an integer value of size intptr_t.
-Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout *TD,
+Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout &DL,
const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::strlen))
return nullptr;
@@ -45,12 +45,9 @@ Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout *TD,
AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, AVs);
LLVMContext &Context = B.GetInsertBlock()->getContext();
- Constant *StrLen = M->getOrInsertFunction("strlen",
- AttributeSet::get(M->getContext(),
- AS),
- TD->getIntPtrType(Context),
- B.getInt8PtrTy(),
- nullptr);
+ Constant *StrLen = M->getOrInsertFunction(
+ "strlen", AttributeSet::get(M->getContext(), AS),
+ DL.getIntPtrType(Context), B.getInt8PtrTy(), nullptr);
CallInst *CI = B.CreateCall(StrLen, CastToCStr(Ptr, B), "strlen");
if (const Function *F = dyn_cast<Function>(StrLen->stripPointerCasts()))
CI->setCallingConv(F->getCallingConv());
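A minimal caller sketch of the new by-reference convention (assumed context:
CI is the CallInst being simplified, Str its string operand, TLI available):

  const DataLayout &DL = CI->getModule()->getDataLayout();
  IRBuilder<> B(CI);
  // Returns nullptr when the target has no strlen; the result is
  // intptr_t-sized per the contract above.
  if (Value *Len = EmitStrLen(Str, B, DL, TLI))
    B.CreateICmpEQ(Len, Constant::getNullValue(Len->getType()), "isempty");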
@@ -62,7 +59,7 @@ Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout *TD,
/// specified pointer. Ptr is required to be some pointer type, MaxLen must
/// be of size_t type, and the return value has 'intptr_t' type.
Value *llvm::EmitStrNLen(Value *Ptr, Value *MaxLen, IRBuilder<> &B,
- const DataLayout *TD, const TargetLibraryInfo *TLI) {
+ const DataLayout &DL, const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::strnlen))
return nullptr;
@@ -73,13 +70,10 @@ Value *llvm::EmitStrNLen(Value *Ptr, Value *MaxLen, IRBuilder<> &B,
AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, AVs);
LLVMContext &Context = B.GetInsertBlock()->getContext();
- Constant *StrNLen = M->getOrInsertFunction("strnlen",
- AttributeSet::get(M->getContext(),
- AS),
- TD->getIntPtrType(Context),
- B.getInt8PtrTy(),
- TD->getIntPtrType(Context),
- nullptr);
+ Constant *StrNLen =
+ M->getOrInsertFunction("strnlen", AttributeSet::get(M->getContext(), AS),
+ DL.getIntPtrType(Context), B.getInt8PtrTy(),
+ DL.getIntPtrType(Context), nullptr);
CallInst *CI = B.CreateCall2(StrNLen, CastToCStr(Ptr, B), MaxLen, "strnlen");
if (const Function *F = dyn_cast<Function>(StrNLen->stripPointerCasts()))
CI->setCallingConv(F->getCallingConv());
@@ -91,7 +85,7 @@ Value *llvm::EmitStrNLen(Value *Ptr, Value *MaxLen, IRBuilder<> &B,
/// specified pointer and character. Ptr is required to be some pointer type,
/// and the return value has 'i8*' type.
Value *llvm::EmitStrChr(Value *Ptr, char C, IRBuilder<> &B,
- const DataLayout *TD, const TargetLibraryInfo *TLI) {
+ const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::strchr))
return nullptr;
@@ -114,9 +108,8 @@ Value *llvm::EmitStrChr(Value *Ptr, char C, IRBuilder<> &B,
}
/// EmitStrNCmp - Emit a call to the strncmp function to the builder.
-Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len,
- IRBuilder<> &B, const DataLayout *TD,
- const TargetLibraryInfo *TLI) {
+Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
+ const DataLayout &DL, const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::strncmp))
return nullptr;
@@ -128,13 +121,9 @@ Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len,
AS[2] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, AVs);
LLVMContext &Context = B.GetInsertBlock()->getContext();
- Value *StrNCmp = M->getOrInsertFunction("strncmp",
- AttributeSet::get(M->getContext(),
- AS),
- B.getInt32Ty(),
- B.getInt8PtrTy(),
- B.getInt8PtrTy(),
- TD->getIntPtrType(Context), nullptr);
+ Value *StrNCmp = M->getOrInsertFunction(
+ "strncmp", AttributeSet::get(M->getContext(), AS), B.getInt32Ty(),
+ B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context), nullptr);
CallInst *CI = B.CreateCall3(StrNCmp, CastToCStr(Ptr1, B),
CastToCStr(Ptr2, B), Len, "strncmp");
@@ -147,8 +136,7 @@ Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len,
/// EmitStrCpy - Emit a call to the strcpy function to the builder, for the
/// specified pointer arguments.
Value *llvm::EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B,
- const DataLayout *TD, const TargetLibraryInfo *TLI,
- StringRef Name) {
+ const TargetLibraryInfo *TLI, StringRef Name) {
if (!TLI->has(LibFunc::strcpy))
return nullptr;
@@ -170,8 +158,7 @@ Value *llvm::EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B,
/// EmitStrNCpy - Emit a call to the strncpy function to the builder, for the
/// specified pointer arguments.
-Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len,
- IRBuilder<> &B, const DataLayout *TD,
+Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len, IRBuilder<> &B,
const TargetLibraryInfo *TLI, StringRef Name) {
if (!TLI->has(LibFunc::strncpy))
return nullptr;
@@ -198,7 +185,7 @@ Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len,
/// This expects that the Len and ObjSize have type 'intptr_t' and Dst/Src
/// are pointers.
Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
- IRBuilder<> &B, const DataLayout *TD,
+ IRBuilder<> &B, const DataLayout &DL,
const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::memcpy_chk))
return nullptr;
@@ -208,13 +195,10 @@ Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
AS = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
Attribute::NoUnwind);
LLVMContext &Context = B.GetInsertBlock()->getContext();
- Value *MemCpy = M->getOrInsertFunction("__memcpy_chk",
- AttributeSet::get(M->getContext(), AS),
- B.getInt8PtrTy(),
- B.getInt8PtrTy(),
- B.getInt8PtrTy(),
- TD->getIntPtrType(Context),
- TD->getIntPtrType(Context), nullptr);
+ Value *MemCpy = M->getOrInsertFunction(
+ "__memcpy_chk", AttributeSet::get(M->getContext(), AS), B.getInt8PtrTy(),
+ B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context),
+ DL.getIntPtrType(Context), nullptr);
Dst = CastToCStr(Dst, B);
Src = CastToCStr(Src, B);
CallInst *CI = B.CreateCall4(MemCpy, Dst, Src, Len, ObjSize);
@@ -225,9 +209,8 @@ Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
/// EmitMemChr - Emit a call to the memchr function. This assumes that Ptr is
/// a pointer, Val is an i32 value, and Len is an 'intptr_t' value.
-Value *llvm::EmitMemChr(Value *Ptr, Value *Val,
- Value *Len, IRBuilder<> &B, const DataLayout *TD,
- const TargetLibraryInfo *TLI) {
+Value *llvm::EmitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilder<> &B,
+ const DataLayout &DL, const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::memchr))
return nullptr;
@@ -236,13 +219,9 @@ Value *llvm::EmitMemChr(Value *Ptr, Value *Val,
Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
AS = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, AVs);
LLVMContext &Context = B.GetInsertBlock()->getContext();
- Value *MemChr = M->getOrInsertFunction("memchr",
- AttributeSet::get(M->getContext(), AS),
- B.getInt8PtrTy(),
- B.getInt8PtrTy(),
- B.getInt32Ty(),
- TD->getIntPtrType(Context),
- nullptr);
+ Value *MemChr = M->getOrInsertFunction(
+ "memchr", AttributeSet::get(M->getContext(), AS), B.getInt8PtrTy(),
+ B.getInt8PtrTy(), B.getInt32Ty(), DL.getIntPtrType(Context), nullptr);
CallInst *CI = B.CreateCall3(MemChr, CastToCStr(Ptr, B), Val, Len, "memchr");
if (const Function *F = dyn_cast<Function>(MemChr->stripPointerCasts()))
@@ -252,9 +231,8 @@ Value *llvm::EmitMemChr(Value *Ptr, Value *Val,
}
/// EmitMemCmp - Emit a call to the memcmp function.
-Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2,
- Value *Len, IRBuilder<> &B, const DataLayout *TD,
- const TargetLibraryInfo *TLI) {
+Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
+ const DataLayout &DL, const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::memcmp))
return nullptr;
@@ -266,12 +244,9 @@ Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2,
AS[2] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, AVs);
LLVMContext &Context = B.GetInsertBlock()->getContext();
- Value *MemCmp = M->getOrInsertFunction("memcmp",
- AttributeSet::get(M->getContext(), AS),
- B.getInt32Ty(),
- B.getInt8PtrTy(),
- B.getInt8PtrTy(),
- TD->getIntPtrType(Context), nullptr);
+ Value *MemCmp = M->getOrInsertFunction(
+ "memcmp", AttributeSet::get(M->getContext(), AS), B.getInt32Ty(),
+ B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context), nullptr);
CallInst *CI = B.CreateCall3(MemCmp, CastToCStr(Ptr1, B), CastToCStr(Ptr2, B),
Len, "memcmp");
@@ -339,7 +314,7 @@ Value *llvm::EmitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name,
/// EmitPutChar - Emit a call to the putchar function. This assumes that Char
/// is an integer.
-Value *llvm::EmitPutChar(Value *Char, IRBuilder<> &B, const DataLayout *TD,
+Value *llvm::EmitPutChar(Value *Char, IRBuilder<> &B,
const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::putchar))
return nullptr;
@@ -361,7 +336,7 @@ Value *llvm::EmitPutChar(Value *Char, IRBuilder<> &B, const DataLayout *TD,
/// EmitPutS - Emit a call to the puts function. This assumes that Str is
/// some pointer.
-Value *llvm::EmitPutS(Value *Str, IRBuilder<> &B, const DataLayout *TD,
+Value *llvm::EmitPutS(Value *Str, IRBuilder<> &B,
const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::puts))
return nullptr;
@@ -386,7 +361,7 @@ Value *llvm::EmitPutS(Value *Str, IRBuilder<> &B, const DataLayout *TD,
/// EmitFPutC - Emit a call to the fputc function. This assumes that Char is
/// an integer and File is a pointer to FILE.
Value *llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B,
- const DataLayout *TD, const TargetLibraryInfo *TLI) {
+ const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::fputc))
return nullptr;
@@ -419,7 +394,7 @@ Value *llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B,
/// EmitFPutS - Emit a call to the fputs function. Str is required to be a
/// pointer and File is a pointer to FILE.
Value *llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B,
- const DataLayout *TD, const TargetLibraryInfo *TLI) {
+ const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::fputs))
return nullptr;
@@ -450,9 +425,8 @@ Value *llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B,
/// EmitFWrite - Emit a call to the fwrite function. This assumes that Ptr is
/// a pointer, Size is an 'intptr_t', and File is a pointer to FILE.
-Value *llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File,
- IRBuilder<> &B, const DataLayout *TD,
- const TargetLibraryInfo *TLI) {
+Value *llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B,
+ const DataLayout &DL, const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::fwrite))
return nullptr;
@@ -466,21 +440,18 @@ Value *llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File,
StringRef FWriteName = TLI->getName(LibFunc::fwrite);
Constant *F;
if (File->getType()->isPointerTy())
- F = M->getOrInsertFunction(FWriteName,
- AttributeSet::get(M->getContext(), AS),
- TD->getIntPtrType(Context),
- B.getInt8PtrTy(),
- TD->getIntPtrType(Context),
- TD->getIntPtrType(Context),
- File->getType(), nullptr);
+ F = M->getOrInsertFunction(
+ FWriteName, AttributeSet::get(M->getContext(), AS),
+ DL.getIntPtrType(Context), B.getInt8PtrTy(), DL.getIntPtrType(Context),
+ DL.getIntPtrType(Context), File->getType(), nullptr);
else
- F = M->getOrInsertFunction(FWriteName, TD->getIntPtrType(Context),
- B.getInt8PtrTy(),
- TD->getIntPtrType(Context),
- TD->getIntPtrType(Context),
- File->getType(), nullptr);
- CallInst *CI = B.CreateCall4(F, CastToCStr(Ptr, B), Size,
- ConstantInt::get(TD->getIntPtrType(Context), 1), File);
+ F = M->getOrInsertFunction(FWriteName, DL.getIntPtrType(Context),
+ B.getInt8PtrTy(), DL.getIntPtrType(Context),
+ DL.getIntPtrType(Context), File->getType(),
+ nullptr);
+ CallInst *CI =
+ B.CreateCall4(F, CastToCStr(Ptr, B), Size,
+ ConstantInt::get(DL.getIntPtrType(Context), 1), File);
if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
CI->setCallingConv(Fn->getCallingConv());
diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp
index 09279b6..f04ea9c 100644
--- a/lib/Transforms/Utils/CloneFunction.cpp
+++ b/lib/Transforms/Utils/CloneFunction.cpp
@@ -34,7 +34,7 @@
#include <map>
using namespace llvm;
-// CloneBasicBlock - See comments in Cloning.h
+/// See comments in Cloning.h.
BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB,
ValueToValueMapTy &VMap,
const Twine &NameSuffix, Function *F,
@@ -202,7 +202,7 @@ static void CloneDebugInfoMetadata(Function *NewFunc, const Function *OldFunc,
}
}
-/// CloneFunction - Return a copy of the specified function, but without
+/// Return a copy of the specified function, but without
/// embedding the function into another module. Also, any references specified
/// in the VMap are changed to refer to their mapped value instead of the
/// original one. If any of the arguments to the function are in the VMap,
@@ -250,8 +250,7 @@ Function *llvm::CloneFunction(const Function *F, ValueToValueMapTy &VMap,
namespace {
- /// PruningFunctionCloner - This class is a private class used to implement
- /// the CloneAndPruneFunctionInto method.
+ /// This is a private class used to implement CloneAndPruneFunctionInto.
struct PruningFunctionCloner {
Function *NewFunc;
const Function *OldFunc;
@@ -259,23 +258,18 @@ namespace {
bool ModuleLevelChanges;
const char *NameSuffix;
ClonedCodeInfo *CodeInfo;
- const DataLayout *DL;
CloningDirector *Director;
ValueMapTypeRemapper *TypeMapper;
ValueMaterializer *Materializer;
public:
PruningFunctionCloner(Function *newFunc, const Function *oldFunc,
- ValueToValueMapTy &valueMap,
- bool moduleLevelChanges,
- const char *nameSuffix,
- ClonedCodeInfo *codeInfo,
- const DataLayout *DL,
+ ValueToValueMapTy &valueMap, bool moduleLevelChanges,
+ const char *nameSuffix, ClonedCodeInfo *codeInfo,
CloningDirector *Director)
- : NewFunc(newFunc), OldFunc(oldFunc),
- VMap(valueMap), ModuleLevelChanges(moduleLevelChanges),
- NameSuffix(nameSuffix), CodeInfo(codeInfo), DL(DL),
- Director(Director) {
+ : NewFunc(newFunc), OldFunc(oldFunc), VMap(valueMap),
+ ModuleLevelChanges(moduleLevelChanges), NameSuffix(nameSuffix),
+ CodeInfo(codeInfo), Director(Director) {
// These are optional components. The Director may return null.
if (Director) {
TypeMapper = Director->getTypeRemapper();
@@ -286,7 +280,7 @@ namespace {
}
}
- /// CloneBlock - The specified block is found to be reachable, clone it and
+ /// The specified block is found to be reachable, clone it and
/// anything that it can reach.
void CloneBlock(const BasicBlock *BB,
BasicBlock::const_iterator StartingInst,
@@ -294,7 +288,7 @@ namespace {
};
}
-/// CloneBlock - The specified block is found to be reachable, clone it and
+/// The specified block is found to be reachable, clone it and
/// anything that it can reach.
void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
BasicBlock::const_iterator StartingInst,
@@ -360,7 +354,8 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
// If we can simplify this instruction to some other value, simply add
// a mapping to that value rather than inserting a new instruction into
// the basic block.
- if (Value *V = SimplifyInstruction(NewInst, DL)) {
+ if (Value *V =
+ SimplifyInstruction(NewInst, BB->getModule()->getDataLayout())) {
// On the off-chance that this simplifies to an instruction in the old
// function, map it back into the new function.
if (Value *MappedV = VMap.lookup(V))
@@ -397,6 +392,14 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
// terminator into the new basic block in this case.
if (Action == CloningDirector::StopCloningBB)
return;
+ if (Action == CloningDirector::CloneSuccessors) {
+ // If the director says to skip with a terminate instruction, we still
+ // need to clone this block's successors.
+ const TerminatorInst *TI = BB->getTerminator();
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+ ToClone.push_back(TI->getSuccessor(i));
+ return;
+ }
assert(Action != CloningDirector::SkipInstruction &&
"SkipInstruction is not valid for terminators.");
}
@@ -455,10 +458,9 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
}
}
-/// CloneAndPruneIntoFromInst - This works like CloneAndPruneFunctionInto, except
-/// that it does not clone the entire function. Instead it starts at an
-/// instruction provided by the caller and copies (and prunes) only the code
-/// reachable from that instruction.
+/// This works like CloneAndPruneFunctionInto, except that it does not clone the
+/// entire function. Instead it starts at an instruction provided by the caller
+/// and copies (and prunes) only the code reachable from that instruction.
void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
const Instruction *StartingInst,
ValueToValueMapTy &VMap,
@@ -466,7 +468,6 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
SmallVectorImpl<ReturnInst *> &Returns,
const char *NameSuffix,
ClonedCodeInfo *CodeInfo,
- const DataLayout *DL,
CloningDirector *Director) {
assert(NameSuffix && "NameSuffix cannot be null!");
@@ -488,7 +489,7 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
#endif
PruningFunctionCloner PFC(NewFunc, OldFunc, VMap, ModuleLevelChanges,
- NameSuffix, CodeInfo, DL, Director);
+ NameSuffix, CodeInfo, Director);
const BasicBlock *StartingBB;
if (StartingInst)
StartingBB = StartingInst->getParent();
@@ -523,11 +524,18 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
// Handle PHI nodes specially, as we have to remove references to dead
// blocks.
- for (BasicBlock::const_iterator I = BI->begin(), E = BI->end(); I != E; ++I)
- if (const PHINode *PN = dyn_cast<PHINode>(I))
- PHIToResolve.push_back(PN);
- else
+ for (BasicBlock::const_iterator I = BI->begin(), E = BI->end(); I != E; ++I) {
+ // PHI nodes may have been remapped to non-PHI nodes by the caller or
+ // during the cloning process.
+ if (const PHINode *PN = dyn_cast<PHINode>(I)) {
+ if (isa<PHINode>(VMap[PN]))
+ PHIToResolve.push_back(PN);
+ else
+ break;
+ } else {
break;
+ }
+ }
// Finally, remap the terminator instructions, as those can't be remapped
// until all BBs are mapped.
@@ -626,10 +634,10 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
// node).
for (unsigned Idx = 0, Size = PHIToResolve.size(); Idx != Size; ++Idx)
if (PHINode *PN = dyn_cast<PHINode>(VMap[PHIToResolve[Idx]]))
- recursivelySimplifyInstruction(PN, DL);
+ recursivelySimplifyInstruction(PN);
// Now that the inlined function body has been fully constructed, go through
- // and zap unconditional fall-through branches. This happen all the time when
+ // and zap unconditional fall-through branches. This happens all the time when
// specializing code: code specialization turns conditional branches into
// uncond branches, and this code folds them.
Function::iterator Begin = cast<BasicBlock>(VMap[StartingBB]);
@@ -680,7 +688,7 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
// Do not increment I, iteratively merge all things this block branches to.
}
- // Make a final pass over the basic blocks from theh old function to gather
+ // Make a final pass over the basic blocks from the old function to gather
// any return instructions which survived folding. We have to do this here
// because we can iteratively remove and merge returns above.
for (Function::iterator I = cast<BasicBlock>(VMap[StartingBB]),
@@ -691,7 +699,7 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
}
-/// CloneAndPruneFunctionInto - This works exactly like CloneFunctionInto,
+/// This works exactly like CloneFunctionInto,
/// except that it does some simple constant prop and DCE on the fly. The
/// effect of this is to copy significantly less code in cases where (for
/// example) a function call with constant arguments is inlined, and those
@@ -704,9 +712,8 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
SmallVectorImpl<ReturnInst*> &Returns,
const char *NameSuffix,
ClonedCodeInfo *CodeInfo,
- const DataLayout *DL,
Instruction *TheCall) {
- CloneAndPruneIntoFromInst(NewFunc, OldFunc, OldFunc->front().begin(),
- VMap, ModuleLevelChanges, Returns, NameSuffix,
- CodeInfo, DL, nullptr);
+ CloneAndPruneIntoFromInst(NewFunc, OldFunc, OldFunc->front().begin(), VMap,
+ ModuleLevelChanges, Returns, NameSuffix, CodeInfo,
+ nullptr);
}
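The CloneFunction.cpp hunks above share one theme: DataLayout is no longer threaded through as a possibly-null pointer but fetched from the IR itself. A minimal sketch of the post-commit idiom, assuming the instruction lives in a block attached to a module (simplifyClonedInst is an illustrative name, not part of the patch):

    #include "llvm/Analysis/InstructionSimplify.h"
    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Instruction.h"
    #include "llvm/IR/Module.h"

    using namespace llvm;

    // Fetch the layout from the owning module instead of accepting a
    // DataLayout* parameter that might be null, mirroring the change made
    // in PruningFunctionCloner::CloneBlock above.
    static Value *simplifyClonedInst(Instruction *Inst) {
      const BasicBlock *BB = Inst->getParent();
      const DataLayout &DL = BB->getModule()->getDataLayout();
      return SimplifyInstruction(Inst, DL); // null if nothing simplified
    }

Because Module::getDataLayout returns a reference, the null checks that used to guard every use (see the deleted if (IFI.DL) branches later in this patch) simply disappear.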
diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp
index e70a7d6..ab89b41 100644
--- a/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/lib/Transforms/Utils/CodeExtractor.cpp
@@ -332,11 +332,11 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
DEBUG(dbgs() << **i << ", ");
DEBUG(dbgs() << ")\n");
+ StructType *StructTy;
if (AggregateArgs && (inputs.size() + outputs.size() > 0)) {
- PointerType *StructPtr =
- PointerType::getUnqual(StructType::get(M->getContext(), paramTy));
+ StructTy = StructType::get(M->getContext(), paramTy);
paramTy.clear();
- paramTy.push_back(StructPtr);
+ paramTy.push_back(PointerType::getUnqual(StructTy));
}
FunctionType *funcType =
FunctionType::get(RetTy, paramTy, false);
@@ -364,8 +364,8 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
Idx[0] = Constant::getNullValue(Type::getInt32Ty(header->getContext()));
Idx[1] = ConstantInt::get(Type::getInt32Ty(header->getContext()), i);
TerminatorInst *TI = newFunction->begin()->getTerminator();
- GetElementPtrInst *GEP =
- GetElementPtrInst::Create(AI, Idx, "gep_" + inputs[i]->getName(), TI);
+ GetElementPtrInst *GEP = GetElementPtrInst::Create(
+ StructTy, AI, Idx, "gep_" + inputs[i]->getName(), TI);
RewriteVal = new LoadInst(GEP, "loadgep_" + inputs[i]->getName(), TI);
} else
RewriteVal = AI++;
@@ -447,6 +447,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
}
}
+ StructType *StructArgTy = nullptr;
AllocaInst *Struct = nullptr;
if (AggregateArgs && (inputs.size() + outputs.size() > 0)) {
std::vector<Type*> ArgTypes;
@@ -455,7 +456,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
ArgTypes.push_back((*v)->getType());
// Allocate a struct at the beginning of this function
- Type *StructArgTy = StructType::get(newFunction->getContext(), ArgTypes);
+ StructArgTy = StructType::get(newFunction->getContext(), ArgTypes);
Struct =
new AllocaInst(StructArgTy, nullptr, "structArg",
codeReplacer->getParent()->begin()->begin());
@@ -465,9 +466,8 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
Value *Idx[2];
Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), i);
- GetElementPtrInst *GEP =
- GetElementPtrInst::Create(Struct, Idx,
- "gep_" + StructValues[i]->getName());
+ GetElementPtrInst *GEP = GetElementPtrInst::Create(
+ StructArgTy, Struct, Idx, "gep_" + StructValues[i]->getName());
codeReplacer->getInstList().push_back(GEP);
StoreInst *SI = new StoreInst(StructValues[i], GEP);
codeReplacer->getInstList().push_back(SI);
@@ -491,9 +491,8 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
Value *Idx[2];
Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), FirstOut + i);
- GetElementPtrInst *GEP
- = GetElementPtrInst::Create(Struct, Idx,
- "gep_reload_" + outputs[i]->getName());
+ GetElementPtrInst *GEP = GetElementPtrInst::Create(
+ StructArgTy, Struct, Idx, "gep_reload_" + outputs[i]->getName());
codeReplacer->getInstList().push_back(GEP);
Output = GEP;
} else {
@@ -606,10 +605,9 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
Idx[1] = ConstantInt::get(Type::getInt32Ty(Context),
FirstOut+out);
- GetElementPtrInst *GEP =
- GetElementPtrInst::Create(OAI, Idx,
- "gep_" + outputs[out]->getName(),
- NTRet);
+ GetElementPtrInst *GEP = GetElementPtrInst::Create(
+ StructArgTy, OAI, Idx, "gep_" + outputs[out]->getName(),
+ NTRet);
new StoreInst(outputs[out], GEP, NTRet);
} else {
new StoreInst(outputs[out], OAI, NTRet);
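The CodeExtractor hunks keep the aggregate's StructType in a variable because GetElementPtrInst::Create now takes the source element type explicitly instead of recovering it from the pointer operand's type. A small sketch of the new overload, with illustrative names (buildFieldGEP is not from the patch):

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/Instructions.h"

    using namespace llvm;

    // Build a GEP addressing field FieldNo of *StructPtr. The explicit
    // StructTy argument is what this commit threads through the extractor.
    static GetElementPtrInst *buildFieldGEP(StructType *StructTy,
                                            Value *StructPtr, unsigned FieldNo,
                                            Instruction *InsertBefore) {
      LLVMContext &Ctx = InsertBefore->getContext();
      Value *Idx[2] = {Constant::getNullValue(Type::getInt32Ty(Ctx)), // *ptr
                       ConstantInt::get(Type::getInt32Ty(Ctx), FieldNo)};
      return GetElementPtrInst::Create(StructTy, StructPtr, Idx, "gep_field",
                                       InsertBefore);
    }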
diff --git a/lib/Transforms/Utils/CtorUtils.cpp b/lib/Transforms/Utils/CtorUtils.cpp
index 26875e8..dc95089 100644
--- a/lib/Transforms/Utils/CtorUtils.cpp
+++ b/lib/Transforms/Utils/CtorUtils.cpp
@@ -11,14 +11,15 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/BitVector.h"
#include "llvm/Transforms/Utils/CtorUtils.h"
+#include "llvm/ADT/BitVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#define DEBUG_TYPE "ctor_utils"
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index c2ef1ac..df3e1d4 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -89,7 +89,7 @@ namespace {
CallerLPad = cast<LandingPadInst>(I);
}
- /// getOuterResumeDest - The outer unwind destination is the target of
+ /// The outer unwind destination is the target of
/// unwind edges introduced for calls within the inlined function.
BasicBlock *getOuterResumeDest() const {
return OuterResumeDest;
@@ -99,17 +99,16 @@ namespace {
LandingPadInst *getLandingPadInst() const { return CallerLPad; }
- /// forwardResume - Forward the 'resume' instruction to the caller's landing
- /// pad block. When the landing pad block has only one predecessor, this is
+ /// Forward the 'resume' instruction to the caller's landing pad block.
+ /// When the landing pad block has only one predecessor, this is
/// a simple branch. When there is more than one predecessor, we need to
/// split the landing pad block after the landingpad instruction and jump
/// to there.
void forwardResume(ResumeInst *RI,
SmallPtrSetImpl<LandingPadInst*> &InlinedLPads);
- /// addIncomingPHIValuesFor - Add incoming-PHI values to the unwind
- /// destination block for the given basic block, using the values for the
- /// original invoke's source block.
+ /// Add incoming-PHI values to the unwind destination block for the given
+ /// basic block, using the values for the original invoke's source block.
void addIncomingPHIValuesFor(BasicBlock *BB) const {
addIncomingPHIValuesForInto(BB, OuterResumeDest);
}
@@ -124,7 +123,7 @@ namespace {
};
}
-/// getInnerResumeDest - Get or create a target for the branch from ResumeInsts.
+/// Get or create a target for the branch from ResumeInsts.
BasicBlock *InvokeInliningInfo::getInnerResumeDest() {
if (InnerResumeDest) return InnerResumeDest;
@@ -159,8 +158,8 @@ BasicBlock *InvokeInliningInfo::getInnerResumeDest() {
return InnerResumeDest;
}
-/// forwardResume - Forward the 'resume' instruction to the caller's landing pad
-/// block. When the landing pad block has only one predecessor, this is a simple
+/// Forward the 'resume' instruction to the caller's landing pad block.
+/// When the landing pad block has only one predecessor, this is a simple
/// branch. When there is more than one predecessor, we need to split the
/// landing pad block after the landingpad instruction and jump to there.
void InvokeInliningInfo::forwardResume(ResumeInst *RI,
@@ -178,9 +177,9 @@ void InvokeInliningInfo::forwardResume(ResumeInst *RI,
RI->eraseFromParent();
}
-/// HandleCallsInBlockInlinedThroughInvoke - When we inline a basic block into
-/// an invoke, we have to turn all of the calls that can throw into
-/// invokes. This function analyze BB to see if there are any calls, and if so,
+/// When we inline a basic block into an invoke,
+/// we have to turn all of the calls that can throw into invokes.
+/// This function analyzes BB to see if there are any calls, and if so,
/// it rewrites them to be invokes that jump to InvokeDest and fills in the PHI
/// nodes in that block with the values specified in InvokeDestPHIValues.
static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB,
@@ -228,7 +227,7 @@ static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB,
}
}
-/// HandleInlinedInvoke - If we inlined an invoke site, we need to convert calls
+/// If we inlined an invoke site, we need to convert calls
/// in the body of the inlined function into invokes.
///
/// II is the invoke instruction being inlined. FirstNewBlock is the first
@@ -279,8 +278,8 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock,
InvokeDest->removePredecessor(II->getParent());
}
-/// CloneAliasScopeMetadata - When inlining a function that contains noalias
-/// scope metadata, this metadata needs to be cloned so that the inlined blocks
+/// When inlining a function that contains noalias scope metadata,
+/// this metadata needs to be cloned so that the inlined blocks
/// have different "unique scopes" at every call site. Were this not done, then
/// aliasing scopes from a function inlined into a caller multiple times could
/// not be differentiated (and this would lead to miscompiles because the
@@ -391,12 +390,12 @@ static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) {
}
}
-/// AddAliasScopeMetadata - If the inlined function has noalias arguments, then
-/// add new alias scopes for each noalias argument, tag the mapped noalias
+/// If the inlined function has noalias arguments,
+/// then add new alias scopes for each noalias argument, tag the mapped noalias
/// parameters with noalias metadata specifying the new scope, and tag all
/// non-derived loads, stores and memory intrinsics with the new alias scopes.
static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap,
- const DataLayout *DL, AliasAnalysis *AA) {
+ const DataLayout &DL, AliasAnalysis *AA) {
if (!EnableNoAliasConversion)
return;
@@ -622,8 +621,9 @@ static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap,
/// If the inlined function has non-byval align arguments, then
/// add @llvm.assume-based alignment assumptions to preserve this information.
static void AddAlignmentAssumptions(CallSite CS, InlineFunctionInfo &IFI) {
- if (!PreserveAlignmentAssumptions || !IFI.DL)
+ if (!PreserveAlignmentAssumptions)
return;
+ auto &DL = CS.getCaller()->getParent()->getDataLayout();
// To avoid inserting redundant assumptions, we should check for assumptions
// already in the caller. To do this, we might need a DT of the caller.
@@ -645,20 +645,20 @@ static void AddAlignmentAssumptions(CallSite CS, InlineFunctionInfo &IFI) {
// If we can already prove the asserted alignment in the context of the
// caller, then don't bother inserting the assumption.
Value *Arg = CS.getArgument(I->getArgNo());
- if (getKnownAlignment(Arg, IFI.DL,
+ if (getKnownAlignment(Arg, DL, CS.getInstruction(),
&IFI.ACT->getAssumptionCache(*CalledFunc),
- CS.getInstruction(), &DT) >= Align)
+ &DT) >= Align)
continue;
- IRBuilder<>(CS.getInstruction()).CreateAlignmentAssumption(*IFI.DL, Arg,
- Align);
+ IRBuilder<>(CS.getInstruction())
+ .CreateAlignmentAssumption(DL, Arg, Align);
}
}
}
-/// UpdateCallGraphAfterInlining - Once we have cloned code over from a callee
-/// into the caller, update the specified callgraph to reflect the changes we
-/// made. Note that it's possible that not all code was copied over, so only
+/// Once we have cloned code over from a callee into the caller,
+/// update the specified callgraph to reflect the changes we made.
+/// Note that it's possible that not all code was copied over, so only
/// some edges of the callgraph may remain.
static void UpdateCallGraphAfterInlining(CallSite CS,
Function::iterator FirstNewBlock,
@@ -693,8 +693,15 @@ static void UpdateCallGraphAfterInlining(CallSite CS,
// If the call was inlined, but then constant folded, there is no edge to
// add. Check for this case.
Instruction *NewCall = dyn_cast<Instruction>(VMI->second);
- if (!NewCall) continue;
+ if (!NewCall)
+ continue;
+ // We do not treat intrinsic calls like real function calls because we
+ // expect them to become inline code; do not add an edge for an intrinsic.
+ CallSite CS = CallSite(NewCall);
+ if (CS && CS.getCalledFunction() && CS.getCalledFunction()->isIntrinsic())
+ continue;
+
// Remember that this call site got inlined for the client of
// InlineFunction.
IFI.InlinedCalls.push_back(NewCall);
@@ -726,11 +733,7 @@ static void HandleByValArgumentInit(Value *Dst, Value *Src, Module *M,
Type *AggTy = cast<PointerType>(Src->getType())->getElementType();
IRBuilder<> Builder(InsertBlock->begin());
- Value *Size;
- if (IFI.DL == nullptr)
- Size = ConstantExpr::getSizeOf(AggTy);
- else
- Size = Builder.getInt64(IFI.DL->getTypeStoreSize(AggTy));
+ Value *Size = Builder.getInt64(M->getDataLayout().getTypeStoreSize(AggTy));
// Always generate a memcpy of alignment 1 here because we don't know
// the alignment of the src pointer. Other optimizations can infer
@@ -738,7 +741,7 @@ static void HandleByValArgumentInit(Value *Dst, Value *Src, Module *M,
Builder.CreateMemCpy(Dst, Src, Size, /*Align=*/1);
}
-/// HandleByValArgument - When inlining a call site that has a byval argument,
+/// When inlining a call site that has a byval argument,
/// we have to make the implicit memcpy explicit by adding it.
static Value *HandleByValArgument(Value *Arg, Instruction *TheCall,
const Function *CalledFunc,
@@ -759,11 +762,13 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall,
if (ByValAlignment <= 1) // 0 = unspecified, 1 = no particular alignment.
return Arg;
+ const DataLayout &DL = Caller->getParent()->getDataLayout();
+
// If the pointer is already known to be sufficiently aligned, or if we can
// round it up to a larger alignment, then we don't need a temporary.
- if (getOrEnforceKnownAlignment(Arg, ByValAlignment, IFI.DL,
- &IFI.ACT->getAssumptionCache(*Caller),
- TheCall) >= ByValAlignment)
+ if (getOrEnforceKnownAlignment(Arg, ByValAlignment, DL, TheCall,
+ &IFI.ACT->getAssumptionCache(*Caller)) >=
+ ByValAlignment)
return Arg;
// Otherwise, we have to make a memcpy to get a safe alignment. This is bad
@@ -771,10 +776,9 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall,
}
// Create the alloca. If we have DataLayout, use nice alignment.
- unsigned Align = 1;
- if (IFI.DL)
- Align = IFI.DL->getPrefTypeAlignment(AggTy);
-
+ unsigned Align =
+ Caller->getParent()->getDataLayout().getPrefTypeAlignment(AggTy);
+
// If the byval had an alignment specified, we *must* use at least that
// alignment, as it is required by the byval argument (and uses of the
// pointer inside the callee).
@@ -789,8 +793,7 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall,
return NewAlloca;
}
-// isUsedByLifetimeMarker - Check whether this Value is used by a lifetime
-// intrinsic.
+// Check whether this Value is used by a lifetime intrinsic.
static bool isUsedByLifetimeMarker(Value *V) {
for (User *U : V->users()) {
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
@@ -805,7 +808,7 @@ static bool isUsedByLifetimeMarker(Value *V) {
return false;
}
-// hasLifetimeMarkers - Check whether the given alloca already has
+// Check whether the given alloca already has
// lifetime.start or lifetime.end intrinsics.
static bool hasLifetimeMarkers(AllocaInst *AI) {
Type *Ty = AI->getType();
@@ -862,7 +865,7 @@ updateInlinedAtInfo(DebugLoc DL, MDLocation *InlinedAtNode,
return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(Ctx), Last);
}
-/// fixupLineNumbers - Update inlined instructions' line numbers to
+/// Update inlined instructions' line numbers
/// to encode location where these instructions are inlined.
static void fixupLineNumbers(Function *Fn, Function::iterator FI,
Instruction *TheCall) {
@@ -920,10 +923,9 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,
}
}
-/// InlineFunction - This function inlines the called function into the basic
-/// block of the caller. This returns false if it is not possible to inline
-/// this call. The program is still in a well defined state if this occurs
-/// though.
+/// This function inlines the called function into the basic block of the
+/// caller. This returns false if it is not possible to inline this call.
+/// The program is still in a well defined state if this occurs though.
///
/// Note that this only does one level of inlining. For example, if the
/// instruction 'call B' is inlined, and 'B' calls 'C', then the call to 'C' now
@@ -1008,6 +1010,8 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
// Keep a list of pair (dst, src) to emit byval initializations.
SmallVector<std::pair<Value*, Value*>, 4> ByValInit;
+ auto &DL = Caller->getParent()->getDataLayout();
+
assert(CalledFunc->arg_size() == CS.arg_size() &&
"No varargs calls can be inlined!");
@@ -1042,9 +1046,9 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
// have no dead or constant instructions leftover after inlining occurs
// (which can happen, e.g., because an argument was constant), but we'll be
// happy with whatever the cloner can do.
- CloneAndPruneFunctionInto(Caller, CalledFunc, VMap,
+ CloneAndPruneFunctionInto(Caller, CalledFunc, VMap,
/*ModuleLevelChanges=*/false, Returns, ".i",
- &InlinedFunctionInfo, IFI.DL, TheCall);
+ &InlinedFunctionInfo, TheCall);
// Remember the first block that is newly cloned over.
FirstNewBlock = LastBlock; ++FirstNewBlock;
@@ -1065,7 +1069,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
CloneAliasScopeMetadata(CS, VMap);
// Add noalias metadata if necessary.
- AddAliasScopeMetadata(CS, VMap, IFI.DL, IFI.AA);
+ AddAliasScopeMetadata(CS, VMap, DL, IFI.AA);
// FIXME: We could register any cloned assumptions instead of clearing the
// whole function's cache.
@@ -1173,18 +1177,17 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
ConstantInt *AllocaSize = nullptr;
if (ConstantInt *AIArraySize =
dyn_cast<ConstantInt>(AI->getArraySize())) {
- if (IFI.DL) {
- Type *AllocaType = AI->getAllocatedType();
- uint64_t AllocaTypeSize = IFI.DL->getTypeAllocSize(AllocaType);
- uint64_t AllocaArraySize = AIArraySize->getLimitedValue();
- assert(AllocaArraySize > 0 && "array size of AllocaInst is zero");
- // Check that array size doesn't saturate uint64_t and doesn't
- // overflow when it's multiplied by type size.
- if (AllocaArraySize != ~0ULL &&
- UINT64_MAX / AllocaArraySize >= AllocaTypeSize) {
- AllocaSize = ConstantInt::get(Type::getInt64Ty(AI->getContext()),
- AllocaArraySize * AllocaTypeSize);
- }
+ auto &DL = Caller->getParent()->getDataLayout();
+ Type *AllocaType = AI->getAllocatedType();
+ uint64_t AllocaTypeSize = DL.getTypeAllocSize(AllocaType);
+ uint64_t AllocaArraySize = AIArraySize->getLimitedValue();
+ assert(AllocaArraySize > 0 && "array size of AllocaInst is zero");
+ // Check that array size doesn't saturate uint64_t and doesn't
+ // overflow when it's multiplied by type size.
+ if (AllocaArraySize != ~0ULL &&
+ UINT64_MAX / AllocaArraySize >= AllocaTypeSize) {
+ AllocaSize = ConstantInt::get(Type::getInt64Ty(AI->getContext()),
+ AllocaArraySize * AllocaTypeSize);
}
}
@@ -1445,7 +1448,8 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
// the entries are the same or undef). If so, remove the PHI so it doesn't
// block other optimizations.
if (PHI) {
- if (Value *V = SimplifyInstruction(PHI, IFI.DL, nullptr, nullptr,
+ auto &DL = Caller->getParent()->getDataLayout();
+ if (Value *V = SimplifyInstruction(PHI, DL, nullptr, nullptr,
&IFI.ACT->getAssumptionCache(*Caller))) {
PHI->replaceAllUsesWith(V);
PHI->eraseFromParent();
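One behavioral detail in the lifetime-marker hunk above is worth calling out: the alloca's byte size is ArraySize * TypeSize, and the multiplication is only performed after proving it cannot wrap uint64_t (and after excluding ~0ULL, the saturation value returned by getLimitedValue). The same guard in isolation, as plain C++ independent of LLVM:

    #include <cstdint>

    // Compute ArraySize * TypeSize into Size, refusing saturated or
    // overflowing inputs -- the exact check used in the inliner hunk above.
    static bool getAllocaSizeInBytes(uint64_t ArraySize, uint64_t TypeSize,
                                     uint64_t &Size) {
      if (ArraySize == ~0ULL) // getLimitedValue() saturated: size unknown
        return false;
      // a * b wraps uint64_t exactly when b > UINT64_MAX / a (for a != 0).
      if (ArraySize == 0 || UINT64_MAX / ArraySize < TypeSize)
        return false;
      Size = ArraySize * TypeSize;
      return true;
    }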
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index 4830568..bd15f9e 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -17,8 +17,8 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/LibCallSemantics.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LibCallSemantics.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CFG.h"
@@ -417,7 +417,7 @@ bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN,
///
/// This returns true if it changed the code, note that it can delete
/// instructions in other blocks as well in this block.
-bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const DataLayout *TD,
+bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB,
const TargetLibraryInfo *TLI) {
bool MadeChange = false;
@@ -434,7 +434,7 @@ bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const DataLayout *TD,
Instruction *Inst = BI++;
WeakVH BIHandle(BI);
- if (recursivelySimplifyInstruction(Inst, TD, TLI)) {
+ if (recursivelySimplifyInstruction(Inst, TLI)) {
MadeChange = true;
if (BIHandle != BI)
BI = BB->begin();
@@ -464,8 +464,7 @@ bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const DataLayout *TD,
///
/// .. and delete the predecessor corresponding to the '1', this will attempt to
/// recursively fold the and to 0.
-void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred,
- DataLayout *TD) {
+void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred) {
// This only adjusts blocks with PHI nodes.
if (!isa<PHINode>(BB->begin()))
return;
@@ -480,7 +479,7 @@ void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred,
PhiIt = &*++BasicBlock::iterator(cast<Instruction>(PhiIt));
Value *OldPhiIt = PhiIt;
- if (!recursivelySimplifyInstruction(PN, TD))
+ if (!recursivelySimplifyInstruction(PN))
continue;
// If recursive simplification ended up deleting the next PHI node we would
@@ -900,13 +899,14 @@ bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) {
/// their preferred alignment from the beginning.
///
static unsigned enforceKnownAlignment(Value *V, unsigned Align,
- unsigned PrefAlign, const DataLayout *TD) {
+ unsigned PrefAlign,
+ const DataLayout &DL) {
V = V->stripPointerCasts();
if (AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
// If the preferred alignment is greater than the natural stack alignment
// then don't round up. This avoids dynamic stack realignment.
- if (TD && TD->exceedsNaturalStackAlignment(PrefAlign))
+ if (DL.exceedsNaturalStackAlignment(PrefAlign))
return Align;
// If there is a requested alignment and if this is an alloca, round up.
if (AI->getAlignment() >= PrefAlign)
@@ -945,13 +945,13 @@ static unsigned enforceKnownAlignment(Value *V, unsigned Align,
/// and it is more than the alignment of the ultimate object, see if we can
/// increase the alignment of the ultimate object, making this check succeed.
unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign,
- const DataLayout *DL,
- AssumptionCache *AC,
+ const DataLayout &DL,
const Instruction *CxtI,
+ AssumptionCache *AC,
const DominatorTree *DT) {
assert(V->getType()->isPointerTy() &&
"getOrEnforceKnownAlignment expects a pointer!");
- unsigned BitWidth = DL ? DL->getPointerTypeSizeInBits(V->getType()) : 64;
+ unsigned BitWidth = DL.getPointerTypeSizeInBits(V->getType());
APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
computeKnownBits(V, KnownZero, KnownOne, DL, 0, AC, CxtI, DT);
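The Local.cpp hunks also reorder getOrEnforceKnownAlignment's parameters: the DataLayout reference now comes right after the preferred alignment, followed by the context instruction and only then the optional analyses. A sketch of a call under the new signature (the helper name and the 16-byte figure are illustrative, not from the patch):

    #include "llvm/Analysis/AssumptionCache.h"
    #include "llvm/IR/Instruction.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/Utils/Local.h"

    using namespace llvm;

    // Query (or raise) the known alignment of Ptr at CxtI, in post-commit
    // order: value, preferred alignment, DataLayout&, context, caches.
    static unsigned alignTo16(Value *Ptr, Instruction *CxtI,
                              AssumptionCache *AC) {
      const DataLayout &DL =
          CxtI->getParent()->getModule()->getDataLayout();
      return getOrEnforceKnownAlignment(Ptr, /*PrefAlign=*/16, DL, CxtI, AC);
    }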
diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp
index a0f8268..90dfaba 100644
--- a/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/lib/Transforms/Utils/LoopSimplify.cpp
@@ -57,8 +57,10 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
@@ -209,10 +211,11 @@ static void addBlockAndPredsToSet(BasicBlock *InputBB, BasicBlock *StopBlock,
static PHINode *findPHIToPartitionLoops(Loop *L, AliasAnalysis *AA,
DominatorTree *DT,
AssumptionCache *AC) {
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ) {
PHINode *PN = cast<PHINode>(I);
++I;
- if (Value *V = SimplifyInstruction(PN, nullptr, nullptr, DT, AC)) {
+ if (Value *V = SimplifyInstruction(PN, DL, nullptr, DT, AC)) {
// This is a degenerate PHI already, don't modify it!
PN->replaceAllUsesWith(V);
if (AA) AA->deleteValue(PN);
@@ -476,7 +479,7 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,
/// explicit if they accepted the analysis directly and then updated it.
static bool simplifyOneLoop(Loop *L, SmallVectorImpl<Loop *> &Worklist,
AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI,
- ScalarEvolution *SE, Pass *PP, const DataLayout *DL,
+ ScalarEvolution *SE, Pass *PP,
AssumptionCache *AC) {
bool Changed = false;
ReprocessLoop:
@@ -608,13 +611,15 @@ ReprocessLoop:
}
}
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+
// Scan over the PHI nodes in the loop header. Since they now have only two
// incoming values (the loop is canonicalized), we may have simplified the PHI
// down to 'X = phi [X, Y]', which should be replaced with 'Y'.
PHINode *PN;
for (BasicBlock::iterator I = L->getHeader()->begin();
(PN = dyn_cast<PHINode>(I++)); )
- if (Value *V = SimplifyInstruction(PN, nullptr, nullptr, DT, AC)) {
+ if (Value *V = SimplifyInstruction(PN, DL, nullptr, DT, AC)) {
if (AA) AA->deleteValue(PN);
if (SE) SE->forgetValue(PN);
PN->replaceAllUsesWith(V);
@@ -676,7 +681,8 @@ ReprocessLoop:
// The block has now been cleared of all instructions except for
// a comparison and a conditional branch. SimplifyCFG may be able
// to fold it now.
- if (!FoldBranchToCommonDest(BI, DL)) continue;
+ if (!FoldBranchToCommonDest(BI))
+ continue;
// Success. The block is now dead, so remove it from the loop,
// update the dominator tree and delete it.
@@ -714,7 +720,7 @@ ReprocessLoop:
bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, Pass *PP,
AliasAnalysis *AA, ScalarEvolution *SE,
- const DataLayout *DL, AssumptionCache *AC) {
+ AssumptionCache *AC) {
bool Changed = false;
// Worklist maintains our depth-first queue of loops in this nest to process.
@@ -731,7 +737,7 @@ bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, Pass *PP,
while (!Worklist.empty())
Changed |= simplifyOneLoop(Worklist.pop_back_val(), Worklist, AA, DT, LI,
- SE, PP, DL, AC);
+ SE, PP, AC);
return Changed;
}
@@ -749,7 +755,6 @@ namespace {
DominatorTree *DT;
LoopInfo *LI;
ScalarEvolution *SE;
- const DataLayout *DL;
AssumptionCache *AC;
bool runOnFunction(Function &F) override;
@@ -797,13 +802,11 @@ bool LoopSimplify::runOnFunction(Function &F) {
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
SE = getAnalysisIfAvailable<ScalarEvolution>();
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- DL = DLP ? &DLP->getDataLayout() : nullptr;
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
// Simplify each loop nest in the function.
for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I)
- Changed |= simplifyLoop(*I, DT, LI, this, AA, SE, DL, AC);
+ Changed |= simplifyLoop(*I, DT, LI, this, AA, SE, AC);
return Changed;
}
diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp
index accb731..6b3aa02 100644
--- a/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/lib/Transforms/Utils/LoopUnroll.cpp
@@ -26,8 +26,8 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Dominators.h"
#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -500,6 +500,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
// At this point, the code is well formed. We now do a quick sweep over the
// inserted code, doing constant propagation and dead code elimination as we
// go.
+ const DataLayout &DL = Header->getModule()->getDataLayout();
const std::vector<BasicBlock*> &NewLoopBlocks = L->getBlocks();
for (std::vector<BasicBlock*>::const_iterator BB = NewLoopBlocks.begin(),
BBE = NewLoopBlocks.end(); BB != BBE; ++BB)
@@ -508,7 +509,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
if (isInstructionTriviallyDead(Inst))
(*BB)->getInstList().erase(Inst);
- else if (Value *V = SimplifyInstruction(Inst))
+ else if (Value *V = SimplifyInstruction(Inst, DL))
if (LI->replacementPreservesLCSSAForm(Inst, V)) {
Inst->replaceAllUsesWith(V);
(*BB)->getInstList().erase(Inst);
@@ -531,9 +532,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
if (!OuterL && !CompletelyUnroll)
OuterL = L;
if (OuterL) {
- DataLayoutPass *DLP = PP->getAnalysisIfAvailable<DataLayoutPass>();
- const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
- simplifyLoop(OuterL, DT, LI, PP, /*AliasAnalysis*/ nullptr, SE, DL, AC);
+ simplifyLoop(OuterL, DT, LI, PP, /*AliasAnalysis*/ nullptr, SE, AC);
// LCSSA must be performed on the outermost affected loop. The unrolled
// loop's last loop latch is guaranteed to be in the outermost loop after
diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 91b688c..381d8fc 100644
--- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -31,6 +31,7 @@
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
@@ -339,10 +340,11 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI,
BasicBlock *PEnd = SplitEdge(PH, Header, DT, LI);
BasicBlock *NewPH = SplitBlock(PEnd, PEnd->getTerminator(), DT, LI);
BranchInst *PreHeaderBR = cast<BranchInst>(PH->getTerminator());
+ const DataLayout &DL = Header->getModule()->getDataLayout();
// Compute the number of extra iterations required, which is:
// extra iterations = run-time trip count % (loop unroll factor + 1)
- SCEVExpander Expander(*SE, "loop-unroll");
+ SCEVExpander Expander(*SE, DL, "loop-unroll");
Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(),
PreHeaderBR);
Value *BECount = Expander.expandCodeFor(BECountSC, BECountSC->getType(),
diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp
index b3bdae4..e0e0e90 100644
--- a/lib/Transforms/Utils/LowerSwitch.cpp
+++ b/lib/Transforms/Utils/LowerSwitch.cpp
@@ -14,17 +14,17 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/CFG.h"
#include "llvm/Pass.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
#include <algorithm>
using namespace llvm;
@@ -175,11 +175,16 @@ static void fixPhis(BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB,
// Remove additional occurrences coming from condensed cases and keep the
// number of incoming values equal to the number of branches to SuccBB.
+ SmallVector<unsigned, 8> Indices;
for (++Idx; LocalNumMergedCases > 0 && Idx < E; ++Idx)
if (PN->getIncomingBlock(Idx) == OrigBB) {
- PN->removeIncomingValue(Idx);
+ Indices.push_back(Idx);
LocalNumMergedCases--;
}
+ // Remove incoming values in the reverse order to prevent invalidating
+ // *successive* indices.
+ for (auto III = Indices.rbegin(), IIE = Indices.rend(); III != IIE; ++III)
+ PN->removeIncomingValue(*III);
}
}
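The fixPhis change works because PHINode::removeIncomingValue compacts the operand list, so removing entry k shifts every entry above k down by one; erasing the collected indices back-to-front keeps each recorded index valid. The same pattern in miniature, on a plain vector:

    #include <algorithm>
    #include <vector>

    // Erase the elements of V at the recorded positions. Walking the index
    // list in reverse means earlier indices are still valid when reached --
    // the same reason fixPhis drains its Indices vector with rbegin/rend.
    static void eraseAtIndices(std::vector<int> &V,
                               std::vector<unsigned> Indices) {
      std::sort(Indices.begin(), Indices.end());
      for (auto I = Indices.rbegin(), E = Indices.rend(); I != E; ++I)
        V.erase(V.begin() + *I);
    }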
diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index dabadb7..4b34b19 100644
--- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -45,6 +45,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <queue>
@@ -667,6 +668,8 @@ void PromoteMem2Reg::run() {
A->eraseFromParent();
}
+ const DataLayout &DL = F.getParent()->getDataLayout();
+
// Remove alloca's dbg.declare intrinsics from the function.
for (unsigned i = 0, e = AllocaDbgDeclares.size(); i != e; ++i)
if (DbgDeclareInst *DDI = AllocaDbgDeclares[i])
@@ -691,7 +694,7 @@ void PromoteMem2Reg::run() {
PHINode *PN = I->second;
// If this PHI node merges one value and/or undefs, get the value.
- if (Value *V = SimplifyInstruction(PN, nullptr, nullptr, &DT, AC)) {
+ if (Value *V = SimplifyInstruction(PN, DL, nullptr, &DT, AC)) {
if (AST && PN->getType()->isPointerTy())
AST->deleteValue(PN);
PN->replaceAllUsesWith(V);
diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp
index c057b06..955ce30 100644
--- a/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/lib/Transforms/Utils/SSAUpdater.cpp
@@ -19,6 +19,7 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -155,7 +156,8 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) {
// See if the PHI node can be merged to a single value. This can happen in
// loop cases when we get a PHI of itself and one other value.
- if (Value *V = SimplifyInstruction(InsertedPHI)) {
+ if (Value *V =
+ SimplifyInstruction(InsertedPHI, BB->getModule()->getDataLayout())) {
InsertedPHI->eraseFromParent();
return V;
}
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index 3248a83..c7c0ca6 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -110,8 +110,8 @@ namespace {
class SimplifyCFGOpt {
const TargetTransformInfo &TTI;
+ const DataLayout &DL;
unsigned BonusInstThreshold;
- const DataLayout *const DL;
AssumptionCache *AC;
Value *isValueEqualityComparison(TerminatorInst *TI);
BasicBlock *GetValueEqualityComparisonCases(TerminatorInst *TI,
@@ -131,9 +131,9 @@ class SimplifyCFGOpt {
bool SimplifyCondBranch(BranchInst *BI, IRBuilder <>&Builder);
public:
- SimplifyCFGOpt(const TargetTransformInfo &TTI, unsigned BonusInstThreshold,
- const DataLayout *DL, AssumptionCache *AC)
- : TTI(TTI), BonusInstThreshold(BonusInstThreshold), DL(DL), AC(AC) {}
+ SimplifyCFGOpt(const TargetTransformInfo &TTI, const DataLayout &DL,
+ unsigned BonusInstThreshold, AssumptionCache *AC)
+ : TTI(TTI), DL(DL), BonusInstThreshold(BonusInstThreshold), AC(AC) {}
bool run(BasicBlock *BB);
};
}
@@ -223,9 +223,9 @@ static void AddPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred,
/// given instruction, which is assumed to be safe to speculate. TCC_Free means
/// cheap, TCC_Basic means less cheap, and TCC_Expensive means prohibitively
/// expensive.
-static unsigned ComputeSpeculationCost(const User *I, const DataLayout *DL,
+static unsigned ComputeSpeculationCost(const User *I,
const TargetTransformInfo &TTI) {
- assert(isSafeToSpeculativelyExecute(I, DL) &&
+ assert(isSafeToSpeculativelyExecute(I) &&
"Instruction is not safe to speculatively execute!");
return TTI.getUserCost(I);
}
@@ -249,7 +249,6 @@ static unsigned ComputeSpeculationCost(const User *I, const DataLayout *DL,
static bool DominatesMergePoint(Value *V, BasicBlock *BB,
SmallPtrSetImpl<Instruction*> *AggressiveInsts,
unsigned &CostRemaining,
- const DataLayout *DL,
const TargetTransformInfo &TTI) {
Instruction *I = dyn_cast<Instruction>(V);
if (!I) {
@@ -283,10 +282,10 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB,
// Okay, it looks like the instruction IS in the "condition". Check to
// see if it's a cheap instruction to unconditionally compute, and if it
// only uses stuff defined outside of the condition. If so, hoist it out.
- if (!isSafeToSpeculativelyExecute(I, DL))
+ if (!isSafeToSpeculativelyExecute(I))
return false;
- unsigned Cost = ComputeSpeculationCost(I, DL, TTI);
+ unsigned Cost = ComputeSpeculationCost(I, TTI);
if (Cost > CostRemaining)
return false;
@@ -296,7 +295,7 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB,
// Okay, we can only really hoist these out if their operands do
// not take us over the cost threshold.
for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i)
- if (!DominatesMergePoint(*i, BB, AggressiveInsts, CostRemaining, DL, TTI))
+ if (!DominatesMergePoint(*i, BB, AggressiveInsts, CostRemaining, TTI))
return false;
// Okay, it's safe to do this! Remember this instruction.
AggressiveInsts->insert(I);
@@ -305,15 +304,15 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB,
/// GetConstantInt - Extract ConstantInt from value, looking through IntToPtr
/// and PointerNullValue. Return NULL if value is not a constant int.
-static ConstantInt *GetConstantInt(Value *V, const DataLayout *DL) {
+static ConstantInt *GetConstantInt(Value *V, const DataLayout &DL) {
// Normal constant int.
ConstantInt *CI = dyn_cast<ConstantInt>(V);
- if (CI || !DL || !isa<Constant>(V) || !V->getType()->isPointerTy())
+ if (CI || !isa<Constant>(V) || !V->getType()->isPointerTy())
return CI;
// This is some kind of pointer constant. Turn it into a pointer-sized
// ConstantInt if possible.
- IntegerType *PtrTy = cast<IntegerType>(DL->getIntPtrType(V->getType()));
+ IntegerType *PtrTy = cast<IntegerType>(DL.getIntPtrType(V->getType()));
// Null pointer means 0, see SelectionDAGBuilder::getValue(const Value*).
if (isa<ConstantPointerNull>(V))
@@ -346,16 +345,16 @@ namespace {
/// while for a chain of '&&' it will build the set elements that make the test
/// fail.
struct ConstantComparesGatherer {
-
+ const DataLayout &DL;
Value *CompValue; /// Value found for the switch comparison
Value *Extra; /// Extra clause to be checked before the switch
SmallVector<ConstantInt *, 8> Vals; /// Set of integers to match in switch
unsigned UsedICmps; /// Number of comparisons matched in the and/or chain
/// Construct and compute the result for the comparison instruction Cond
- ConstantComparesGatherer(Instruction *Cond, const DataLayout *DL)
- : CompValue(nullptr), Extra(nullptr), UsedICmps(0) {
- gather(Cond, DL);
+ ConstantComparesGatherer(Instruction *Cond, const DataLayout &DL)
+ : DL(DL), CompValue(nullptr), Extra(nullptr), UsedICmps(0) {
+ gather(Cond);
}
/// Prevent copy
@@ -380,7 +379,7 @@ private:
/// against is placed in CompValue.
/// If CompValue is already set, the function is expected to fail if a match
/// is found but the value compared to is different.
- bool matchInstruction(Instruction *I, const DataLayout *DL, bool isEQ) {
+ bool matchInstruction(Instruction *I, bool isEQ) {
// If this is an icmp against a constant, handle this as one of the cases.
ICmpInst *ICI;
ConstantInt *C;
@@ -422,8 +421,8 @@ private:
}
// If we have "x ult 3", for example, then we can add 0,1,2 to the set.
- ConstantRange Span = ConstantRange::makeICmpRegion(ICI->getPredicate(),
- C->getValue());
+ ConstantRange Span = ConstantRange::makeAllowedICmpRegion(
+ ICI->getPredicate(), C->getValue());
// Shift the range if the compare is fed by an add. This is the range
// compare idiom as emitted by instcombine.
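For intuition: makeAllowedICmpRegion returns the set of left-hand values for which the predicate can hold, so "x ult 3" yields the half-open range [0, 3), i.e. exactly the constants 0, 1 and 2 that the gatherer feeds into the switch set. A minimal sketch, assuming 32-bit operands (the helper name is illustrative):

    #include "llvm/ADT/APInt.h"
    #include "llvm/IR/ConstantRange.h"
    #include "llvm/IR/InstrTypes.h"

    using namespace llvm;

    // "x ult 3" -> allowed region [0, 3). The APInt converts implicitly to
    // a single-element ConstantRange, just as C->getValue() does above.
    static ConstantRange ultThreeRegion() {
      APInt Three(/*numBits=*/32, /*val=*/3);
      return ConstantRange::makeAllowedICmpRegion(CmpInst::ICMP_ULT, Three);
    }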
@@ -462,7 +461,7 @@ private:
/// the value being compared, and stick the list constants into the Vals
/// vector.
/// One "Extra" case is allowed to differ from the other.
- void gather(Value *V, const DataLayout *DL) {
+ void gather(Value *V) {
Instruction *I = dyn_cast<Instruction>(V);
bool isEQ = (I->getOpcode() == Instruction::Or);
@@ -484,7 +483,7 @@ private:
}
// Try to match the current instruction
- if (matchInstruction(I, DL, isEQ))
+ if (matchInstruction(I, isEQ))
// Match succeed, continue the loop
continue;
}
@@ -532,15 +531,16 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) {
CV = SI->getCondition();
} else if (BranchInst *BI = dyn_cast<BranchInst>(TI))
if (BI->isConditional() && BI->getCondition()->hasOneUse())
- if (ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition()))
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition())) {
if (ICI->isEquality() && GetConstantInt(ICI->getOperand(1), DL))
CV = ICI->getOperand(0);
+ }
// Unwrap any lossless ptrtoint cast.
- if (DL && CV) {
+ if (CV) {
if (PtrToIntInst *PTII = dyn_cast<PtrToIntInst>(CV)) {
Value *Ptr = PTII->getPointerOperand();
- if (PTII->getType() == DL->getIntPtrType(Ptr->getType()))
+ if (PTII->getType() == DL.getIntPtrType(Ptr->getType()))
CV = Ptr;
}
}
@@ -981,8 +981,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
Builder.SetInsertPoint(PTI);
// Convert pointer to int before we switch.
if (CV->getType()->isPointerTy()) {
- assert(DL && "Cannot switch on pointer without DataLayout");
- CV = Builder.CreatePtrToInt(CV, DL->getIntPtrType(CV->getType()),
+ CV = Builder.CreatePtrToInt(CV, DL.getIntPtrType(CV->getType()),
"magicptr");
}
@@ -1053,7 +1052,7 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I);
/// HoistThenElseCodeToIf - Given a conditional branch that goes to BB1 and
/// BB2, hoist any common code in the two blocks up into the branch block. The
/// caller of this function guarantees that BI's block dominates BB1 and BB2.
-static bool HoistThenElseCodeToIf(BranchInst *BI, const DataLayout *DL,
+static bool HoistThenElseCodeToIf(BranchInst *BI,
const TargetTransformInfo &TTI) {
// This does very trivial matching, with limited scanning, to find identical
// instructions in the two blocks. In particular, we don't want to get into
@@ -1145,9 +1144,9 @@ HoistTerminator:
passingValueIsAlwaysUndefined(BB2V, PN))
return Changed;
- if (isa<ConstantExpr>(BB1V) && !isSafeToSpeculativelyExecute(BB1V, DL))
+ if (isa<ConstantExpr>(BB1V) && !isSafeToSpeculativelyExecute(BB1V))
return Changed;
- if (isa<ConstantExpr>(BB2V) && !isSafeToSpeculativelyExecute(BB2V, DL))
+ if (isa<ConstantExpr>(BB2V) && !isSafeToSpeculativelyExecute(BB2V))
return Changed;
}
}
@@ -1467,7 +1466,6 @@ static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB,
///
/// \returns true if the conditional block is removed.
static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
- const DataLayout *DL,
const TargetTransformInfo &TTI) {
// Be conservative for now. FP select instruction can often be expensive.
Value *BrCond = BI->getCondition();
@@ -1511,14 +1509,13 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
return false;
// Don't hoist the instruction if it's unsafe or expensive.
- if (!isSafeToSpeculativelyExecute(I, DL) &&
- !(HoistCondStores &&
- (SpeculatedStoreValue = isSafeToSpeculateStore(I, BB, ThenBB,
- EndBB))))
+ if (!isSafeToSpeculativelyExecute(I) &&
+ !(HoistCondStores && (SpeculatedStoreValue = isSafeToSpeculateStore(
+ I, BB, ThenBB, EndBB))))
return false;
if (!SpeculatedStoreValue &&
- ComputeSpeculationCost(I, DL, TTI) > PHINodeFoldingThreshold *
- TargetTransformInfo::TCC_Basic)
+ ComputeSpeculationCost(I, TTI) >
+ PHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic)
return false;
// Store the store speculation candidate.
@@ -1574,11 +1571,11 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
if (!OrigCE && !ThenCE)
continue; // Known safe and cheap.
- if ((ThenCE && !isSafeToSpeculativelyExecute(ThenCE, DL)) ||
- (OrigCE && !isSafeToSpeculativelyExecute(OrigCE, DL)))
+ if ((ThenCE && !isSafeToSpeculativelyExecute(ThenCE)) ||
+ (OrigCE && !isSafeToSpeculativelyExecute(OrigCE)))
return false;
- unsigned OrigCost = OrigCE ? ComputeSpeculationCost(OrigCE, DL, TTI) : 0;
- unsigned ThenCost = ThenCE ? ComputeSpeculationCost(ThenCE, DL, TTI) : 0;
+ unsigned OrigCost = OrigCE ? ComputeSpeculationCost(OrigCE, TTI) : 0;
+ unsigned ThenCost = ThenCE ? ComputeSpeculationCost(ThenCE, TTI) : 0;
unsigned MaxCost = 2 * PHINodeFoldingThreshold *
TargetTransformInfo::TCC_Basic;
if (OrigCost + ThenCost > MaxCost)
@@ -1688,7 +1685,7 @@ static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
/// that is defined in the same block as the branch and if any PHI entries are
/// constants, thread edges corresponding to that entry to be branches to their
/// ultimate destination.
-static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout *DL) {
+static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL) {
BasicBlock *BB = BI->getParent();
PHINode *PN = dyn_cast<PHINode>(BI->getCondition());
// NOTE: we currently cannot transform this case if the PHI node is used
@@ -1786,8 +1783,8 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout *DL) {
/// FoldTwoEntryPHINode - Given a BB that starts with the specified two-entry
/// PHI node, see if we can eliminate it.
-static bool FoldTwoEntryPHINode(PHINode *PN, const DataLayout *DL,
- const TargetTransformInfo &TTI) {
+static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
+ const DataLayout &DL) {
// Ok, this is a two entry PHI node. Check to see if this is a simple "if
// statement", which has a very simple dominance structure. Basically, we
// are trying to find the condition that is being branched on, which
@@ -1830,9 +1827,9 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const DataLayout *DL,
}
if (!DominatesMergePoint(PN->getIncomingValue(0), BB, &AggressiveInsts,
- MaxCostVal0, DL, TTI) ||
+ MaxCostVal0, TTI) ||
!DominatesMergePoint(PN->getIncomingValue(1), BB, &AggressiveInsts,
- MaxCostVal1, DL, TTI))
+ MaxCostVal1, TTI))
return false;
}
@@ -2052,8 +2049,7 @@ static bool checkCSEInPredecessor(Instruction *Inst, BasicBlock *PB) {
/// FoldBranchToCommonDest - If this basic block is simple enough, and if a
/// predecessor branches to us and one of our successors, fold the block into
/// the predecessor and use logical operations to pick the right destination.
-bool llvm::FoldBranchToCommonDest(BranchInst *BI, const DataLayout *DL,
- unsigned BonusInstThreshold) {
+bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) {
BasicBlock *BB = BI->getParent();
Instruction *Cond = nullptr;
@@ -2109,7 +2105,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, const DataLayout *DL,
// Ignore dbg intrinsics.
if (isa<DbgInfoIntrinsic>(I))
continue;
- if (!I->hasOneUse() || !isSafeToSpeculativelyExecute(I, DL))
+ if (!I->hasOneUse() || !isSafeToSpeculativelyExecute(I))
return false;
// I has only one use and can be executed unconditionally.
Instruction *User = dyn_cast<Instruction>(I->user_back());
@@ -2702,8 +2698,9 @@ static bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI) {
/// We prefer to split the edge to 'end' so that there is a true/false entry to
/// the PHI, merging the third icmp into the switch.
static bool TryToSimplifyUncondBranchWithICmpInIt(
- ICmpInst *ICI, IRBuilder<> &Builder, const TargetTransformInfo &TTI,
- unsigned BonusInstThreshold, const DataLayout *DL, AssumptionCache *AC) {
+ ICmpInst *ICI, IRBuilder<> &Builder, const DataLayout &DL,
+ const TargetTransformInfo &TTI, unsigned BonusInstThreshold,
+ AssumptionCache *AC) {
BasicBlock *BB = ICI->getParent();
// If the block has any PHIs in it or the icmp has multiple uses, it is too
@@ -2736,7 +2733,7 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(
ICI->eraseFromParent();
}
// BB is now empty, so it is likely to simplify away.
- return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
}
// Ok, the block is reachable from the default dest. If the constant we're
@@ -2752,7 +2749,7 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(
ICI->replaceAllUsesWith(V);
ICI->eraseFromParent();
// BB is now empty, so it is likely to simplify away.
- return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
}
// The use of the icmp has to be in the 'end' block, by the only PHI node in
@@ -2808,8 +2805,8 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(
/// SimplifyBranchOnICmpChain - The specified branch is a conditional branch.
/// Check to see if it is branching on an or/and chain of icmp instructions, and
/// fold it into a switch instruction if so.
-static bool SimplifyBranchOnICmpChain(BranchInst *BI, const DataLayout *DL,
- IRBuilder<> &Builder) {
+static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder,
+ const DataLayout &DL) {
Instruction *Cond = dyn_cast<Instruction>(BI->getCondition());
if (!Cond) return false;
@@ -2884,10 +2881,8 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const DataLayout *DL,
Builder.SetInsertPoint(BI);
// Convert pointer to int before we switch.
if (CompVal->getType()->isPointerTy()) {
- assert(DL && "Cannot switch on pointer without DataLayout");
- CompVal = Builder.CreatePtrToInt(CompVal,
- DL->getIntPtrType(CompVal->getType()),
- "magicptr");
+ CompVal = Builder.CreatePtrToInt(
+ CompVal, DL.getIntPtrType(CompVal->getType()), "magicptr");
}
// Create the new switch instruction now.
@@ -3246,8 +3241,8 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) {
/// EliminateDeadSwitchCases - Compute masked bits for the condition of a switch
/// and use it to remove dead cases.
-static bool EliminateDeadSwitchCases(SwitchInst *SI, const DataLayout *DL,
- AssumptionCache *AC) {
+static bool EliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC,
+ const DataLayout &DL) {
Value *Cond = SI->getCondition();
unsigned Bits = Cond->getType()->getIntegerBitWidth();
APInt KnownZero(Bits, 0), KnownOne(Bits, 0);
@@ -3398,9 +3393,8 @@ static Constant *LookupConstant(Value *V,
/// constant or can be replaced by constants from the ConstantPool. Returns the
/// resulting constant on success, 0 otherwise.
static Constant *
-ConstantFold(Instruction *I,
- const SmallDenseMap<Value *, Constant *> &ConstantPool,
- const DataLayout *DL) {
+ConstantFold(Instruction *I, const DataLayout &DL,
+ const SmallDenseMap<Value *, Constant *> &ConstantPool) {
if (SelectInst *Select = dyn_cast<SelectInst>(I)) {
Constant *A = LookupConstant(Select->getCondition(), ConstantPool);
if (!A)
@@ -3420,9 +3414,10 @@ ConstantFold(Instruction *I,
return nullptr;
}
- if (CmpInst *Cmp = dyn_cast<CmpInst>(I))
+ if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
return ConstantFoldCompareInstOperands(Cmp->getPredicate(), COps[0],
COps[1], DL);
+ }
return ConstantFoldInstOperands(I->getOpcode(), I->getType(), COps, DL);
}
@@ -3432,12 +3427,10 @@ ConstantFold(Instruction *I,
/// destinations CaseDest corresponding to value CaseVal (0 for the default
/// case), of a switch instruction SI.
static bool
-GetCaseResults(SwitchInst *SI,
- ConstantInt *CaseVal,
- BasicBlock *CaseDest,
+GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest,
BasicBlock **CommonDest,
- SmallVectorImpl<std::pair<PHINode *, Constant *> > &Res,
- const DataLayout *DL) {
+ SmallVectorImpl<std::pair<PHINode *, Constant *>> &Res,
+ const DataLayout &DL) {
// The block from which we enter the common destination.
BasicBlock *Pred = SI->getParent();
@@ -3456,7 +3449,7 @@ GetCaseResults(SwitchInst *SI,
} else if (isa<DbgInfoIntrinsic>(I)) {
// Skip debug intrinsic.
continue;
- } else if (Constant *C = ConstantFold(I, ConstantPool, DL)) {
+ } else if (Constant *C = ConstantFold(I, DL, ConstantPool)) {
// Instruction is side-effect free and constant.
// If the instruction has uses outside this block or a phi node slot for
@@ -3527,11 +3520,11 @@ static void MapCaseToResult(ConstantInt *CaseVal,
// results for the PHI node of the common destination block for a switch
// instruction. Returns false if multiple PHI nodes have been found or if
// there is not a common destination block for the switch.
-static bool InitializeUniqueCases(
- SwitchInst *SI, const DataLayout *DL, PHINode *&PHI,
- BasicBlock *&CommonDest,
- SwitchCaseResultVectorTy &UniqueResults,
- Constant *&DefaultResult) {
+static bool InitializeUniqueCases(SwitchInst *SI, PHINode *&PHI,
+ BasicBlock *&CommonDest,
+ SwitchCaseResultVectorTy &UniqueResults,
+ Constant *&DefaultResult,
+ const DataLayout &DL) {
for (auto &I : SI->cases()) {
ConstantInt *CaseVal = I.getCaseValue();
@@ -3638,15 +3631,15 @@ static void RemoveSwitchAfterSelectConversion(SwitchInst *SI, PHINode *PHI,
/// phi nodes in a common successor block with only two different
/// constant values, replace the switch with select.
static bool SwitchToSelect(SwitchInst *SI, IRBuilder<> &Builder,
- const DataLayout *DL, AssumptionCache *AC) {
+ AssumptionCache *AC, const DataLayout &DL) {
Value *const Cond = SI->getCondition();
PHINode *PHI = nullptr;
BasicBlock *CommonDest = nullptr;
Constant *DefaultResult;
SwitchCaseResultVectorTy UniqueResults;
// Collect all the cases that will deliver the same value from the switch.
- if (!InitializeUniqueCases(SI, DL, PHI, CommonDest, UniqueResults,
- DefaultResult))
+ if (!InitializeUniqueCases(SI, PHI, CommonDest, UniqueResults, DefaultResult,
+ DL))
return false;
// Selects choose between maximum two values.
if (UniqueResults.size() != 2)
@@ -3673,12 +3666,10 @@ namespace {
/// SwitchLookupTable - Create a lookup table to use as a switch replacement
/// with the contents of Values, using DefaultValue to fill any holes in the
/// table.
- SwitchLookupTable(Module &M,
- uint64_t TableSize,
- ConstantInt *Offset,
- const SmallVectorImpl<std::pair<ConstantInt*, Constant*> >& Values,
- Constant *DefaultValue,
- const DataLayout *DL);
+ SwitchLookupTable(
+ Module &M, uint64_t TableSize, ConstantInt *Offset,
+ const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values,
+ Constant *DefaultValue, const DataLayout &DL);
/// BuildLookup - Build instructions with Builder to retrieve the value at
/// the position given by Index in the lookup table.
@@ -3686,8 +3677,7 @@ namespace {
/// WouldFitInRegister - Return true if a table with TableSize elements of
/// type ElementType would fit in a target-legal register.
- static bool WouldFitInRegister(const DataLayout *DL,
- uint64_t TableSize,
+ static bool WouldFitInRegister(const DataLayout &DL, uint64_t TableSize,
const Type *ElementType);
private:
@@ -3729,12 +3719,10 @@ namespace {
};
}
-SwitchLookupTable::SwitchLookupTable(Module &M,
- uint64_t TableSize,
- ConstantInt *Offset,
- const SmallVectorImpl<std::pair<ConstantInt*, Constant*> >& Values,
- Constant *DefaultValue,
- const DataLayout *DL)
+SwitchLookupTable::SwitchLookupTable(
+ Module &M, uint64_t TableSize, ConstantInt *Offset,
+ const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values,
+ Constant *DefaultValue, const DataLayout &DL)
: SingleValue(nullptr), BitMap(nullptr), BitMapElementTy(nullptr),
LinearOffset(nullptr), LinearMultiplier(nullptr), Array(nullptr) {
assert(Values.size() && "Can't build lookup table without values!");
@@ -3904,11 +3892,9 @@ Value *SwitchLookupTable::BuildLookup(Value *Index, IRBuilder<> &Builder) {
llvm_unreachable("Unknown lookup table kind!");
}
-bool SwitchLookupTable::WouldFitInRegister(const DataLayout *DL,
+bool SwitchLookupTable::WouldFitInRegister(const DataLayout &DL,
uint64_t TableSize,
const Type *ElementType) {
- if (!DL)
- return false;
const IntegerType *IT = dyn_cast<IntegerType>(ElementType);
if (!IT)
return false;
@@ -3918,17 +3904,16 @@ bool SwitchLookupTable::WouldFitInRegister(const DataLayout *DL,
// Avoid overflow, fitsInLegalInteger uses unsigned int for the width.
if (TableSize >= UINT_MAX/IT->getBitWidth())
return false;
- return DL->fitsInLegalInteger(TableSize * IT->getBitWidth());
+ return DL.fitsInLegalInteger(TableSize * IT->getBitWidth());
}
/// ShouldBuildLookupTable - Determine whether a lookup table should be built
/// for this switch, based on the number of cases, size of the table and the
/// types of the results.
-static bool ShouldBuildLookupTable(SwitchInst *SI,
- uint64_t TableSize,
- const TargetTransformInfo &TTI,
- const DataLayout *DL,
- const SmallDenseMap<PHINode*, Type*>& ResultTypes) {
+static bool
+ShouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize,
+ const TargetTransformInfo &TTI, const DataLayout &DL,
+ const SmallDenseMap<PHINode *, Type *> &ResultTypes) {
if (SI->getNumCases() > TableSize || TableSize >= UINT64_MAX / 10)
return false; // TableSize overflowed, or mul below might overflow.
@@ -4051,10 +4036,9 @@ static void reuseTableCompare(User *PhiUser, BasicBlock *PhiBlock,
/// SwitchToLookupTable - If the switch is only used to initialize one or more
/// phi nodes in a common successor block with different constant values,
/// replace the switch with lookup tables.
-static bool SwitchToLookupTable(SwitchInst *SI,
- IRBuilder<> &Builder,
- const TargetTransformInfo &TTI,
- const DataLayout* DL) {
+static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
+ const DataLayout &DL,
+ const TargetTransformInfo &TTI) {
assert(SI->getNumCases() > 1 && "Degenerate switch?");
// Only build lookup table when we have a target that supports it.
@@ -4125,14 +4109,14 @@ static bool SwitchToLookupTable(SwitchInst *SI,
// or a bitmask that fits in a register.
SmallVector<std::pair<PHINode*, Constant*>, 4> DefaultResultsList;
bool HasDefaultResults = GetCaseResults(SI, nullptr, SI->getDefaultDest(),
- &CommonDest, DefaultResultsList, DL);
+ &CommonDest, DefaultResultsList, DL);
bool NeedMask = (TableHasHoles && !HasDefaultResults);
if (NeedMask) {
// As an extra penalty for the validity test we require more cases.
if (SI->getNumCases() < 4) // FIXME: Find best threshold value (benchmark).
return false;
- if (!(DL && DL->fitsInLegalInteger(TableSize)))
+ if (!DL.fitsInLegalInteger(TableSize))
return false;
}
@@ -4290,12 +4274,12 @@ bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
// see if that predecessor totally determines the outcome of this switch.
if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
if (SimplifyEqualityComparisonWithOnlyPredecessor(SI, OnlyPred, Builder))
- return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
Value *Cond = SI->getCondition();
if (SelectInst *Select = dyn_cast<SelectInst>(Cond))
if (SimplifySwitchOnSelect(SI, Select))
- return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
// If the block only contains the switch, see if we can fold the block
// away into any preds.
@@ -4305,25 +4289,25 @@ bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
++BBI;
if (SI == &*BBI)
if (FoldValueComparisonIntoPredecessors(SI, Builder))
- return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
}
// Try to transform the switch into an icmp and a branch.
if (TurnSwitchRangeIntoICmp(SI, Builder))
- return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
// Remove unreachable cases.
- if (EliminateDeadSwitchCases(SI, DL, AC))
- return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+ if (EliminateDeadSwitchCases(SI, AC, DL))
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
- if (SwitchToSelect(SI, Builder, DL, AC))
- return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+ if (SwitchToSelect(SI, Builder, AC, DL))
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
if (ForwardSwitchConditionToPHI(SI))
- return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
- if (SwitchToLookupTable(SI, Builder, TTI, DL))
- return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+ if (SwitchToLookupTable(SI, Builder, DL, TTI))
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
return false;
}
@@ -4360,11 +4344,87 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) {
if (SelectInst *SI = dyn_cast<SelectInst>(IBI->getAddress())) {
if (SimplifyIndirectBrOnSelect(IBI, SI))
- return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
}
return Changed;
}
+/// Given a block with only a single landing pad and an unconditional branch,
+/// try to find another basic block with which this one can be merged. This
+/// handles cases where we have multiple invokes with unique landing pads, but
+/// a shared handler.
+///
+/// We specifically choose to not worry about merging non-empty blocks
+/// here. That is a PRE/scheduling problem and is best solved elsewhere. In
+/// practice, the optimizer produces empty landing pad blocks quite frequently
+/// when dealing with exception dense code. (see: instcombine, gvn, if-else
+/// sinking in this file)
+///
+/// This is primarily a code size optimization. We need to avoid performing
+/// any transform which might inhibit optimization (such as our ability to
+/// specialize a particular handler via tail commoning). We do this by not
+/// merging any blocks which require us to introduce a phi. Since the same
+/// values are flowing through both blocks, we don't loose any ability to
+/// specialize. If anything, we make such specialization more likely.
+///
+/// TODO - This transformation could remove entries from a phi in the target
+/// block when the inputs in the phi are the same for the two blocks being
+/// merged. In some cases, this could result in removal of the PHI entirely.
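+///
+/// Illustrative sketch (hypothetical IR, not from the original patch):
+///
+///   lpad1:                                  lpad2:
+///     %l1 = landingpad ... cleanup            %l2 = landingpad ... cleanup
+///     br label %handler                       br label %handler
+///
+/// With %l1 and %l2 identical, every invoke unwinding to %lpad1 is
+/// redirected to %lpad2, and %lpad1 is terminated with unreachable,
+/// leaving it trivially dead.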
+static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI,
+ BasicBlock *BB) {
+ auto Succ = BB->getUniqueSuccessor();
+ assert(Succ);
+ // If there's a phi in the successor block, we'd likely have to introduce
+ // a phi into the merged landing pad block.
+ if (isa<PHINode>(*Succ->begin()))
+ return false;
+
+ for (BasicBlock *OtherPred : predecessors(Succ)) {
+ if (BB == OtherPred)
+ continue;
+ BasicBlock::iterator I = OtherPred->begin();
+ LandingPadInst *LPad2 = dyn_cast<LandingPadInst>(I);
+ if (!LPad2 || !LPad2->isIdenticalTo(LPad))
+ continue;
+ for (++I; isa<DbgInfoIntrinsic>(I); ++I) {}
+ BranchInst *BI2 = dyn_cast<BranchInst>(I);
+ if (!BI2 || !BI2->isIdenticalTo(BI))
+ continue;
+
+ // We've found an identical block. Update our predecessors to take that
+ // path instead and make ourselves dead.
+ SmallSet<BasicBlock *, 16> Preds;
+ Preds.insert(pred_begin(BB), pred_end(BB));
+ for (BasicBlock *Pred : Preds) {
+ InvokeInst *II = cast<InvokeInst>(Pred->getTerminator());
+ assert(II->getNormalDest() != BB &&
+ II->getUnwindDest() == BB && "unexpected successor");
+ II->setUnwindDest(OtherPred);
+ }
+
+ // The debug info in OtherPred doesn't cover the merged control flow that
+ // used to go through BB. We need to delete it or update it.
+ for (auto I = OtherPred->begin(), E = OtherPred->end();
+ I != E;) {
+ Instruction &Inst = *I; I++;
+ if (isa<DbgInfoIntrinsic>(Inst))
+ Inst.eraseFromParent();
+ }
+
+ SmallSet<BasicBlock *, 16> Succs;
+ Succs.insert(succ_begin(BB), succ_end(BB));
+ for (BasicBlock *Succ : Succs) {
+ Succ->removePredecessor(BB);
+ }
+
+ IRBuilder<> Builder(BI);
+ Builder.CreateUnreachable();
+ BI->eraseFromParent();
+ return true;
+ }
+ return false;
+}
+
bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){
BasicBlock *BB = BI->getParent();
@@ -4384,17 +4444,26 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){
for (++I; isa<DbgInfoIntrinsic>(I); ++I)
;
if (I->isTerminator() &&
- TryToSimplifyUncondBranchWithICmpInIt(ICI, Builder, TTI,
- BonusInstThreshold, DL, AC))
+ TryToSimplifyUncondBranchWithICmpInIt(ICI, Builder, DL, TTI,
+ BonusInstThreshold, AC))
return true;
}
+ // See if we can merge an empty landing pad block with another which is
+ // equivalent.
+ if (LandingPadInst *LPad = dyn_cast<LandingPadInst>(I)) {
+ for (++I; isa<DbgInfoIntrinsic>(I); ++I) {}
+ if (I->isTerminator() &&
+ TryToMergeLandingPad(LPad, BI, BB))
+ return true;
+ }
+
// If this basic block is ONLY a compare and a branch, and if a predecessor
// branches to us and our successor, fold the comparison into the
// predecessor and use logical operations to update the incoming value
// for PHI nodes in common successor.
- if (FoldBranchToCommonDest(BI, DL, BonusInstThreshold))
- return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+ if (FoldBranchToCommonDest(BI, BonusInstThreshold))
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
return false;
}
@@ -4409,7 +4478,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
// switch.
if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred, Builder))
- return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
// This block must be empty, except for the setcond inst, if it exists.
// Ignore dbg intrinsics.
@@ -4419,26 +4488,26 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
++I;
if (&*I == BI) {
if (FoldValueComparisonIntoPredecessors(BI, Builder))
- return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
} else if (&*I == cast<Instruction>(BI->getCondition())){
++I;
// Ignore dbg intrinsics.
while (isa<DbgInfoIntrinsic>(I))
++I;
if (&*I == BI && FoldValueComparisonIntoPredecessors(BI, Builder))
- return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
}
}
// Try to turn "br (X == 0 | X == 1), T, F" into a switch instruction.
- if (SimplifyBranchOnICmpChain(BI, DL, Builder))
+ if (SimplifyBranchOnICmpChain(BI, Builder, DL))
return true;
// If this basic block is ONLY a compare and a branch, and if a predecessor
// branches to us and one of our successors, fold the comparison into the
// predecessor and use logical operations to pick the right destination.
- if (FoldBranchToCommonDest(BI, DL, BonusInstThreshold))
- return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+ if (FoldBranchToCommonDest(BI, BonusInstThreshold))
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
// We have a conditional branch to two blocks that are only reachable
// from BI. We know that the condbr dominates the two blocks, so see if
@@ -4446,16 +4515,16 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
// can hoist it up to the branching block.
if (BI->getSuccessor(0)->getSinglePredecessor()) {
if (BI->getSuccessor(1)->getSinglePredecessor()) {
- if (HoistThenElseCodeToIf(BI, DL, TTI))
- return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+ if (HoistThenElseCodeToIf(BI, TTI))
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
} else {
// If Successor #1 has multiple preds, we may be able to conditionally
// execute Successor #0 if it branches to Successor #1.
TerminatorInst *Succ0TI = BI->getSuccessor(0)->getTerminator();
if (Succ0TI->getNumSuccessors() == 1 &&
Succ0TI->getSuccessor(0) == BI->getSuccessor(1))
- if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0), DL, TTI))
- return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+ if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0), TTI))
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
}
} else if (BI->getSuccessor(1)->getSinglePredecessor()) {
// If Successor #0 has multiple preds, we may be able to conditionally
@@ -4463,8 +4532,8 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
TerminatorInst *Succ1TI = BI->getSuccessor(1)->getTerminator();
if (Succ1TI->getNumSuccessors() == 1 &&
Succ1TI->getSuccessor(0) == BI->getSuccessor(0))
- if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1), DL, TTI))
- return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+ if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1), TTI))
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
}
// If this is a branch on a phi node in the current block, thread control
@@ -4472,14 +4541,14 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
if (PHINode *PN = dyn_cast<PHINode>(BI->getCondition()))
if (PN->getParent() == BI->getParent())
if (FoldCondBranchOnPHI(BI, DL))
- return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
// Scan predecessor blocks for conditional branches.
for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator()))
if (PBI != BI && PBI->isConditional())
if (SimplifyCondBranchToCondBranch(PBI, BI))
- return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AC) | true;
+ return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
return false;
}
@@ -4591,7 +4660,7 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {
// eliminate it, do so now.
if (PHINode *PN = dyn_cast<PHINode>(BB->begin()))
if (PN->getNumIncomingValues() == 2)
- Changed |= FoldTwoEntryPHINode(PN, DL, TTI);
+ Changed |= FoldTwoEntryPHINode(PN, TTI, DL);
Builder.SetInsertPoint(BB->getTerminator());
if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
@@ -4623,7 +4692,7 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {
/// of the CFG. It returns true if a modification was made.
///
bool llvm::SimplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI,
- unsigned BonusInstThreshold, const DataLayout *DL,
- AssumptionCache *AC) {
- return SimplifyCFGOpt(TTI, BonusInstThreshold, DL, AC).run(BB);
+ unsigned BonusInstThreshold, AssumptionCache *AC) {
+ return SimplifyCFGOpt(TTI, BB->getModule()->getDataLayout(),
+ BonusInstThreshold, AC).run(BB);
}
diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp
index 6a5d885..8bfc5fb 100644
--- a/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -270,95 +270,57 @@ bool SimplifyIndvar::eliminateIVUser(Instruction *UseInst,
bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO,
Value *IVOperand) {
- // Currently we only handle instructions of the form "add <indvar> <value>"
- unsigned Op = BO->getOpcode();
- if (Op != Instruction::Add)
+ // Fastpath: we don't have any work to do if `BO` is `nuw` and `nsw`.
+ if (BO->hasNoUnsignedWrap() && BO->hasNoSignedWrap())
return false;
- // If BO is already both nuw and nsw then there is nothing left to do
- if (BO->hasNoUnsignedWrap() && BO->hasNoSignedWrap())
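+ // Pointer to the ScalarEvolution builder that mirrors BO's opcode; it is
+ // selected in the switch below and applied to widened operands to test
+ // whether the narrow operation can wrap.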
+ const SCEV *(ScalarEvolution::*GetExprForBO)(const SCEV *, const SCEV *,
+ SCEV::NoWrapFlags);
+
+ switch (BO->getOpcode()) {
+ default:
return false;
- IntegerType *IT = cast<IntegerType>(IVOperand->getType());
- Value *OtherOperand = nullptr;
- if (BO->getOperand(0) == IVOperand) {
- OtherOperand = BO->getOperand(1);
- } else {
- assert(BO->getOperand(1) == IVOperand && "only other use!");
- OtherOperand = BO->getOperand(0);
+ case Instruction::Add:
+ GetExprForBO = &ScalarEvolution::getAddExpr;
+ break;
+
+ case Instruction::Sub:
+ GetExprForBO = &ScalarEvolution::getMinusSCEV;
+ break;
+
+ case Instruction::Mul:
+ GetExprForBO = &ScalarEvolution::getMulExpr;
+ break;
}
- bool Changed = false;
- const SCEV *OtherOpSCEV = SE->getSCEV(OtherOperand);
- if (OtherOpSCEV == SE->getCouldNotCompute())
- return false;
+ unsigned BitWidth = cast<IntegerType>(BO->getType())->getBitWidth();
+ Type *WideTy = IntegerType::get(BO->getContext(), BitWidth * 2);
+ const SCEV *LHS = SE->getSCEV(BO->getOperand(0));
+ const SCEV *RHS = SE->getSCEV(BO->getOperand(1));
- const SCEV *IVOpSCEV = SE->getSCEV(IVOperand);
- const SCEV *ZeroSCEV = SE->getConstant(IVOpSCEV->getType(), 0);
+ bool Changed = false;
- if (!BO->hasNoSignedWrap()) {
- // Upgrade the add to an "add nsw" if we can prove that it will never
- // sign-overflow or sign-underflow.
-
- const SCEV *SignedMax =
- SE->getConstant(APInt::getSignedMaxValue(IT->getBitWidth()));
- const SCEV *SignedMin =
- SE->getConstant(APInt::getSignedMinValue(IT->getBitWidth()));
-
- // The addition "IVOperand + OtherOp" does not sign-overflow if the result
- // is sign-representable in 2's complement in the given bit-width.
- //
- // If OtherOp is SLT 0, then for an IVOperand in [SignedMin - OtherOp,
- // SignedMax], "IVOperand + OtherOp" is in [SignedMin, SignedMax + OtherOp].
- // Everything in [SignedMin, SignedMax + OtherOp] is representable since
- // SignedMax + OtherOp is at least -1.
- //
- // If OtherOp is SGE 0, then for an IVOperand in [SignedMin, SignedMax -
- // OtherOp], "IVOperand + OtherOp" is in [SignedMin + OtherOp, SignedMax].
- // Everything in [SignedMin + OtherOp, SignedMax] is representable since
- // SignedMin + OtherOp is at most -1.
- //
- // It follows that for all values of IVOperand in [SignedMin - smin(0,
- // OtherOp), SignedMax - smax(0, OtherOp)] the result of the add is
- // representable (i.e. there is no sign-overflow).
-
- const SCEV *UpperDelta = SE->getSMaxExpr(ZeroSCEV, OtherOpSCEV);
- const SCEV *UpperLimit = SE->getMinusSCEV(SignedMax, UpperDelta);
-
- bool NeverSignedOverflows =
- SE->isKnownPredicate(ICmpInst::ICMP_SLE, IVOpSCEV, UpperLimit);
-
- if (NeverSignedOverflows) {
- const SCEV *LowerDelta = SE->getSMinExpr(ZeroSCEV, OtherOpSCEV);
- const SCEV *LowerLimit = SE->getMinusSCEV(SignedMin, LowerDelta);
-
- bool NeverSignedUnderflows =
- SE->isKnownPredicate(ICmpInst::ICMP_SGE, IVOpSCEV, LowerLimit);
- if (NeverSignedUnderflows) {
- BO->setHasNoSignedWrap(true);
- Changed = true;
- }
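+ // Why the comparisons below suffice (annotation, not from the original
+ // patch): an add/sub/mul cannot wrap unsigned (resp. signed) exactly when
+ // zero- (resp. sign-)extending its narrow result matches performing the
+ // operation on the extended operands in a type of twice the width, which
+ // is what the SCEV equality checks establish.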
+ if (!BO->hasNoUnsignedWrap()) {
+ const SCEV *ExtendAfterOp = SE->getZeroExtendExpr(SE->getSCEV(BO), WideTy);
+ const SCEV *OpAfterExtend = (SE->*GetExprForBO)(
+ SE->getZeroExtendExpr(LHS, WideTy), SE->getZeroExtendExpr(RHS, WideTy),
+ SCEV::FlagAnyWrap);
+ if (ExtendAfterOp == OpAfterExtend) {
+ BO->setHasNoUnsignedWrap();
+ SE->forgetValue(BO);
+ Changed = true;
}
}
- if (!BO->hasNoUnsignedWrap()) {
- // Upgrade the add computing "IVOperand + OtherOp" to an "add nuw" if we can
- // prove that it will never unsigned-overflow (i.e. the result will always
- // be representable in the given bit-width).
- //
- // "IVOperand + OtherOp" is unsigned-representable in 2's complement iff it
- // does not produce a carry. "IVOperand + OtherOp" produces no carry iff
- // IVOperand ULE (UnsignedMax - OtherOp).
-
- const SCEV *UnsignedMax =
- SE->getConstant(APInt::getMaxValue(IT->getBitWidth()));
- const SCEV *UpperLimit = SE->getMinusSCEV(UnsignedMax, OtherOpSCEV);
-
- bool NeverUnsignedOverflows =
- SE->isKnownPredicate(ICmpInst::ICMP_ULE, IVOpSCEV, UpperLimit);
-
- if (NeverUnsignedOverflows) {
- BO->setHasNoUnsignedWrap(true);
+ if (!BO->hasNoSignedWrap()) {
+ const SCEV *ExtendAfterOp = SE->getSignExtendExpr(SE->getSCEV(BO), WideTy);
+ const SCEV *OpAfterExtend = (SE->*GetExprForBO)(
+ SE->getSignExtendExpr(LHS, WideTy), SE->getSignExtendExpr(RHS, WideTy),
+ SCEV::FlagAnyWrap);
+ if (ExtendAfterOp == OpAfterExtend) {
+ BO->setHasNoSignedWrap();
+ SE->forgetValue(BO);
Changed = true;
}
}
diff --git a/lib/Transforms/Utils/SimplifyInstructions.cpp b/lib/Transforms/Utils/SimplifyInstructions.cpp
index 55a4455..c499c87 100644
--- a/lib/Transforms/Utils/SimplifyInstructions.cpp
+++ b/lib/Transforms/Utils/SimplifyInstructions.cpp
@@ -51,8 +51,7 @@ namespace {
const DominatorTreeWrapperPass *DTWP =
getAnalysisIfAvailable<DominatorTreeWrapperPass>();
const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
+ const DataLayout &DL = F.getParent()->getDataLayout();
const TargetLibraryInfo *TLI =
&getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
AssumptionCache *AC =
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index fb1d83f..5867d65 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -120,12 +120,12 @@ static bool hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty,
/// string/memory copying library function \p Func.
/// Acceptable functions are st[rp][n]?cpy, memmove, memcpy, and memset.
/// Their fortified (_chk) counterparts are also accepted.
-static bool checkStringCopyLibFuncSignature(Function *F, LibFunc::Func Func,
- const DataLayout *DL) {
+static bool checkStringCopyLibFuncSignature(Function *F, LibFunc::Func Func) {
+ const DataLayout &DL = F->getParent()->getDataLayout();
FunctionType *FT = F->getFunctionType();
LLVMContext &Context = F->getContext();
Type *PCharTy = Type::getInt8PtrTy(Context);
- Type *SizeTTy = DL ? DL->getIntPtrType(Context) : nullptr;
+ Type *SizeTTy = DL.getIntPtrType(Context);
unsigned NumParams = FT->getNumParams();
// All string libfuncs return the same type as the first parameter.
@@ -208,10 +208,6 @@ Value *LibCallSimplifier::optimizeStrCat(CallInst *CI, IRBuilder<> &B) {
if (Len == 0)
return Dst;
- // These optimizations require DataLayout.
- if (!DL)
- return nullptr;
-
return emitStrLenMemCpy(Src, Dst, Len, B);
}
@@ -230,9 +226,9 @@ Value *LibCallSimplifier::emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len,
// We have enough information to now generate the memcpy call to do the
// concatenation for us. Make a memcpy to copy the nul byte with align = 1.
- B.CreateMemCpy(
- CpyDst, Src,
- ConstantInt::get(DL->getIntPtrType(Src->getContext()), Len + 1), 1);
+ B.CreateMemCpy(CpyDst, Src,
+ ConstantInt::get(DL.getIntPtrType(Src->getContext()), Len + 1),
+ 1);
return Dst;
}
@@ -269,10 +265,6 @@ Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilder<> &B) {
if (SrcLen == 0 || Len == 0)
return Dst;
- // These optimizations require DataLayout.
- if (!DL)
- return nullptr;
-
// We don't optimize this case
if (Len < SrcLen)
return nullptr;
@@ -297,24 +289,20 @@ Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilder<> &B) {
// of the input string and turn this into memchr.
ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
if (!CharC) {
- // These optimizations require DataLayout.
- if (!DL)
- return nullptr;
-
uint64_t Len = GetStringLength(SrcStr);
if (Len == 0 || !FT->getParamType(1)->isIntegerTy(32)) // memchr needs i32.
return nullptr;
- return EmitMemChr(
- SrcStr, CI->getArgOperand(1), // include nul.
- ConstantInt::get(DL->getIntPtrType(CI->getContext()), Len), B, DL, TLI);
+ return EmitMemChr(SrcStr, CI->getArgOperand(1), // include nul.
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len),
+ B, DL, TLI);
}
// Otherwise, the character is a constant; see if the first argument is
// a string literal. If so, we can constant fold.
StringRef Str;
if (!getConstantStringInfo(SrcStr, Str)) {
- if (DL && CharC->isZero()) // strchr(p, 0) -> p + strlen(p)
+ if (CharC->isZero()) // strchr(p, 0) -> p + strlen(p)
return B.CreateGEP(SrcStr, EmitStrLen(SrcStr, B, DL, TLI), "strchr");
return nullptr;
}
@@ -350,8 +338,8 @@ Value *LibCallSimplifier::optimizeStrRChr(CallInst *CI, IRBuilder<> &B) {
StringRef Str;
if (!getConstantStringInfo(SrcStr, Str)) {
// strrchr(s, 0) -> strchr(s, 0)
- if (DL && CharC->isZero())
- return EmitStrChr(SrcStr, '\0', B, DL, TLI);
+ if (CharC->isZero())
+ return EmitStrChr(SrcStr, '\0', B, TLI);
return nullptr;
}
@@ -398,12 +386,8 @@ Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilder<> &B) {
uint64_t Len1 = GetStringLength(Str1P);
uint64_t Len2 = GetStringLength(Str2P);
if (Len1 && Len2) {
- // These optimizations require DataLayout.
- if (!DL)
- return nullptr;
-
return EmitMemCmp(Str1P, Str2P,
- ConstantInt::get(DL->getIntPtrType(CI->getContext()),
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()),
std::min(Len1, Len2)),
B, DL, TLI);
}
@@ -435,7 +419,7 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilder<> &B) {
if (Length == 0) // strncmp(x,y,0) -> 0
return ConstantInt::get(CI->getType(), 0);
- if (DL && Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1)
+ if (Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1)
return EmitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, DL, TLI);
StringRef Str1, Str2;
@@ -462,17 +446,13 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilder<> &B) {
Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
- if (!checkStringCopyLibFuncSignature(Callee, LibFunc::strcpy, DL))
+ if (!checkStringCopyLibFuncSignature(Callee, LibFunc::strcpy))
return nullptr;
Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
if (Dst == Src) // strcpy(x,x) -> x
return Src;
- // These optimizations require DataLayout.
- if (!DL)
- return nullptr;
-
// See if we can get the length of the input string.
uint64_t Len = GetStringLength(Src);
if (Len == 0)
@@ -481,7 +461,7 @@ Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilder<> &B) {
// We have enough information to now generate the memcpy call to do the
// copy for us. Make a memcpy to copy the nul byte with align = 1.
B.CreateMemCpy(Dst, Src,
- ConstantInt::get(DL->getIntPtrType(CI->getContext()), Len), 1);
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len), 1);
return Dst;
}
@@ -490,11 +470,7 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) {
// Verify the "stpcpy" function prototype.
FunctionType *FT = Callee->getFunctionType();
- if (!checkStringCopyLibFuncSignature(Callee, LibFunc::stpcpy, DL))
- return nullptr;
-
- // These optimizations require DataLayout.
- if (!DL)
+ if (!checkStringCopyLibFuncSignature(Callee, LibFunc::stpcpy))
return nullptr;
Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
@@ -509,9 +485,9 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) {
return nullptr;
Type *PT = FT->getParamType(0);
- Value *LenV = ConstantInt::get(DL->getIntPtrType(PT), Len);
+ Value *LenV = ConstantInt::get(DL.getIntPtrType(PT), Len);
Value *DstEnd =
- B.CreateGEP(Dst, ConstantInt::get(DL->getIntPtrType(PT), Len - 1));
+ B.CreateGEP(Dst, ConstantInt::get(DL.getIntPtrType(PT), Len - 1));
// We have enough information to now generate the memcpy call to do the
// copy for us. Make a memcpy to copy the nul byte with align = 1.
@@ -523,7 +499,7 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
FunctionType *FT = Callee->getFunctionType();
- if (!checkStringCopyLibFuncSignature(Callee, LibFunc::strncpy, DL))
+ if (!checkStringCopyLibFuncSignature(Callee, LibFunc::strncpy))
return nullptr;
Value *Dst = CI->getArgOperand(0);
@@ -551,17 +527,13 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) {
if (Len == 0)
return Dst; // strncpy(x, y, 0) -> x
- // These optimizations require DataLayout.
- if (!DL)
- return nullptr;
-
// Let strncpy handle the zero padding
if (Len > SrcLen + 1)
return nullptr;
Type *PT = FT->getParamType(0);
// strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant]
- B.CreateMemCpy(Dst, Src, ConstantInt::get(DL->getIntPtrType(PT), Len), 1);
+ B.CreateMemCpy(Dst, Src, ConstantInt::get(DL.getIntPtrType(PT), Len), 1);
return Dst;
}
@@ -629,8 +601,8 @@ Value *LibCallSimplifier::optimizeStrPBrk(CallInst *CI, IRBuilder<> &B) {
}
// strpbrk(s, "a") -> strchr(s, 'a')
- if (DL && HasS2 && S2.size() == 1)
- return EmitStrChr(CI->getArgOperand(0), S2[0], B, DL, TLI);
+ if (HasS2 && S2.size() == 1)
+ return EmitStrChr(CI->getArgOperand(0), S2[0], B, TLI);
return nullptr;
}
@@ -706,7 +678,7 @@ Value *LibCallSimplifier::optimizeStrCSpn(CallInst *CI, IRBuilder<> &B) {
}
// strcspn(s, "") -> strlen(s)
- if (DL && HasS2 && S2.empty())
+ if (HasS2 && S2.empty())
return EmitStrLen(CI->getArgOperand(0), B, DL, TLI);
return nullptr;
@@ -725,7 +697,7 @@ Value *LibCallSimplifier::optimizeStrStr(CallInst *CI, IRBuilder<> &B) {
return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
// fold strstr(a, b) == a -> strncmp(a, b, strlen(b)) == 0
- if (DL && isOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) {
+ if (isOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) {
Value *StrLen = EmitStrLen(CI->getArgOperand(1), B, DL, TLI);
if (!StrLen)
return nullptr;
@@ -767,12 +739,98 @@ Value *LibCallSimplifier::optimizeStrStr(CallInst *CI, IRBuilder<> &B) {
// fold strstr(x, "y") -> strchr(x, 'y').
if (HasStr2 && ToFindStr.size() == 1) {
- Value *StrChr = EmitStrChr(CI->getArgOperand(0), ToFindStr[0], B, DL, TLI);
+ Value *StrChr = EmitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TLI);
return StrChr ? B.CreateBitCast(StrChr, CI->getType()) : nullptr;
}
return nullptr;
}
+Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilder<> &B) {
+ Function *Callee = CI->getCalledFunction();
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 3 || !FT->getParamType(0)->isPointerTy() ||
+ !FT->getParamType(1)->isIntegerTy(32) ||
+ !FT->getParamType(2)->isIntegerTy() ||
+ !FT->getReturnType()->isPointerTy())
+ return nullptr;
+
+ Value *SrcStr = CI->getArgOperand(0);
+ ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+
+ // memchr(x, y, 0) -> null
+ if (LenC && LenC->isNullValue())
+ return Constant::getNullValue(CI->getType());
+
+ // From now on we need at least a constant length and string.
+ StringRef Str;
+ if (!LenC || !getConstantStringInfo(SrcStr, Str, 0, /*TrimAtNul=*/false))
+ return nullptr;
+
+ // Truncate the string to LenC. If Str is smaller than LenC, we will still
+ // only scan the string, as reading past its end is undefined and we can
+ // just return null if we don't find the char.
+ Str = Str.substr(0, LenC->getZExtValue());
+
+ // If the char is variable but the input str and length are not, we can
+ // turn this memchr call into a simple bit field test. Of course this only
+ // works when the return value is only checked against null.
+ //
+ // It would be really nice to reuse switch lowering here but we can't change
+ // the CFG at this point.
+ //
+ // memchr("\r\n", C, 2) != nullptr -> (C & ((1 << '\r') | (1 << '\n'))) != 0
+ // after bounds check.
+ if (!CharC && !Str.empty() && isOnlyUsedInZeroEqualityComparison(CI)) {
+ unsigned char Max =
+ *std::max_element(reinterpret_cast<const unsigned char *>(Str.begin()),
+ reinterpret_cast<const unsigned char *>(Str.end()));
+
+ // Make sure the bit field we're about to create fits in a register on the
+ // target.
+ // FIXME: On a 64-bit architecture this prevents us from using the
+ // interesting range of alphabetic ASCII chars. We could do better by
+ // emitting two bitfields or shifting the range by 64 if no lower chars
+ // are used.
+ if (!DL.fitsInLegalInteger(Max + 1))
+ return nullptr;
+
+ // For the bit field use a power-of-2 type with at least 8 bits to avoid
+ // creating unnecessary illegal types.
+ unsigned char Width = NextPowerOf2(std::max((unsigned char)7, Max));
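+ // Worked example (illustrative): for Str == "\r\n", Max is 13 ('\r'), so
+ // Width is NextPowerOf2(13) == 16 and the bitfield below gets bits 10 and
+ // 13 set.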
+
+ // Now build the bit field.
+ APInt Bitfield(Width, 0);
+ for (char C : Str)
+ Bitfield.setBit((unsigned char)C);
+ Value *BitfieldC = B.getInt(Bitfield);
+
+ // First check that the bit field access is within bounds.
+ Value *C = B.CreateZExtOrTrunc(CI->getArgOperand(1), BitfieldC->getType());
+ Value *Bounds = B.CreateICmp(ICmpInst::ICMP_ULT, C, B.getIntN(Width, Width),
+ "memchr.bounds");
+
+ // Create code that checks if the given bit is set in the field.
+ Value *Shl = B.CreateShl(B.getIntN(Width, 1ULL), C);
+ Value *Bits = B.CreateIsNotNull(B.CreateAnd(Shl, BitfieldC), "memchr.bits");
+
+ // Finally merge both checks and cast to pointer type. The inttoptr
+ // implicitly zexts the i1 to intptr type.
+ return B.CreateIntToPtr(B.CreateAnd(Bounds, Bits, "memchr"), CI->getType());
+ }
+
+ // Check if all arguments are constants. If so, we can constant fold.
+ if (!CharC)
+ return nullptr;
+
+ // Compute the offset.
+ size_t I = Str.find(CharC->getSExtValue() & 0xFF);
+ if (I == StringRef::npos) // Didn't find the char. memchr returns null.
+ return Constant::getNullValue(CI->getType());
+
+ // memchr(s+n,c,l) -> gep(s+n+i,c)
+ return B.CreateGEP(SrcStr, B.getInt64(I), "memchr");
+}
+
Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
FunctionType *FT = Callee->getFunctionType();
@@ -827,11 +885,8 @@ Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilder<> &B) {
Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
- // These optimizations require DataLayout.
- if (!DL)
- return nullptr;
- if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memcpy, DL))
+ if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memcpy))
return nullptr;
// memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1)
@@ -842,11 +897,8 @@ Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilder<> &B) {
Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
- // These optimizations require DataLayout.
- if (!DL)
- return nullptr;
- if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memmove, DL))
+ if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memmove))
return nullptr;
// memmove(x, y, n) -> llvm.memmove(x, y, n, 1)
@@ -857,11 +909,8 @@ Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilder<> &B) {
Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
- // These optimizations require DataLayout.
- if (!DL)
- return nullptr;
- if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memset, DL))
+ if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memset))
return nullptr;
// memset(p, v, n) -> llvm.memset(p, v, n, 1)
@@ -1521,7 +1570,7 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilder<> &B) {
// printf("x") -> putchar('x'), even for '%'.
if (FormatStr.size() == 1) {
- Value *Res = EmitPutChar(B.getInt32(FormatStr[0]), B, DL, TLI);
+ Value *Res = EmitPutChar(B.getInt32(FormatStr[0]), B, TLI);
if (CI->use_empty() || !Res)
return Res;
return B.CreateIntCast(Res, CI->getType(), true);
@@ -1534,7 +1583,7 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilder<> &B) {
// pass to be run after this pass, to merge duplicate strings.
FormatStr = FormatStr.drop_back();
Value *GV = B.CreateGlobalString(FormatStr, "str");
- Value *NewCI = EmitPutS(GV, B, DL, TLI);
+ Value *NewCI = EmitPutS(GV, B, TLI);
return (CI->use_empty() || !NewCI)
? NewCI
: ConstantInt::get(CI->getType(), FormatStr.size() + 1);
@@ -1544,7 +1593,7 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilder<> &B) {
// printf("%c", chr) --> putchar(chr)
if (FormatStr == "%c" && CI->getNumArgOperands() > 1 &&
CI->getArgOperand(1)->getType()->isIntegerTy()) {
- Value *Res = EmitPutChar(CI->getArgOperand(1), B, DL, TLI);
+ Value *Res = EmitPutChar(CI->getArgOperand(1), B, TLI);
if (CI->use_empty() || !Res)
return Res;
@@ -1554,7 +1603,7 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilder<> &B) {
// printf("%s\n", str) --> puts(str)
if (FormatStr == "%s\n" && CI->getNumArgOperands() > 1 &&
CI->getArgOperand(1)->getType()->isPointerTy()) {
- return EmitPutS(CI->getArgOperand(1), B, DL, TLI);
+ return EmitPutS(CI->getArgOperand(1), B, TLI);
}
return nullptr;
}
@@ -1600,16 +1649,11 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) {
if (FormatStr[i] == '%')
return nullptr; // we found a format specifier, bail out.
- // These optimizations require DataLayout.
- if (!DL)
- return nullptr;
-
// sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1)
- B.CreateMemCpy(
- CI->getArgOperand(0), CI->getArgOperand(1),
- ConstantInt::get(DL->getIntPtrType(CI->getContext()),
- FormatStr.size() + 1),
- 1); // Copy the null byte.
+ B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()),
+ FormatStr.size() + 1),
+ 1); // Copy the null byte.
return ConstantInt::get(CI->getType(), FormatStr.size());
}
@@ -1634,10 +1678,6 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) {
}
if (FormatStr[1] == 's') {
- // These optimizations require DataLayout.
- if (!DL)
- return nullptr;
-
// sprintf(dest, "%s", str) -> llvm.memcpy(dest, str, strlen(str)+1, 1)
if (!CI->getArgOperand(2)->getType()->isPointerTy())
return nullptr;
@@ -1702,13 +1742,9 @@ Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, IRBuilder<> &B) {
if (FormatStr[i] == '%') // Could handle %% -> % if we cared.
return nullptr; // We found a format specifier.
- // These optimizations require DataLayout.
- if (!DL)
- return nullptr;
-
return EmitFWrite(
CI->getArgOperand(1),
- ConstantInt::get(DL->getIntPtrType(CI->getContext()), FormatStr.size()),
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), FormatStr.size()),
CI->getArgOperand(0), B, DL, TLI);
}
@@ -1723,14 +1759,14 @@ Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, IRBuilder<> &B) {
// fprintf(F, "%c", chr) --> fputc(chr, F)
if (!CI->getArgOperand(2)->getType()->isIntegerTy())
return nullptr;
- return EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, DL, TLI);
+ return EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI);
}
if (FormatStr[1] == 's') {
// fprintf(F, "%s", str) --> fputs(str, F)
if (!CI->getArgOperand(2)->getType()->isPointerTy())
return nullptr;
- return EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, DL, TLI);
+ return EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI);
}
return nullptr;
}
@@ -1790,7 +1826,7 @@ Value *LibCallSimplifier::optimizeFWrite(CallInst *CI, IRBuilder<> &B) {
// This optimisation is only valid if the return value is unused.
if (Bytes == 1 && CI->use_empty()) { // fwrite(S,1,1,F) -> fputc(S[0],F)
Value *Char = B.CreateLoad(CastToCStr(CI->getArgOperand(0), B), "char");
- Value *NewCI = EmitFPutC(Char, CI->getArgOperand(3), B, DL, TLI);
+ Value *NewCI = EmitFPutC(Char, CI->getArgOperand(3), B, TLI);
return NewCI ? ConstantInt::get(CI->getType(), 1) : nullptr;
}
@@ -1802,10 +1838,6 @@ Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
- // These optimizations require DataLayout.
- if (!DL)
- return nullptr;
-
// Require two pointers. Also, we can't optimize if return value is used.
FunctionType *FT = Callee->getFunctionType();
if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
@@ -1820,7 +1852,7 @@ Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilder<> &B) {
// Known to have no uses (see above).
return EmitFWrite(
CI->getArgOperand(0),
- ConstantInt::get(DL->getIntPtrType(CI->getContext()), Len - 1),
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len - 1),
CI->getArgOperand(1), B, DL, TLI);
}
@@ -1839,7 +1871,7 @@ Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilder<> &B) {
if (Str.empty() && CI->use_empty()) {
// puts("") -> putchar('\n')
- Value *Res = EmitPutChar(B.getInt32('\n'), B, DL, TLI);
+ Value *Res = EmitPutChar(B.getInt32('\n'), B, TLI);
if (CI->use_empty() || !Res)
return Res;
return B.CreateIntCast(Res, CI->getType(), true);
@@ -1906,6 +1938,8 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI,
return optimizeStrCSpn(CI, Builder);
case LibFunc::strstr:
return optimizeStrStr(CI, Builder);
+ case LibFunc::memchr:
+ return optimizeMemChr(CI, Builder);
case LibFunc::memcmp:
return optimizeMemCmp(CI, Builder);
case LibFunc::memcpy:
@@ -2089,9 +2123,9 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
}
LibCallSimplifier::LibCallSimplifier(
- const DataLayout *DL, const TargetLibraryInfo *TLI,
+ const DataLayout &DL, const TargetLibraryInfo *TLI,
function_ref<void(Instruction *, Value *)> Replacer)
- : FortifiedSimplifier(DL, TLI), DL(DL), TLI(TLI), UnsafeFPShrink(false),
+ : FortifiedSimplifier(TLI), DL(DL), TLI(TLI), UnsafeFPShrink(false),
Replacer(Replacer) {}
void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) {
@@ -2187,7 +2221,7 @@ bool FortifiedLibCallSimplifier::isFortifiedCallFoldable(CallInst *CI,
Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI, IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
- if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memcpy_chk, DL))
+ if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memcpy_chk))
return nullptr;
if (isFortifiedCallFoldable(CI, 3, 2, false)) {
@@ -2201,7 +2235,7 @@ Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI, IRBuilder<> &
Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI, IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
- if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memmove_chk, DL))
+ if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memmove_chk))
return nullptr;
if (isFortifiedCallFoldable(CI, 3, 2, false)) {
@@ -2215,7 +2249,7 @@ Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI, IRBuilder<>
Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI, IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
- if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memset_chk, DL))
+ if (!checkStringCopyLibFuncSignature(Callee, LibFunc::memset_chk))
return nullptr;
if (isFortifiedCallFoldable(CI, 3, 2, false)) {
@@ -2231,8 +2265,9 @@ Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI,
LibFunc::Func Func) {
Function *Callee = CI->getCalledFunction();
StringRef Name = Callee->getName();
+ const DataLayout &DL = CI->getModule()->getDataLayout();
- if (!checkStringCopyLibFuncSignature(Callee, Func, DL))
+ if (!checkStringCopyLibFuncSignature(Callee, Func))
return nullptr;
Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1),
@@ -2250,7 +2285,7 @@ Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI,
// TODO: It might be nice to get a maximum length out of the possible
// string lengths for varying strings.
if (isFortifiedCallFoldable(CI, 2, 1, true)) {
- Value *Ret = EmitStrCpy(Dst, Src, B, DL, TLI, Name.substr(2, 6));
+ Value *Ret = EmitStrCpy(Dst, Src, B, TLI, Name.substr(2, 6));
return Ret;
} else if (!OnlyLowerUnknownSize) {
// Maybe we can still fold __st[rp]cpy_chk to __memcpy_chk.
@@ -2258,11 +2293,7 @@ Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI,
if (Len == 0)
return nullptr;
- // This optimization requires DataLayout.
- if (!DL)
- return nullptr;
-
- Type *SizeTTy = DL->getIntPtrType(CI->getContext());
+ Type *SizeTTy = DL.getIntPtrType(CI->getContext());
Value *LenV = ConstantInt::get(SizeTTy, Len);
Value *Ret = EmitMemCpyChk(Dst, Src, LenV, ObjSize, B, DL, TLI);
// If the function was an __stpcpy_chk, and we were able to fold it into
@@ -2280,12 +2311,11 @@ Value *FortifiedLibCallSimplifier::optimizeStrpNCpyChk(CallInst *CI,
Function *Callee = CI->getCalledFunction();
StringRef Name = Callee->getName();
- if (!checkStringCopyLibFuncSignature(Callee, Func, DL))
+ if (!checkStringCopyLibFuncSignature(Callee, Func))
return nullptr;
if (isFortifiedCallFoldable(CI, 3, 2, false)) {
- Value *Ret =
- EmitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), B, DL, TLI, Name.substr(2, 7));
+ Value *Ret = EmitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), B, TLI, Name.substr(2, 7));
return Ret;
}
return nullptr;
@@ -2328,8 +2358,6 @@ Value *FortifiedLibCallSimplifier::optimizeCall(CallInst *CI) {
return nullptr;
}
-FortifiedLibCallSimplifier::
-FortifiedLibCallSimplifier(const DataLayout *DL, const TargetLibraryInfo *TLI,
- bool OnlyLowerUnknownSize)
- : DL(DL), TLI(TLI), OnlyLowerUnknownSize(OnlyLowerUnknownSize) {
-}
+FortifiedLibCallSimplifier::FortifiedLibCallSimplifier(
+ const TargetLibraryInfo *TLI, bool OnlyLowerUnknownSize)
+ : TLI(TLI), OnlyLowerUnknownSize(OnlyLowerUnknownSize) {}
diff --git a/lib/Transforms/Utils/SymbolRewriter.cpp b/lib/Transforms/Utils/SymbolRewriter.cpp
index b343cc4..a2a54da 100644
--- a/lib/Transforms/Utils/SymbolRewriter.cpp
+++ b/lib/Transforms/Utils/SymbolRewriter.cpp
@@ -60,6 +60,7 @@
#define DEBUG_TYPE "symbol-rewriter"
#include "llvm/CodeGen/Passes.h"
#include "llvm/Pass.h"
+#include "llvm/ADT/SmallString.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -72,15 +73,15 @@
#include "llvm/Transforms/Utils/SymbolRewriter.h"
using namespace llvm;
+using namespace SymbolRewriter;
static cl::list<std::string> RewriteMapFiles("rewrite-map-file",
cl::desc("Symbol Rewrite Map"),
cl::value_desc("filename"));
-namespace llvm {
-namespace SymbolRewriter {
-void rewriteComdat(Module &M, GlobalObject *GO, const std::string &Source,
- const std::string &Target) {
+static void rewriteComdat(Module &M, GlobalObject *GO,
+ const std::string &Source,
+ const std::string &Target) {
if (Comdat *CD = GO->getComdat()) {
auto &Comdats = M.getComdatSymbolTable();
@@ -92,6 +93,7 @@ void rewriteComdat(Module &M, GlobalObject *GO, const std::string &Source,
}
}
+namespace {
template <RewriteDescriptor::Type DT, typename ValueType,
ValueType *(llvm::Module::*Get)(StringRef) const>
class ExplicitRewriteDescriptor : public RewriteDescriptor {
@@ -226,6 +228,7 @@ typedef PatternRewriteDescriptor<RewriteDescriptor::Type::NamedAlias,
&llvm::Module::getNamedAlias,
&llvm::Module::aliases>
PatternRewriteNamedAliasDescriptor;
+} // namespace
bool RewriteMapParser::parse(const std::string &MapFile,
RewriteDescriptorList *DL) {
@@ -489,8 +492,6 @@ parseRewriteGlobalAliasDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
return true;
}
-}
-}
namespace {
class RewriteSymbols : public ModulePass {
diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp
index 49c0902..54c7688 100644
--- a/lib/Transforms/Utils/ValueMapper.cpp
+++ b/lib/Transforms/Utils/ValueMapper.cpp
@@ -291,14 +291,18 @@ static Metadata *MapMetadataImpl(const Metadata *MD,
return nullptr;
}
+ // Note: this cast precedes the Flags check so we always get its associated
+ // assertion.
const MDNode *Node = cast<MDNode>(MD);
- assert(Node->isResolved() && "Unexpected unresolved node");
// If this is a module-level metadata and we know that nothing at the
// module level is changing, then use an identity mapping.
if (Flags & RF_NoModuleLevelChanges)
return mapToSelf(VM, MD);
+ // Require resolved nodes whenever metadata might be remapped.
+ assert(Node->isResolved() && "Unexpected unresolved node");
+
if (Node->isDistinct())
return mapDistinctNode(Node, Cycles, VM, Flags, TypeMapper, Materializer);
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp
index 525c050..29fb01f 100644
--- a/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -39,6 +39,7 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
@@ -206,8 +207,6 @@ namespace {
AA = &P->getAnalysis<AliasAnalysis>();
DT = &P->getAnalysis<DominatorTreeWrapperPass>().getDomTree();
SE = &P->getAnalysis<ScalarEvolution>();
- DataLayoutPass *DLP = P->getAnalysisIfAvailable<DataLayoutPass>();
- DL = DLP ? &DLP->getDataLayout() : nullptr;
TTI = IgnoreTargetInfo
? nullptr
: &P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
@@ -222,7 +221,6 @@ namespace {
AliasAnalysis *AA;
DominatorTree *DT;
ScalarEvolution *SE;
- const DataLayout *DL;
const TargetTransformInfo *TTI;
// FIXME: const correct?
@@ -442,8 +440,6 @@ namespace {
AA = &getAnalysis<AliasAnalysis>();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
SE = &getAnalysis<ScalarEvolution>();
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- DL = DLP ? &DLP->getDataLayout() : nullptr;
TTI = IgnoreTargetInfo
? nullptr
: &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
@@ -642,19 +638,19 @@ namespace {
dyn_cast<SCEVConstant>(OffsetSCEV)) {
ConstantInt *IntOff = ConstOffSCEV->getValue();
int64_t Offset = IntOff->getSExtValue();
-
+ const DataLayout &DL = I->getModule()->getDataLayout();
Type *VTy = IPtr->getType()->getPointerElementType();
- int64_t VTyTSS = (int64_t) DL->getTypeStoreSize(VTy);
+ int64_t VTyTSS = (int64_t)DL.getTypeStoreSize(VTy);
Type *VTy2 = JPtr->getType()->getPointerElementType();
if (VTy != VTy2 && Offset < 0) {
- int64_t VTy2TSS = (int64_t) DL->getTypeStoreSize(VTy2);
+ int64_t VTy2TSS = (int64_t)DL.getTypeStoreSize(VTy2);
OffsetInElmts = Offset/VTy2TSS;
- return (abs64(Offset) % VTy2TSS) == 0;
+ return (std::abs(Offset) % VTy2TSS) == 0;
}
OffsetInElmts = Offset/VTyTSS;
- return (abs64(Offset) % VTyTSS) == 0;
+ return (std::abs(Offset) % VTyTSS) == 0;
}
return false;
@@ -846,7 +842,7 @@ namespace {
// It is important to cleanup here so that future iterations of this
// function have less work to do.
- (void) SimplifyInstructionsInBlock(&BB, DL, AA->getTargetLibraryInfo());
+ (void)SimplifyInstructionsInBlock(&BB, AA->getTargetLibraryInfo());
return true;
}
@@ -900,10 +896,6 @@ namespace {
return false;
}
- // We can't vectorize memory operations without target data
- if (!DL && IsSimpleLoadStore)
- return false;
-
Type *T1, *T2;
getInstructionTypes(I, T1, T2);
@@ -938,9 +930,8 @@ namespace {
if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy())
return false;
- if ((!Config.VectorizePointers || !DL) &&
- (T1->getScalarType()->isPointerTy() ||
- T2->getScalarType()->isPointerTy()))
+ if (!Config.VectorizePointers && (T1->getScalarType()->isPointerTy() ||
+ T2->getScalarType()->isPointerTy()))
return false;
if (!TTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits ||
@@ -985,8 +976,8 @@ namespace {
unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
int64_t OffsetInElmts = 0;
if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
- IAddressSpace, JAddressSpace,
- OffsetInElmts) && abs64(OffsetInElmts) == 1) {
+ IAddressSpace, JAddressSpace, OffsetInElmts) &&
+ std::abs(OffsetInElmts) == 1) {
FixedOrder = (int) OffsetInElmts;
unsigned BottomAlignment = IAlignment;
if (OffsetInElmts < 0) BottomAlignment = JAlignment;
@@ -1001,8 +992,8 @@ namespace {
// An aligned load or store is possible only if the instruction
// with the lower offset has an alignment suitable for the
// vector type.
-
- unsigned VecAlignment = DL->getPrefTypeAlignment(VType);
+ const DataLayout &DL = I->getModule()->getDataLayout();
+ unsigned VecAlignment = DL.getPrefTypeAlignment(VType);
if (BottomAlignment < VecAlignment)
return false;
}
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6142306..b7d0ae4 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -218,6 +218,15 @@ public:
R.getInstr()) {}
};
+/// A helper function for converting Scalar types to vector types.
+/// If the incoming type is void, we return void. If the VF is 1, we return
+/// the scalar type.
+static Type* ToVectorTy(Type *Scalar, unsigned VF) {
+ if (Scalar->isVoidTy() || VF == 1)
+ return Scalar;
+ return VectorType::get(Scalar, VF);
+}
+
/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
@@ -235,13 +244,13 @@ public:
class InnerLoopVectorizer {
public:
InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
- DominatorTree *DT, const DataLayout *DL,
- const TargetLibraryInfo *TLI, unsigned VecWidth,
+ DominatorTree *DT, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI, unsigned VecWidth,
unsigned UnrollFactor)
- : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI),
+ : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()),
Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor),
- Legal(nullptr) {}
+ Legal(nullptr), AddedSafetyChecks(false) {}
// Perform the actual loop widening (vectorization).
void vectorize(LoopVectorizationLegality *L) {
@@ -255,6 +264,11 @@ public:
updateAnalysis();
}
+ // Return true if any runtime check is added.
+ bool IsSafetyChecksAdded() {
+ return AddedSafetyChecks;
+ }
+
virtual ~InnerLoopVectorizer() {}
protected:
@@ -389,10 +403,10 @@ protected:
DominatorTree *DT;
/// Alias Analysis.
AliasAnalysis *AA;
- /// Data Layout.
- const DataLayout *DL;
/// Target Library Info.
const TargetLibraryInfo *TLI;
+ /// Target Transform Info.
+ const TargetTransformInfo *TTI;
/// The vectorization SIMD factor to use. Each vector will have this many
/// vector elements.
@@ -434,14 +448,17 @@ protected:
EdgeMaskCache MaskCache;
LoopVectorizationLegality *Legal;
+
+ // Records whether any runtime safety check has been added.
+ bool AddedSafetyChecks;
};
class InnerLoopUnroller : public InnerLoopVectorizer {
public:
InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
- DominatorTree *DT, const DataLayout *DL,
- const TargetLibraryInfo *TLI, unsigned UnrollFactor) :
- InnerLoopVectorizer(OrigLoop, SE, LI, DT, DL, TLI, 1, UnrollFactor) { }
+ DominatorTree *DT, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI, unsigned UnrollFactor)
+ : InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, 1, UnrollFactor) {}
private:
void scalarizeInstruction(Instruction *Instr,
@@ -488,7 +505,7 @@ static std::string getDebugLocString(const Loop *L) {
raw_string_ostream OS(Result);
const DebugLoc LoopDbgLoc = L->getStartLoc();
if (!LoopDbgLoc.isUnknown())
- LoopDbgLoc.print(L->getHeader()->getContext(), OS);
+ LoopDbgLoc.print(OS);
else
// Just print the module name.
OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
@@ -543,14 +560,13 @@ static void propagateMetadata(SmallVectorImpl<Value *> &To, const Instruction *F
/// induction variable and the different reduction variables.
class LoopVectorizationLegality {
public:
- LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL,
- DominatorTree *DT, TargetLibraryInfo *TLI,
- AliasAnalysis *AA, Function *F,
- const TargetTransformInfo *TTI,
+ LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DominatorTree *DT,
+ TargetLibraryInfo *TLI, AliasAnalysis *AA,
+ Function *F, const TargetTransformInfo *TTI,
LoopAccessAnalysis *LAA)
- : NumPredStores(0), TheLoop(L), SE(SE), DL(DL),
- TLI(TLI), TheFunction(F), TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr),
- Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false) {}
+ : NumPredStores(0), TheLoop(L), SE(SE), TLI(TLI), TheFunction(F),
+ TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), Induction(nullptr),
+ WidestIndTy(nullptr), HasFunNoNaNAttr(false) {}
/// This enum represents the kinds of reductions that we support.
enum ReductionKind {
@@ -842,8 +858,6 @@ private:
Loop *TheLoop;
/// Scev analysis.
ScalarEvolution *SE;
- /// DataLayout analysis.
- const DataLayout *DL;
/// Target Library Info.
TargetLibraryInfo *TLI;
/// Parent function
@@ -884,7 +898,7 @@ private:
ValueToValueMap Strides;
SmallPtrSet<Value *, 8> StrideSet;
-
+
/// While vectorizing these instructions we have to generate a
/// call to the appropriate masked intrinsic
SmallPtrSet<const Instruction*, 8> MaskedOp;
@@ -902,10 +916,9 @@ public:
LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
LoopVectorizationLegality *Legal,
const TargetTransformInfo &TTI,
- const DataLayout *DL, const TargetLibraryInfo *TLI,
- AssumptionCache *AC, const Function *F,
- const LoopVectorizeHints *Hints)
- : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI),
+ const TargetLibraryInfo *TLI, AssumptionCache *AC,
+ const Function *F, const LoopVectorizeHints *Hints)
+ : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI),
TheFunction(F), Hints(Hints) {
CodeMetrics::collectEphemeralValues(L, AC, EphValues);
}
@@ -958,11 +971,6 @@ private:
/// width. Vector width of one means scalar.
unsigned getInstructionCost(Instruction *I, unsigned VF);
- /// A helper function for converting Scalar types to vector types.
- /// If the incoming type is void, we return void. If the VF is 1, we return
- /// the scalar type.
- static Type* ToVectorTy(Type *Scalar, unsigned VF);
-
/// Returns whether the instruction is a load or store and will be a emitted
/// as a vector operation.
bool isConsecutiveLoadOrStore(Instruction *I);
@@ -988,8 +996,6 @@ private:
LoopVectorizationLegality *Legal;
/// Vector target information.
const TargetTransformInfo &TTI;
- /// Target data layout information.
- const DataLayout *DL;
/// Target Library Info.
const TargetLibraryInfo *TLI;
const Function *TheFunction;
@@ -1254,7 +1260,6 @@ struct LoopVectorize : public FunctionPass {
}
ScalarEvolution *SE;
- const DataLayout *DL;
LoopInfo *LI;
TargetTransformInfo *TTI;
DominatorTree *DT;
@@ -1270,8 +1275,6 @@ struct LoopVectorize : public FunctionPass {
bool runOnFunction(Function &F) override {
SE = &getAnalysis<ScalarEvolution>();
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- DL = DLP ? &DLP->getDataLayout() : nullptr;
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
@@ -1292,12 +1295,6 @@ struct LoopVectorize : public FunctionPass {
if (!TTI->getNumberOfRegisters(true))
return false;
- if (!DL) {
- DEBUG(dbgs() << "\nLV: Not vectorizing " << F.getName()
- << ": Missing data layout\n");
- return false;
- }
-
// Build up a worklist of inner-loops to vectorize. This is necessary as
// the act of vectorizing or partially unrolling a loop creates new loops
// and can invalidate iterators across the loops.
@@ -1317,6 +1314,40 @@ struct LoopVectorize : public FunctionPass {
return Changed;
}
+ static void AddRuntimeUnrollDisableMetaData(Loop *L) {
+ SmallVector<Metadata *, 4> MDs;
+ // Reserve first location for self reference to the LoopID metadata node.
+ MDs.push_back(nullptr);
+ bool IsUnrollMetadata = false;
+ MDNode *LoopID = L->getLoopID();
+ if (LoopID) {
+ // First find existing loop unrolling disable metadata.
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+ if (MD) {
+ const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
+ IsUnrollMetadata =
+ S && S->getString().startswith("llvm.loop.unroll.disable");
+ }
+ MDs.push_back(LoopID->getOperand(i));
+ }
+ }
+
+ if (!IsUnrollMetadata) {
+ // Add runtime unroll disable metadata.
+ LLVMContext &Context = L->getHeader()->getContext();
+ SmallVector<Metadata *, 1> DisableOperands;
+ DisableOperands.push_back(
+ MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
+ MDNode *DisableNode = MDNode::get(Context, DisableOperands);
+ MDs.push_back(DisableNode);
+ MDNode *NewLoopID = MDNode::get(Context, MDs);
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+ L->setLoopID(NewLoopID);
+ }
+ }
+
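
The node built here follows the self-referential loop-ID convention (operand 0 points back at the node itself). As a hedged sketch of how a consumer could detect the hint, mirroring the scan above (this reader is illustrative, not part of the patch):

  static bool runtimeUnrollDisabled(Loop *L) {
    MDNode *LoopID = L->getLoopID();
    if (!LoopID)
      return false;
    // Skip operand 0, the self-reference; hints live in the remaining operands.
    for (unsigned i = 1, e = LoopID->getNumOperands(); i != e; ++i)
      if (MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i)))
        if (const MDString *S = dyn_cast<MDString>(MD->getOperand(0)))
          if (S->getString().startswith("llvm.loop.unroll.runtime.disable"))
            return true;
    return false;
  }
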
bool processLoop(Loop *L) {
assert(L->empty() && "Only process inner loops.");
@@ -1391,7 +1422,7 @@ struct LoopVectorize : public FunctionPass {
}
// Check if it is legal to vectorize the loop.
- LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, AA, F, TTI, LAA);
+ LoopVectorizationLegality LVL(L, SE, DT, TLI, AA, F, TTI, LAA);
if (!LVL.canVectorize()) {
DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
emitMissedWarning(F, L, Hints);
@@ -1399,8 +1430,7 @@ struct LoopVectorize : public FunctionPass {
}
// Use the cost model.
- LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI, AC, F,
- &Hints);
+ LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, TLI, AC, F, &Hints);
// Check the function attributes to find out if this function should be
// optimized for size.
@@ -1464,14 +1494,20 @@ struct LoopVectorize : public FunctionPass {
// We decided not to vectorize, but we may want to unroll.
- InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF);
+ InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, UF);
Unroller.vectorize(&LVL);
} else {
// If we decided that it is *legal* to vectorize the loop then do it.
- InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
+ InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, UF);
LB.vectorize(&LVL);
++LoopsVectorized;
+ // Add metadata to disable runtime unrolling of the scalar loop when
+ // there are no runtime checks for strides or memory: in that case the
+ // scalar loop is rarely executed and is not worth unrolling.
+ if (!LB.IsSafetyChecksAdded())
+ AddRuntimeUnrollDisableMetaData(L);
+
// Report the vectorization decision.
emitOptimizationRemark(
F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
@@ -1561,10 +1597,10 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
/// \brief Find the operand of the GEP that should be checked for consecutive
/// stores. This ignores trailing indices that have no effect on the final
/// pointer.
-static unsigned getGEPInductionOperand(const DataLayout *DL,
- const GetElementPtrInst *Gep) {
+static unsigned getGEPInductionOperand(const GetElementPtrInst *Gep) {
+ const DataLayout &DL = Gep->getModule()->getDataLayout();
unsigned LastOperand = Gep->getNumOperands() - 1;
- unsigned GEPAllocSize = DL->getTypeAllocSize(
+ unsigned GEPAllocSize = DL.getTypeAllocSize(
cast<PointerType>(Gep->getType()->getScalarType())->getElementType());
// Walk backwards and try to peel off zeros.
@@ -1575,7 +1611,7 @@ static unsigned getGEPInductionOperand(const DataLayout *DL,
// If it's a type with the same allocation size as the result of the GEP we
// can peel off the zero index.
- if (DL->getTypeAllocSize(*GEPTI) != GEPAllocSize)
+ if (DL.getTypeAllocSize(*GEPTI) != GEPAllocSize)
break;
--LastOperand;
}
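
A hedged C-level illustration of the peeling rule (4-byte floats assumed):

  // For the access pattern below, the GEP has indices (i, 0); the trailing
  // zero adds nothing to the address, and [1 x float] has the same 4-byte
  // alloc size as the float result, so the zero index is peeled and 'i' is
  // reported as the induction operand.
  static float get(float (*A)[1], long i) { return A[i][0]; }
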
@@ -1621,7 +1657,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
return II.getConsecutiveDirection();
}
- unsigned InductionOperand = getGEPInductionOperand(DL, Gep);
+ unsigned InductionOperand = getGEPInductionOperand(Gep);
// Check that all of the gep indices are uniform except for our induction
// operand.
@@ -1714,11 +1750,12 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
  // An alignment of 0 means target ABI alignment. We need to use the scalar's
  // target ABI alignment in such a case.
+ const DataLayout &DL = Instr->getModule()->getDataLayout();
if (!Alignment)
- Alignment = DL->getABITypeAlignment(ScalarDataTy);
+ Alignment = DL.getABITypeAlignment(ScalarDataTy);
unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
- unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
- unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;
+ unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ScalarDataTy);
+ unsigned VectorElementSize = DL.getTypeStoreSize(DataTy) / VF;
if (SI && Legal->blockNeedsPredication(SI->getParent()) &&
!Legal->isMaskRequired(SI))
@@ -1759,7 +1796,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
// The last index does not have to be the induction. It can be
// consecutive and be a function of the index. For example A[I+1];
unsigned NumOperands = Gep->getNumOperands();
- unsigned InductionOperand = getGEPInductionOperand(DL, Gep);
+ unsigned InductionOperand = getGEPInductionOperand(Gep);
// Create the new GEP with the new induction variable.
GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
@@ -2080,9 +2117,11 @@ void InnerLoopVectorizer::createEmptyLoop() {
ExitCount = SE->getAddExpr(BackedgeTakeCount,
SE->getConstant(BackedgeTakeCount->getType(), 1));
+ const DataLayout &DL = OldBasicBlock->getModule()->getDataLayout();
+
// Expand the trip count and place the new instructions in the preheader.
// Notice that the pre-header does not change, only the loop body.
- SCEVExpander Exp(*SE, "induction");
+ SCEVExpander Exp(*SE, DL, "induction");
// We need to test whether the backedge-taken count is uint##_max. Adding one
// to it will cause overflow and an incorrect loop trip count in the vector
@@ -2218,6 +2257,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
std::tie(FirstCheckInst, StrideCheck) =
addStrideCheck(LastBypassBlock->getTerminator());
if (StrideCheck) {
+ AddedSafetyChecks = true;
// Create a new block containing the stride check.
BasicBlock *CheckBlock =
LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck");
@@ -2242,6 +2282,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
std::tie(FirstCheckInst, MemRuntimeCheck) =
Legal->getLAI()->addRuntimeCheck(LastBypassBlock->getTerminator());
if (MemRuntimeCheck) {
+ AddedSafetyChecks = true;
// Create a new block containing the memory check.
BasicBlock *CheckBlock =
LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.memcheck");
@@ -2480,10 +2521,9 @@ getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) {
}
}
-Value *createMinMaxOp(IRBuilder<> &Builder,
- LoopVectorizationLegality::MinMaxReductionKind RK,
- Value *Left,
- Value *Right) {
+static Value *createMinMaxOp(IRBuilder<> &Builder,
+ LoopVectorizationLegality::MinMaxReductionKind RK,
+ Value *Left, Value *Right) {
CmpInst::Predicate P = CmpInst::ICMP_NE;
switch (RK) {
default:
@@ -2594,6 +2634,95 @@ static Value *addFastMathFlag(Value *V) {
return V;
}
+/// Estimate the overhead of scalarizing a value. Insert and Extract are set if
+/// the result needs to be inserted and/or extracted from vectors.
+static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract,
+ const TargetTransformInfo &TTI) {
+ if (Ty->isVoidTy())
+ return 0;
+
+ assert(Ty->isVectorTy() && "Can only scalarize vectors");
+ unsigned Cost = 0;
+
+ for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
+ if (Insert)
+ Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, i);
+ if (Extract)
+ Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, i);
+ }
+
+ return Cost;
+}
+
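
A standalone analogue with a uniform per-lane cost (an assumption; the real per-lane numbers come from TTI.getVectorInstrCost and are target-dependent):

  static unsigned scalarizationOverhead(unsigned NumElts, bool Insert,
                                        bool Extract, unsigned PerLaneCost) {
    unsigned Cost = 0;
    for (unsigned i = 0; i < NumElts; ++i) {
      if (Insert)  Cost += PerLaneCost; // one insertelement per produced lane
      if (Extract) Cost += PerLaneCost; // one extractelement per consumed lane
    }
    return Cost;                        // e.g. (4, true, true, 1) -> 8
  }
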
+// Estimate cost of a call instruction CI if it were vectorized with factor VF.
+// Return the cost of the instruction, including scalarization overhead if it's
+// needed. The flag NeedToScalarize indicates whether the call needs to be
+// scalarized, i.e. either a vector version isn't available or it is too
+// expensive.
+static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
+ const TargetTransformInfo &TTI,
+ const TargetLibraryInfo *TLI,
+ bool &NeedToScalarize) {
+ Function *F = CI->getCalledFunction();
+ StringRef FnName = CI->getCalledFunction()->getName();
+ Type *ScalarRetTy = CI->getType();
+ SmallVector<Type *, 4> Tys, ScalarTys;
+ for (auto &ArgOp : CI->arg_operands())
+ ScalarTys.push_back(ArgOp->getType());
+
+ // Estimate cost of scalarized vector call. The source operands are assumed
+ // to be vectors, so we need to extract individual elements from there,
+ // execute VF scalar calls, and then gather the result into the vector return
+ // value.
+ unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
+ if (VF == 1)
+ return ScalarCallCost;
+
+ // Compute corresponding vector type for return value and arguments.
+ Type *RetTy = ToVectorTy(ScalarRetTy, VF);
+ for (unsigned i = 0, ie = ScalarTys.size(); i != ie; ++i)
+ Tys.push_back(ToVectorTy(ScalarTys[i], VF));
+
+ // Compute costs of unpacking argument values for the scalar calls and
+ // packing the return values to a vector.
+ unsigned ScalarizationCost =
+ getScalarizationOverhead(RetTy, true, false, TTI);
+ for (unsigned i = 0, ie = Tys.size(); i != ie; ++i)
+ ScalarizationCost += getScalarizationOverhead(Tys[i], false, true, TTI);
+
+ unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
+
+ // If we can't emit a vector call for this function, then the currently found
+ // cost is the cost we need to return.
+ NeedToScalarize = true;
+ if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
+ return Cost;
+
+ // If the corresponding vector cost is cheaper, return its cost.
+ unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
+ if (VectorCallCost < Cost) {
+ NeedToScalarize = false;
+ return VectorCallCost;
+ }
+ return Cost;
+}
+
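
To make the trade-off concrete, a worked example with hypothetical costs (real values come from TTI and the TLI mapping):

  unsigned VF = 4, ScalarCallCost = 10, ScalarizationCost = 8;       // assumed
  unsigned ScalarizedCost = ScalarCallCost * VF + ScalarizationCost; // 48
  unsigned VectorCallCost = 20;   // assumed cost of the mapped vector routine
  bool NeedToScalarize = !(VectorCallCost < ScalarizedCost); // false: 20 < 48,
  // so the vector library call is chosen and its cost of 20 is returned.
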
+// Estimate cost of an intrinsic call instruction CI if it were vectorized with
+// factor VF. Return the cost of the instruction, including scalarization
+// overhead if it's needed.
+static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
+ const TargetTransformInfo &TTI,
+ const TargetLibraryInfo *TLI) {
+ Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+ assert(ID && "Expected intrinsic call!");
+
+ Type *RetTy = ToVectorTy(CI->getType(), VF);
+ SmallVector<Type *, 4> Tys;
+ for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
+ Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
+
+ return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
+}
+
void InnerLoopVectorizer::vectorizeLoop() {
//===------------------------------------------------===//
//
@@ -3181,37 +3310,71 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
Module *M = BB->getParent()->getParent();
CallInst *CI = cast<CallInst>(it);
+
+ StringRef FnName = CI->getCalledFunction()->getName();
+ Function *F = CI->getCalledFunction();
+ Type *RetTy = ToVectorTy(CI->getType(), VF);
+ SmallVector<Type *, 4> Tys;
+ for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
+ Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
+
Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
- assert(ID && "Not an intrinsic call!");
- switch (ID) {
- case Intrinsic::assume:
- case Intrinsic::lifetime_end:
- case Intrinsic::lifetime_start:
+ if (ID &&
+ (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
+ ID == Intrinsic::lifetime_start)) {
scalarizeInstruction(it);
break;
- default:
- bool HasScalarOpd = hasVectorInstrinsicScalarOpd(ID, 1);
- for (unsigned Part = 0; Part < UF; ++Part) {
- SmallVector<Value *, 4> Args;
- for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
- if (HasScalarOpd && i == 1) {
- Args.push_back(CI->getArgOperand(i));
- continue;
- }
- VectorParts &Arg = getVectorValue(CI->getArgOperand(i));
- Args.push_back(Arg[Part]);
- }
- Type *Tys[] = {CI->getType()};
- if (VF > 1)
- Tys[0] = VectorType::get(CI->getType()->getScalarType(), VF);
+ }
+ // The flag records whether the vectorized form of the instruction is
+ // emitted as an intrinsic or as a plain (library) call, i.e. whether
+ // performing the intrinsic call is cheaper than the library call.
+ bool NeedToScalarize;
+ unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
+ bool UseVectorIntrinsic =
+ ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
+ if (!UseVectorIntrinsic && NeedToScalarize) {
+ scalarizeInstruction(it);
+ break;
+ }
- Function *F = Intrinsic::getDeclaration(M, ID, Tys);
- Entry[Part] = Builder.CreateCall(F, Args);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ SmallVector<Value *, 4> Args;
+ for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
+ Value *Arg = CI->getArgOperand(i);
+ // Some intrinsics have a scalar argument - don't replace it with a
+ // vector.
+ if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) {
+ VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i));
+ Arg = VectorArg[Part];
+ }
+ Args.push_back(Arg);
}
- propagateMetadata(Entry, it);
- break;
+ Function *VectorF;
+ if (UseVectorIntrinsic) {
+ // Use vector version of the intrinsic.
+ Type *TysForDecl[] = {CI->getType()};
+ if (VF > 1)
+ TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
+ VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
+ } else {
+ // Use vector version of the library call.
+ StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
+ assert(!VFnName.empty() && "Vector function name is empty.");
+ VectorF = M->getFunction(VFnName);
+ if (!VectorF) {
+ // Generate a declaration
+ FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
+ VectorF =
+ Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
+ VectorF->copyAttributesFrom(F);
+ }
+ }
+ assert(VectorF && "Can't create vector function.");
+ Entry[Part] = Builder.CreateCall(VectorF, Args);
}
+
+ propagateMetadata(Entry, it);
break;
}
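
The rewritten block chooses between two widened forms of the same scalar call. As a hedged illustration (the vsinf mapping is one example of a vector-library entry and depends on the configured TLI):

  // Scalar loop body:         y = sinf(x);
  // Intrinsic route, VF = 4:  call <4 x float> @llvm.sin.v4f32(<4 x float> %vx)
  // Library route, VF = 4:    call to the TLI-mapped routine, e.g. vsinf, with
  //                           a declaration created on demand and attributes
  //                           copied from the scalar sinf.
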
@@ -3463,6 +3626,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// Look for the attribute signaling the absence of NaNs.
Function &F = *Header->getParent();
+ const DataLayout &DL = F.getParent()->getDataLayout();
if (F.hasFnAttribute("no-nans-fp-math"))
HasFunNoNaNAttr =
F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
@@ -3518,9 +3682,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
if (IK_NoInduction != IK) {
// Get the widest type.
if (!WidestIndTy)
- WidestIndTy = convertPointerToIntegerType(*DL, PhiTy);
+ WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
else
- WidestIndTy = getWiderType(*DL, PhiTy, WidestIndTy);
+ WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
// Int inductions are special because we only allow one IV.
if (IK == IK_IntInduction && StepValue->isOne()) {
@@ -3591,13 +3755,17 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
return false;
}// end of PHI handling
- // We still don't handle functions. However, we can ignore dbg intrinsic
- // calls and we do handle certain intrinsic and libm functions.
+ // We handle calls that:
+ // * Are debug info intrinsics.
+ // * Have a mapping to an IR intrinsic.
+ // * Have a vector version available.
CallInst *CI = dyn_cast<CallInst>(it);
- if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) {
+ if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI) &&
+ !(CI->getCalledFunction() && TLI &&
+ TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
emitAnalysis(VectorizationReport(it) <<
"call instruction cannot be vectorized");
- DEBUG(dbgs() << "LV: Found a call site.\n");
+ DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
return false;
}
@@ -3665,13 +3833,12 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
///\brief Remove GEPs whose indices, except for the last one, are loop
/// invariant, and return the induction operand of the GEP pointer.
-static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE,
- const DataLayout *DL, Loop *Lp) {
+static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
if (!GEP)
return Ptr;
- unsigned InductionOperand = getGEPInductionOperand(DL, GEP);
+ unsigned InductionOperand = getGEPInductionOperand(GEP);
// Check that all of the gep indices are uniform except for our induction
// operand.
@@ -3700,8 +3867,7 @@ static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) {
///\brief Get the stride of a pointer access in a loop.
/// Looks for symbolic strides "a[i*stride]". Returns the symbolic stride as a
/// pointer to the Value, or null otherwise.
-static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE,
- const DataLayout *DL, Loop *Lp) {
+static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
const PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
if (!PtrTy || PtrTy->isAggregateType())
return nullptr;
@@ -3714,7 +3880,7 @@ static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE,
// The size of the pointer access.
int64_t PtrAccessSize = 1;
- Ptr = stripGetElementPtr(Ptr, SE, DL, Lp);
+ Ptr = stripGetElementPtr(Ptr, SE, Lp);
const SCEV *V = SE->getSCEV(Ptr);
if (Ptr != OrigPtr)
@@ -3733,7 +3899,8 @@ static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE,
// Strip off the size of access multiplication if we are still analyzing the
// pointer.
if (OrigPtr == Ptr) {
- DL->getTypeAllocSize(PtrTy->getElementType());
+ const DataLayout &DL = Lp->getHeader()->getModule()->getDataLayout();
+ DL.getTypeAllocSize(PtrTy->getElementType());
if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) {
if (M->getOperand(0)->getSCEVType() != scConstant)
return nullptr;
@@ -3785,7 +3952,7 @@ void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) {
else
return;
- Value *Stride = getStrideFromPointer(Ptr, SE, DL, TheLoop);
+ Value *Stride = getStrideFromPointer(Ptr, SE, TheLoop);
if (!Stride)
return;
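
The access pattern being recorded is the classic symbolic stride. A minimal C++ example of a loop that benefits (the runtime stride == 1 guard itself is emitted later from the collected Strides map):

  static void scale(float *out, const float *in, int n, int stride) {
    // 'stride' is loop-invariant but unknown at compile time; recording it
    // lets the vectorizer version the loop: a vectorized fast path guarded by
    // a runtime stride == 1 check, with the scalar loop kept as the fallback.
    for (int i = 0; i < n; ++i)
      out[i * stride] = in[i * stride] * 2.0f;
  }
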
@@ -3837,7 +4004,19 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
auto &OptionalReport = LAI->getReport();
if (OptionalReport)
emitAnalysis(VectorizationReport(*OptionalReport));
- return LAI->canVectorizeMemory();
+ if (!LAI->canVectorizeMemory())
+ return false;
+
+ if (LAI->getNumRuntimePointerChecks() >
+ VectorizerParams::RuntimeMemoryCheckThreshold) {
+ emitAnalysis(VectorizationReport()
+ << LAI->getNumRuntimePointerChecks() << " exceeds limit of "
+ << VectorizerParams::RuntimeMemoryCheckThreshold
+ << " dependent memory operations checked at runtime");
+ DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
+ return false;
+ }
+ return true;
}
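
The cap exists because pairwise pointer checks grow quadratically. A back-of-the-envelope with the era's default threshold of 8 (the value is configurable, so treat the exact number as an assumption):

  // N pointers in distinct dependence sets need up to N*(N-1)/2 pairwise checks:
  //   N = 4 ->  6 checks: within the limit, vectorize with guards.
  //   N = 5 -> 10 checks: exceeds 8, so the new code bails out with a report.
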
static bool hasMultipleUsesOf(Instruction *I,
@@ -4163,7 +4342,8 @@ LoopVectorizationLegality::isInductionVariable(PHINode *Phi,
if (!PointerElementType->isSized())
return IK_NoInduction;
- int64_t Size = static_cast<int64_t>(DL->getTypeAllocSize(PointerElementType));
+ const DataLayout &DL = Phi->getModule()->getDataLayout();
+ int64_t Size = static_cast<int64_t>(DL.getTypeAllocSize(PointerElementType));
int64_t CVSize = CV->getSExtValue();
if (CVSize % Size)
return IK_NoInduction;
@@ -4375,6 +4555,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
unsigned LoopVectorizationCostModel::getWidestType() {
unsigned MaxWidth = 8;
+ const DataLayout &DL = TheFunction->getParent()->getDataLayout();
// For each block.
for (Loop::block_iterator bb = TheLoop->block_begin(),
@@ -4409,7 +4590,7 @@ unsigned LoopVectorizationCostModel::getWidestType() {
continue;
MaxWidth = std::max(MaxWidth,
- (unsigned)DL->getTypeSizeInBits(T->getScalarType()));
+ (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
}
}
@@ -4561,6 +4742,14 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
return SmallUF;
}
+ // Unroll if this is a large loop (small loops are already dealt with by this
+ // point) that could benefit from interleaved unrolling.
+ bool HasReductions = (Legal->getReductionVars()->size() > 0);
+ if (TTI.enableAggressiveInterleaving(HasReductions)) {
+ DEBUG(dbgs() << "LV: Unrolling to expose ILP.\n");
+ return UF;
+ }
+
DEBUG(dbgs() << "LV: Not Unrolling.\n");
return 1;
}
@@ -4898,8 +5087,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
// Scalarized loads/stores.
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
bool Reverse = ConsecutiveStride < 0;
- unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy);
- unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF;
+ const DataLayout &DL = I->getModule()->getDataLayout();
+ unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ValTy);
+ unsigned VectorElementSize = DL.getTypeStoreSize(VectorTy) / VF;
if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) {
bool IsComplexComputation =
isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
@@ -4960,14 +5150,12 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
}
case Instruction::Call: {
+ bool NeedToScalarize;
CallInst *CI = cast<CallInst>(I);
- Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
- assert(ID && "Not an intrinsic call!");
- Type *RetTy = ToVectorTy(CI->getType(), VF);
- SmallVector<Type*, 4> Tys;
- for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
- Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
- return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
+ unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize);
+ if (getIntrinsicIDForCall(CI, TLI))
+ return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
+ return CallCost;
}
default: {
// We are scalarizing the instruction. Return the cost of the scalar
@@ -4994,12 +5182,6 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
}// end of switch.
}
-Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
- if (Scalar->isVoidTy() || VF == 1)
- return Scalar;
- return VectorType::get(Scalar, VF);
-}
-
char LoopVectorize::ID = 0;
static const char lv_name[] = "Loop Vectorization";
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index baf9741..8fc4cc1 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -17,9 +17,9 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize.h"
#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/Optional.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
@@ -342,11 +342,11 @@ public:
typedef SmallPtrSet<Value *, 16> ValueSet;
typedef SmallVector<StoreInst *, 8> StoreList;
- BoUpSLP(Function *Func, ScalarEvolution *Se, const DataLayout *Dl,
- TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa,
- LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC)
+ BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
+ TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
+ DominatorTree *Dt, AssumptionCache *AC)
: NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
- SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
+ SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
Builder(Se->getContext()) {
CodeMetrics::collectEphemeralValues(F, AC, EphValues);
}
@@ -383,7 +383,7 @@ public:
}
/// \returns true if the memory operations A and B are consecutive.
- bool isConsecutiveAccess(Value *A, Value *B);
+ bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL);
/// \brief Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();
@@ -877,7 +877,6 @@ private:
// Analysis and block reference.
Function *F;
ScalarEvolution *SE;
- const DataLayout *DL;
TargetTransformInfo *TTI;
TargetLibraryInfo *TLI;
AliasAnalysis *AA;
@@ -1130,8 +1129,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
return;
}
- if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
- if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0])) {
+ const DataLayout &DL = F->getParent()->getDataLayout();
+ if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
+ if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) {
++NumLoadsWantToChangeOrder;
}
BS.cancelScheduling(VL);
@@ -1300,9 +1300,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
return;
}
case Instruction::Store: {
+ const DataLayout &DL = F->getParent()->getDataLayout();
      // Check if the stores are consecutive or if we need to swizzle them.
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
- if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
+ if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
@@ -1789,7 +1790,7 @@ unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
return -1;
}
-bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
+bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL) {
Value *PtrA = getPointerOperand(A);
Value *PtrB = getPointerOperand(B);
unsigned ASA = getAddressSpaceOperand(A);
@@ -1803,13 +1804,13 @@ bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
if (PtrA == PtrB || PtrA->getType() != PtrB->getType())
return false;
- unsigned PtrBitWidth = DL->getPointerSizeInBits(ASA);
+ unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
- APInt Size(PtrBitWidth, DL->getTypeStoreSize(Ty));
+ APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty));
APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
- PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetA);
- PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetB);
+ PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
+ PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
APInt OffsetDelta = OffsetB - OffsetA;
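
A worked instance of the offset arithmetic, assuming 64-bit pointers and 4-byte float elements:

  // A = &p[0], B = &p[1]:
  APInt Size(64, 4);                     // DL.getTypeStoreSize(float)
  APInt OffsetA(64, 0), OffsetB(64, 4);  // accumulated in-bounds offsets
  APInt OffsetDelta = OffsetB - OffsetA; // 4 == Size -> consecutive accesses
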
@@ -1842,6 +1843,7 @@ bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right) {
+ const DataLayout &DL = F->getParent()->getDataLayout();
// Push left and right operands of binary operation into Left and Right
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
@@ -1856,10 +1858,10 @@ void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
Instruction *VL1 = cast<Instruction>(VL[j]);
Instruction *VL2 = cast<Instruction>(VL[j + 1]);
- if (isConsecutiveAccess(L, L1) && VL1->isCommutative()) {
+ if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) {
std::swap(Left[j], Right[j]);
continue;
- } else if (isConsecutiveAccess(L, L1) && VL2->isCommutative()) {
+ } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) {
std::swap(Left[j + 1], Right[j + 1]);
continue;
}
@@ -1870,10 +1872,10 @@ void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
Instruction *VL1 = cast<Instruction>(VL[j]);
Instruction *VL2 = cast<Instruction>(VL[j + 1]);
- if (isConsecutiveAccess(L, L1) && VL1->isCommutative()) {
+ if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) {
std::swap(Left[j], Right[j]);
continue;
- } else if (isConsecutiveAccess(L, L1) && VL2->isCommutative()) {
+ } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) {
std::swap(Left[j + 1], Right[j + 1]);
continue;
}
@@ -1983,6 +1985,8 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
Right = OrigRight;
}
+ const DataLayout &DL = F->getParent()->getDataLayout();
+
// Finally check if we can get longer vectorizable chain by reordering
// without breaking the good operand order detected above.
// E.g. If we have something like-
@@ -2001,7 +2005,7 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
for (unsigned j = 0; j < VL.size() - 1; ++j) {
if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
- if (isConsecutiveAccess(L, L1)) {
+ if (isConsecutiveAccess(L, L1, DL)) {
std::swap(Left[j + 1], Right[j + 1]);
continue;
}
@@ -2009,7 +2013,7 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
}
if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
- if (isConsecutiveAccess(L, L1)) {
+ if (isConsecutiveAccess(L, L1, DL)) {
std::swap(Left[j + 1], Right[j + 1]);
continue;
}
@@ -2105,6 +2109,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return Gather(E->Scalars, VecTy);
}
+ const DataLayout &DL = F->getParent()->getDataLayout();
unsigned Opcode = getSameOpcode(E->Scalars);
switch (Opcode) {
@@ -2301,8 +2306,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
unsigned Alignment = LI->getAlignment();
LI = Builder.CreateLoad(VecPtr);
- if (!Alignment)
- Alignment = DL->getABITypeAlignment(ScalarLoadTy);
+ if (!Alignment) {
+ Alignment = DL.getABITypeAlignment(ScalarLoadTy);
+ }
LI->setAlignment(Alignment);
E->VectorizedValue = LI;
++NumVectorInstructions;
@@ -2331,8 +2337,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
ExternalUses.push_back(
ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
- if (!Alignment)
- Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
+ if (!Alignment) {
+ Alignment = DL.getABITypeAlignment(SI->getValueOperand()->getType());
+ }
S->setAlignment(Alignment);
E->VectorizedValue = S;
++NumVectorInstructions;
@@ -2358,7 +2365,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
OpVecs.push_back(OpVec);
}
- Value *V = Builder.CreateGEP(Op0, OpVecs);
+ Value *V = Builder.CreateGEP(
+ cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -3051,7 +3059,6 @@ struct SLPVectorizer : public FunctionPass {
}
ScalarEvolution *SE;
- const DataLayout *DL;
TargetTransformInfo *TTI;
TargetLibraryInfo *TLI;
AliasAnalysis *AA;
@@ -3064,8 +3071,6 @@ struct SLPVectorizer : public FunctionPass {
return false;
SE = &getAnalysis<ScalarEvolution>();
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- DL = DLP ? &DLP->getDataLayout() : nullptr;
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
TLI = TLIP ? &TLIP->getTLI() : nullptr;
@@ -3082,11 +3087,6 @@ struct SLPVectorizer : public FunctionPass {
if (!TTI->getNumberOfRegisters(true))
return false;
- // Must have DataLayout. We can't require it because some tests run w/o
- // triple.
- if (!DL)
- return false;
-
// Don't vectorize when the attribute NoImplicitFloat is used.
if (F.hasFnAttribute(Attribute::NoImplicitFloat))
return false;
@@ -3095,7 +3095,7 @@ struct SLPVectorizer : public FunctionPass {
// Use the bottom up slp vectorizer to construct chains that start with
// store instructions.
- BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT, AC);
+ BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC);
// A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
// delete instructions.
@@ -3178,15 +3178,11 @@ private:
/// the WeakVH array.
/// Vectorization of part of the VL array may cause later values in the VL array
/// to become invalid. We track when this has happened in the WeakVH array.
-static bool hasValueBeenRAUWed(ArrayRef<Value *> &VL,
- SmallVectorImpl<WeakVH> &VH,
- unsigned SliceBegin,
- unsigned SliceSize) {
- for (unsigned i = SliceBegin; i < SliceBegin + SliceSize; ++i)
- if (VH[i] != VL[i])
- return true;
-
- return false;
+static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
+ unsigned SliceBegin, unsigned SliceSize) {
+ VL = VL.slice(SliceBegin, SliceSize);
+ VH = VH.slice(SliceBegin, SliceSize);
+ return !std::equal(VL.begin(), VL.end(), VH.begin());
}
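
The rewrite replaces the hand-rolled index loop with ArrayRef slices and std::equal. A standalone analogue over plain pointers (WeakVH itself nulls or retargets entries on RAUW/deletion; this sketch only imitates the comparison):

  #include <algorithm>
  #include <vector>
  static bool sliceChanged(const std::vector<const void *> &Current,
                           const std::vector<const void *> &Snapshot,
                           size_t Begin, size_t Size) {
    // True when any value in [Begin, Begin + Size) no longer matches the
    // snapshot taken before vectorization.
    return !std::equal(Current.begin() + Begin, Current.begin() + Begin + Size,
                       Snapshot.begin() + Begin);
  }
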
bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
@@ -3195,7 +3191,8 @@ bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
<< "\n");
Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
- unsigned Sz = DL->getTypeSizeInBits(StoreTy);
+ auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout();
+ unsigned Sz = DL.getTypeSizeInBits(StoreTy);
unsigned VF = MinVecRegSize / Sz;
if (!isPowerOf2_32(Sz) || VF < 2)
@@ -3238,8 +3235,8 @@ bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
int costThreshold, BoUpSLP &R) {
- SetVector<Value *> Heads, Tails;
- SmallDenseMap<Value *, Value *> ConsecutiveChain;
+ SetVector<StoreInst *> Heads, Tails;
+ SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
// We may run into multiple chains that merge into a single chain. We mark the
// stores that we vectorized so that we don't visit the same store twice.
@@ -3252,8 +3249,8 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
for (unsigned j = 0; j < e; ++j) {
if (i == j)
continue;
-
- if (R.isConsecutiveAccess(Stores[i], Stores[j])) {
+ const DataLayout &DL = Stores[i]->getModule()->getDataLayout();
+ if (R.isConsecutiveAccess(Stores[i], Stores[j], DL)) {
Tails.insert(Stores[j]);
Heads.insert(Stores[i]);
ConsecutiveChain[Stores[i]] = Stores[j];
@@ -3262,7 +3259,7 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
}
// For stores that start but don't end a link in the chain:
- for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end();
+ for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
it != e; ++it) {
if (Tails.count(*it))
continue;
@@ -3270,7 +3267,7 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
// We found a store instr that starts a chain. Now follow the chain and try
// to vectorize it.
BoUpSLP::ValueList Operands;
- Value *I = *it;
+ StoreInst *I = *it;
// Collect the chain into a list.
while (Tails.count(I) || Heads.count(I)) {
if (VectorizedStores.count(I))
@@ -3295,6 +3292,7 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
unsigned count = 0;
StoreRefs.clear();
+ const DataLayout &DL = BB->getModule()->getDataLayout();
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
StoreInst *SI = dyn_cast<StoreInst>(it);
if (!SI)
@@ -3340,9 +3338,10 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
return false;
unsigned Opcode0 = I0->getOpcode();
+ const DataLayout &DL = I0->getModule()->getDataLayout();
Type *Ty0 = I0->getType();
- unsigned Sz = DL->getTypeSizeInBits(Ty0);
+ unsigned Sz = DL.getTypeSizeInBits(Ty0);
unsigned VF = MinVecRegSize / Sz;
for (int i = 0, e = VL.size(); i < e; ++i) {
@@ -3544,8 +3543,7 @@ public:
ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
/// \brief Try to find a reduction tree.
- bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B,
- const DataLayout *DL) {
+ bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
assert((!Phi ||
std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
"Thi phi needs to use the binary operator");
@@ -3570,9 +3568,10 @@ public:
if (!isValidElementType(Ty))
return false;
+ const DataLayout &DL = B->getModule()->getDataLayout();
ReductionOpcode = B->getOpcode();
ReducedValueOpcode = 0;
- ReduxWidth = MinVecRegSize / DL->getTypeSizeInBits(Ty);
+ ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
ReductionRoot = B;
ReductionPHI = Phi;
@@ -3882,8 +3881,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// Try to match and vectorize a horizontal reduction.
HorizontalReduction HorRdx;
- if (ShouldVectorizeHor &&
- HorRdx.matchAssociativeReduction(P, BI, DL) &&
+ if (ShouldVectorizeHor && HorRdx.matchAssociativeReduction(P, BI) &&
HorRdx.tryToReduce(R, TTI)) {
Changed = true;
it = BB->begin();
@@ -3913,7 +3911,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
if (BinaryOperator *BinOp =
dyn_cast<BinaryOperator>(SI->getValueOperand())) {
HorizontalReduction HorRdx;
- if (((HorRdx.matchAssociativeReduction(nullptr, BinOp, DL) &&
+ if (((HorRdx.matchAssociativeReduction(nullptr, BinOp) &&
HorRdx.tryToReduce(R, TTI)) ||
tryToVectorize(BinOp, R))) {
Changed = true;