Update to LLVM 3.5a.

Change-Id: Ifadecab779f128e62e430c2b4f6ddd84953ed617
author: Stephen Hines <srhines@google.com> 2014-04-23 16:57:46 -0700
committer: Stephen Hines <srhines@google.com> 2014-04-24 15:53:16 -0700
commit: 36b56886974eae4f9c5ebc96befd3e7bfe5de338 (patch)
tree: e6cfb69fbbd937f450eeb83bfb83b9da3b01275a /lib/Transforms/Scalar
parent: 69a8640022b04415ae9fac62f8ab090601d8f889 (diff)
download: external_llvm-36b56886974eae4f9c5ebc96befd3e7bfe5de338.zip
external_llvm-36b56886974eae4f9c5ebc96befd3e7bfe5de338.tar.gz
external_llvm-36b56886974eae4f9c5ebc96befd3e7bfe5de338.tar.bz2
40 files changed, 3774 insertions, 3704 deletions
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp
index a3eb07a..fa8b598 100644
--- a/lib/Transforms/Scalar/ADCE.cpp
+++ b/lib/Transforms/Scalar/ADCE.cpp
@@ -21,11 +21,11 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Support/InstIterator.h"
 using namespace llvm;
 
 STATISTIC(NumRemoved, "Number of instructions removed");
@@ -37,9 +37,9 @@ namespace {
       initializeADCEPass(*PassRegistry::getPassRegistry());
     }
 
-    virtual bool runOnFunction(Function& F);
+    bool runOnFunction(Function& F) override;
 
-    virtual void getAnalysisUsage(AnalysisUsage& AU) const {
+    void getAnalysisUsage(AnalysisUsage& AU) const override {
       AU.setPreservesCFG();
     }
 
@@ -50,6 +50,9 @@ char ADCE::ID = 0;
 INITIALIZE_PASS(ADCE, "adce", "Aggressive Dead Code Elimination", false, false)
 
 bool ADCE::runOnFunction(Function& F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
   SmallPtrSet<Instruction*, 128> alive;
   SmallVector<Instruction*, 128> worklist;
 
diff --git a/lib/Transforms/Scalar/Android.mk b/lib/Transforms/Scalar/Android.mk
index 60308c6..3894f93 100644
--- a/lib/Transforms/Scalar/Android.mk
+++ b/lib/Transforms/Scalar/Android.mk
@@ -2,8 +2,8 @@ LOCAL_PATH:= $(call my-dir)
 
 transforms_scalar_SRC_FILES := \
   ADCE.cpp \
-  CodeGenPrepare.cpp \
   ConstantProp.cpp \
+  ConstantHoisting.cpp \
   CorrelatedValuePropagation.cpp \
   DCE.cpp \
   DeadStoreElimination.cpp \
@@ -30,6 +30,7 @@ transforms_scalar_SRC_FILES := \
   SROA.cpp \
   SampleProfile.cpp \
   Scalar.cpp \
+  Scalarizer.cpp \
   ScalarReplAggregates.cpp \
   SimplifyCFGPass.cpp \
   Sink.cpp \
@@ -53,13 +54,20 @@ include $(BUILD_HOST_STATIC_LIBRARY)
 
 # For the device
 # =====================================================
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
 include $(CLEAR_VARS)
 
 LOCAL_SRC_FILES := $(transforms_scalar_SRC_FILES)
 LOCAL_MODULE:= libLLVMScalarOpts
 
+# Override the default optimization level to work around a SIGSEGV
+# on x86 target builds for SROA.cpp.
+# Bug: 8047767
+LOCAL_CFLAGS_x86 += -O1
+
 LOCAL_MODULE_TAGS := optional
 
 include $(LLVM_DEVICE_BUILD_MK)
 include $(LLVM_GEN_INTRINSICS_MK)
 include $(BUILD_STATIC_LIBRARY)
+endif
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt
index 626c810..27434c1 100644
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -1,10 +1,11 @@
 add_llvm_library(LLVMScalarOpts
   ADCE.cpp
-  CodeGenPrepare.cpp
+  ConstantHoisting.cpp
   ConstantProp.cpp
   CorrelatedValuePropagation.cpp
   DCE.cpp
   DeadStoreElimination.cpp
+  Scalarizer.cpp
   EarlyCSE.cpp
   GlobalMerge.cpp
   GVN.cpp
diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp
deleted file mode 100644
index 007e9b7..0000000
--- a/lib/Transforms/Scalar/CodeGenPrepare.cpp
+++ /dev/null
@@ -1,2002 +0,0 @@
-//===- CodeGenPrepare.cpp - Prepare a function for code generation --------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass munges the code in the input function to better prepare it for
-// SelectionDAG-based code generation. This works around limitations in it's
-// basic-block-at-a-time approach. It should eventually be removed.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "codegenprepare"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/ValueMap.h"
-#include "llvm/Analysis/DominatorInternals.h"
-#include "llvm/Analysis/Dominators.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Assembly/Writer.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CallSite.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/GetElementPtrTypeIterator.h"
-#include "llvm/Support/PatternMatch.h"
-#include "llvm/Support/ValueHandle.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
-#include "llvm/Target/TargetLowering.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/BuildLibCalls.h"
-#include "llvm/Transforms/Utils/BypassSlowDivision.h"
-#include "llvm/Transforms/Utils/Local.h"
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-STATISTIC(NumBlocksElim, "Number of blocks eliminated");
-STATISTIC(NumPHIsElim,   "Number of trivial PHIs eliminated");
-STATISTIC(NumGEPsElim,   "Number of GEPs converted to casts");
-STATISTIC(NumCmpUses, "Number of uses of Cmp expressions replaced with uses of "
-                      "sunken Cmps");
-STATISTIC(NumCastUses, "Number of uses of Cast expressions replaced with uses "
-                       "of sunken Casts");
-STATISTIC(NumMemoryInsts, "Number of memory instructions whose address "
-                          "computations were sunk");
-STATISTIC(NumExtsMoved,  "Number of [s|z]ext instructions combined with loads");
-STATISTIC(NumExtUses,    "Number of uses of [s|z]ext instructions optimized");
-STATISTIC(NumRetsDup,    "Number of return instructions duplicated");
-STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
-STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
-
-static cl::opt<bool> DisableBranchOpts(
-  "disable-cgp-branch-opts", cl::Hidden, cl::init(false),
-  cl::desc("Disable branch optimizations in CodeGenPrepare"));
-
-static cl::opt<bool> DisableSelectToBranch(
-  "disable-cgp-select2branch", cl::Hidden, cl::init(false),
-  cl::desc("Disable select to branch conversion."));
-
-namespace {
-  class CodeGenPrepare : public FunctionPass {
-    /// TLI - Keep a pointer of a TargetLowering to consult for determining
-    /// transformation profitability.
-    const TargetMachine *TM;
-    const TargetLowering *TLI;
-    const TargetLibraryInfo *TLInfo;
-    DominatorTree *DT;
-
-    /// CurInstIterator - As we scan instructions optimizing them, this is the
-    /// next instruction to optimize.  Xforms that can invalidate this should
-    /// update it.
-    BasicBlock::iterator CurInstIterator;
-
-    /// Keeps track of non-local addresses that have been sunk into a block.
-    /// This allows us to avoid inserting duplicate code for blocks with
-    /// multiple load/stores of the same address.
-    ValueMap<Value*, Value*> SunkAddrs;
-
-    /// ModifiedDT - If CFG is modified in anyway, dominator tree may need to
-    /// be updated.
-    bool ModifiedDT;
-
-    /// OptSize - True if optimizing for size.
-    bool OptSize;
-
-  public:
-    static char ID; // Pass identification, replacement for typeid
-    explicit CodeGenPrepare(const TargetMachine *TM = 0)
-      : FunctionPass(ID), TM(TM), TLI(0) {
-        initializeCodeGenPreparePass(*PassRegistry::getPassRegistry());
-      }
-    bool runOnFunction(Function &F);
-
-    const char *getPassName() const { return "CodeGen Prepare"; }
-
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.addPreserved<DominatorTree>();
-      AU.addRequired<TargetLibraryInfo>();
-    }
-
-  private:
-    bool EliminateFallThrough(Function &F);
-    bool EliminateMostlyEmptyBlocks(Function &F);
-    bool CanMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
-    void EliminateMostlyEmptyBlock(BasicBlock *BB);
-    bool OptimizeBlock(BasicBlock &BB);
-    bool OptimizeInst(Instruction *I);
-    bool OptimizeMemoryInst(Instruction *I, Value *Addr, Type *AccessTy);
-    bool OptimizeInlineAsmInst(CallInst *CS);
-    bool OptimizeCallInst(CallInst *CI);
-    bool MoveExtToFormExtLoad(Instruction *I);
-    bool OptimizeExtUses(Instruction *I);
-    bool OptimizeSelectInst(SelectInst *SI);
-    bool DupRetToEnableTailCallOpts(BasicBlock *BB);
-    bool PlaceDbgValues(Function &F);
-  };
-}
-
-char CodeGenPrepare::ID = 0;
-INITIALIZE_PASS_BEGIN(CodeGenPrepare, "codegenprepare",
-                "Optimize for code generation", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
-INITIALIZE_PASS_END(CodeGenPrepare, "codegenprepare",
-                "Optimize for code generation", false, false)
-
-FunctionPass *llvm::createCodeGenPreparePass(const TargetMachine *TM) {
-  return new CodeGenPrepare(TM);
-}
-
-bool CodeGenPrepare::runOnFunction(Function &F) {
-  bool EverMadeChange = false;
-
-  ModifiedDT = false;
-  if (TM) TLI = TM->getTargetLowering();
-  TLInfo = &getAnalysis<TargetLibraryInfo>();
-  DT = getAnalysisIfAvailable<DominatorTree>();
-  OptSize = F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                           Attribute::OptimizeForSize);
-
-  /// This optimization identifies DIV instructions that can be
-  /// profitably bypassed and carried out with a shorter, faster divide.
-  if (!OptSize && TLI && TLI->isSlowDivBypassed()) {
-    const DenseMap<unsigned int, unsigned int> &BypassWidths =
-       TLI->getBypassSlowDivWidths();
-    for (Function::iterator I = F.begin(); I != F.end(); I++)
-      EverMadeChange |= bypassSlowDivision(F, I, BypassWidths);
-  }
-
-  // Eliminate blocks that contain only PHI nodes and an
-  // unconditional branch.
-  EverMadeChange |= EliminateMostlyEmptyBlocks(F);
-
-  // llvm.dbg.value is far away from the value then iSel may not be able
-  // handle it properly. iSel will drop llvm.dbg.value if it can not
-  // find a node corresponding to the value.
-  EverMadeChange |= PlaceDbgValues(F);
-
-  bool MadeChange = true;
-  while (MadeChange) {
-    MadeChange = false;
-    for (Function::iterator I = F.begin(); I != F.end(); ) {
-      BasicBlock *BB = I++;
-      MadeChange |= OptimizeBlock(*BB);
-    }
-    EverMadeChange |= MadeChange;
-  }
-
-  SunkAddrs.clear();
-
-  if (!DisableBranchOpts) {
-    MadeChange = false;
-    SmallPtrSet<BasicBlock*, 8> WorkList;
-    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
-      SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB));
-      MadeChange |= ConstantFoldTerminator(BB, true);
-      if (!MadeChange) continue;
-
-      for (SmallVectorImpl<BasicBlock*>::iterator
-             II = Successors.begin(), IE = Successors.end(); II != IE; ++II)
-        if (pred_begin(*II) == pred_end(*II))
-          WorkList.insert(*II);
-    }
-
-    // Delete the dead blocks and any of their dead successors.
-    MadeChange |= !WorkList.empty();
-    while (!WorkList.empty()) {
-      BasicBlock *BB = *WorkList.begin();
-      WorkList.erase(BB);
-      SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB));
-
-      DeleteDeadBlock(BB);
-
-      for (SmallVectorImpl<BasicBlock*>::iterator
-             II = Successors.begin(), IE = Successors.end(); II != IE; ++II)
-        if (pred_begin(*II) == pred_end(*II))
-          WorkList.insert(*II);
-    }
-
-    // Merge pairs of basic blocks with unconditional branches, connected by
-    // a single edge.
-    if (EverMadeChange || MadeChange)
-      MadeChange |= EliminateFallThrough(F);
-
-    if (MadeChange)
-      ModifiedDT = true;
-    EverMadeChange |= MadeChange;
-  }
-
-  if (ModifiedDT && DT)
-    DT->DT->recalculate(F);
-
-  return EverMadeChange;
-}
-
-/// EliminateFallThrough - Merge basic blocks which are connected
-/// by a single edge, where one of the basic blocks has a single successor
-/// pointing to the other basic block, which has a single predecessor.
-bool CodeGenPrepare::EliminateFallThrough(Function &F) {
-  bool Changed = false;
-  // Scan all of the blocks in the function, except for the entry block.
-  for (Function::iterator I = ++F.begin(), E = F.end(); I != E; ) {
-    BasicBlock *BB = I++;
-    // If the destination block has a single pred, then this is a trivial
-    // edge, just collapse it.
-    BasicBlock *SinglePred = BB->getSinglePredecessor();
-
-    // Don't merge if BB's address is taken.
-    if (!SinglePred || SinglePred == BB || BB->hasAddressTaken()) continue;
-
-    BranchInst *Term = dyn_cast<BranchInst>(SinglePred->getTerminator());
-    if (Term && !Term->isConditional()) {
-      Changed = true;
-      DEBUG(dbgs() << "To merge:\n"<< *SinglePred << "\n\n\n");
-      // Remember if SinglePred was the entry block of the function.
-      // If so, we will need to move BB back to the entry position.
-      bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
-      MergeBasicBlockIntoOnlyPred(BB, this);
-
-      if (isEntry && BB != &BB->getParent()->getEntryBlock())
-        BB->moveBefore(&BB->getParent()->getEntryBlock());
-
-      // We have erased a block. Update the iterator.
-      I = BB;
-    }
-  }
-  return Changed;
-}
-
-/// EliminateMostlyEmptyBlocks - eliminate blocks that contain only PHI nodes,
-/// debug info directives, and an unconditional branch.  Passes before isel
-/// (e.g. LSR/loopsimplify) often split edges in ways that are non-optimal for
-/// isel.  Start by eliminating these blocks so we can split them the way we
-/// want them.
-bool CodeGenPrepare::EliminateMostlyEmptyBlocks(Function &F) {
-  bool MadeChange = false;
-  // Note that this intentionally skips the entry block.
-  for (Function::iterator I = ++F.begin(), E = F.end(); I != E; ) {
-    BasicBlock *BB = I++;
-
-    // If this block doesn't end with an uncond branch, ignore it.
-    BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
-    if (!BI || !BI->isUnconditional())
-      continue;
-
-    // If the instruction before the branch (skipping debug info) isn't a phi
-    // node, then other stuff is happening here.
-    BasicBlock::iterator BBI = BI;
-    if (BBI != BB->begin()) {
-      --BBI;
-      while (isa<DbgInfoIntrinsic>(BBI)) {
-        if (BBI == BB->begin())
-          break;
-        --BBI;
-      }
-      if (!isa<DbgInfoIntrinsic>(BBI) && !isa<PHINode>(BBI))
-        continue;
-    }
-
-    // Do not break infinite loops.
-    BasicBlock *DestBB = BI->getSuccessor(0);
-    if (DestBB == BB)
-      continue;
-
-    if (!CanMergeBlocks(BB, DestBB))
-      continue;
-
-    EliminateMostlyEmptyBlock(BB);
-    MadeChange = true;
-  }
-  return MadeChange;
-}
-
-/// CanMergeBlocks - Return true if we can merge BB into DestBB if there is a
-/// single uncond branch between them, and BB contains no other non-phi
-/// instructions.
-bool CodeGenPrepare::CanMergeBlocks(const BasicBlock *BB,
-                                    const BasicBlock *DestBB) const {
-  // We only want to eliminate blocks whose phi nodes are used by phi nodes in
-  // the successor.  If there are more complex condition (e.g. preheaders),
-  // don't mess around with them.
-  BasicBlock::const_iterator BBI = BB->begin();
-  while (const PHINode *PN = dyn_cast<PHINode>(BBI++)) {
-    for (Value::const_use_iterator UI = PN->use_begin(), E = PN->use_end();
-         UI != E; ++UI) {
-      const Instruction *User = cast<Instruction>(*UI);
-      if (User->getParent() != DestBB || !isa<PHINode>(User))
-        return false;
-      // If User is inside DestBB block and it is a PHINode then check
-      // incoming value. If incoming value is not from BB then this is
-      // a complex condition (e.g. preheaders) we want to avoid here.
-      if (User->getParent() == DestBB) {
-        if (const PHINode *UPN = dyn_cast<PHINode>(User))
-          for (unsigned I = 0, E = UPN->getNumIncomingValues(); I != E; ++I) {
-            Instruction *Insn = dyn_cast<Instruction>(UPN->getIncomingValue(I));
-            if (Insn && Insn->getParent() == BB &&
-                Insn->getParent() != UPN->getIncomingBlock(I))
-              return false;
-          }
-      }
-    }
-  }
-
-  // If BB and DestBB contain any common predecessors, then the phi nodes in BB
-  // and DestBB may have conflicting incoming values for the block.  If so, we
-  // can't merge the block.
-  const PHINode *DestBBPN = dyn_cast<PHINode>(DestBB->begin());
-  if (!DestBBPN) return true;  // no conflict.
-
-  // Collect the preds of BB.
-  SmallPtrSet<const BasicBlock*, 16> BBPreds;
-  if (const PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
-    // It is faster to get preds from a PHI than with pred_iterator.
-    for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
-      BBPreds.insert(BBPN->getIncomingBlock(i));
-  } else {
-    BBPreds.insert(pred_begin(BB), pred_end(BB));
-  }
-
-  // Walk the preds of DestBB.
-  for (unsigned i = 0, e = DestBBPN->getNumIncomingValues(); i != e; ++i) {
-    BasicBlock *Pred = DestBBPN->getIncomingBlock(i);
-    if (BBPreds.count(Pred)) {   // Common predecessor?
-      BBI = DestBB->begin();
-      while (const PHINode *PN = dyn_cast<PHINode>(BBI++)) {
-        const Value *V1 = PN->getIncomingValueForBlock(Pred);
-        const Value *V2 = PN->getIncomingValueForBlock(BB);
-
-        // If V2 is a phi node in BB, look up what the mapped value will be.
-        if (const PHINode *V2PN = dyn_cast<PHINode>(V2))
-          if (V2PN->getParent() == BB)
-            V2 = V2PN->getIncomingValueForBlock(Pred);
-
-        // If there is a conflict, bail out.
-        if (V1 != V2) return false;
-      }
-    }
-  }
-
-  return true;
-}
-
-
-/// EliminateMostlyEmptyBlock - Eliminate a basic block that have only phi's and
-/// an unconditional branch in it.
-void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) {
-  BranchInst *BI = cast<BranchInst>(BB->getTerminator());
-  BasicBlock *DestBB = BI->getSuccessor(0);
-
-  DEBUG(dbgs() << "MERGING MOSTLY EMPTY BLOCKS - BEFORE:\n" << *BB << *DestBB);
-
-  // If the destination block has a single pred, then this is a trivial edge,
-  // just collapse it.
-  if (BasicBlock *SinglePred = DestBB->getSinglePredecessor()) {
-    if (SinglePred != DestBB) {
-      // Remember if SinglePred was the entry block of the function.  If so, we
-      // will need to move BB back to the entry position.
-      bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
-      MergeBasicBlockIntoOnlyPred(DestBB, this);
-
-      if (isEntry && BB != &BB->getParent()->getEntryBlock())
-        BB->moveBefore(&BB->getParent()->getEntryBlock());
-
-      DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
-      return;
-    }
-  }
-
-  // Otherwise, we have multiple predecessors of BB.  Update the PHIs in DestBB
-  // to handle the new incoming edges it is about to have.
-  PHINode *PN;
-  for (BasicBlock::iterator BBI = DestBB->begin();
-       (PN = dyn_cast<PHINode>(BBI)); ++BBI) {
-    // Remove the incoming value for BB, and remember it.
-    Value *InVal = PN->removeIncomingValue(BB, false);
-
-    // Two options: either the InVal is a phi node defined in BB or it is some
-    // value that dominates BB.
-    PHINode *InValPhi = dyn_cast<PHINode>(InVal);
-    if (InValPhi && InValPhi->getParent() == BB) {
-      // Add all of the input values of the input PHI as inputs of this phi.
-      for (unsigned i = 0, e = InValPhi->getNumIncomingValues(); i != e; ++i)
-        PN->addIncoming(InValPhi->getIncomingValue(i),
-                        InValPhi->getIncomingBlock(i));
-    } else {
-      // Otherwise, add one instance of the dominating value for each edge that
-      // we will be adding.
-      if (PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
-        for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
-          PN->addIncoming(InVal, BBPN->getIncomingBlock(i));
-      } else {
-        for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
-          PN->addIncoming(InVal, *PI);
-      }
-    }
-  }
-
-  // The PHIs are now updated, change everything that refers to BB to use
-  // DestBB and remove BB.
-  BB->replaceAllUsesWith(DestBB);
-  if (DT && !ModifiedDT) {
-    BasicBlock *BBIDom  = DT->getNode(BB)->getIDom()->getBlock();
-    BasicBlock *DestBBIDom = DT->getNode(DestBB)->getIDom()->getBlock();
-    BasicBlock *NewIDom = DT->findNearestCommonDominator(BBIDom, DestBBIDom);
-    DT->changeImmediateDominator(DestBB, NewIDom);
-    DT->eraseNode(BB);
-  }
-  BB->eraseFromParent();
-  ++NumBlocksElim;
-
-  DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
-}
-
-/// OptimizeNoopCopyExpression - If the specified cast instruction is a noop
-/// copy (e.g. it's casting from one pointer type to another, i32->i8 on PPC),
-/// sink it into user blocks to reduce the number of virtual
-/// registers that must be created and coalesced.
-///
-/// Return true if any changes are made.
-///
-static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI){
-  // If this is a noop copy,
-  EVT SrcVT = TLI.getValueType(CI->getOperand(0)->getType());
-  EVT DstVT = TLI.getValueType(CI->getType());
-
-  // This is an fp<->int conversion?
-  if (SrcVT.isInteger() != DstVT.isInteger())
-    return false;
-
-  // If this is an extension, it will be a zero or sign extension, which
-  // isn't a noop.
-  if (SrcVT.bitsLT(DstVT)) return false;
-
-  // If these values will be promoted, find out what they will be promoted
-  // to.  This helps us consider truncates on PPC as noop copies when they
-  // are.
-  if (TLI.getTypeAction(CI->getContext(), SrcVT) ==
-      TargetLowering::TypePromoteInteger)
-    SrcVT = TLI.getTypeToTransformTo(CI->getContext(), SrcVT);
-  if (TLI.getTypeAction(CI->getContext(), DstVT) ==
-      TargetLowering::TypePromoteInteger)
-    DstVT = TLI.getTypeToTransformTo(CI->getContext(), DstVT);
-
-  // If, after promotion, these are the same types, this is a noop copy.
-  if (SrcVT != DstVT)
-    return false;
-
-  BasicBlock *DefBB = CI->getParent();
-
-  /// InsertedCasts - Only insert a cast in each block once.
-  DenseMap<BasicBlock*, CastInst*> InsertedCasts;
-
-  bool MadeChange = false;
-  for (Value::use_iterator UI = CI->use_begin(), E = CI->use_end();
-       UI != E; ) {
-    Use &TheUse = UI.getUse();
-    Instruction *User = cast<Instruction>(*UI);
-
-    // Figure out which BB this cast is used in.  For PHI's this is the
-    // appropriate predecessor block.
-    BasicBlock *UserBB = User->getParent();
-    if (PHINode *PN = dyn_cast<PHINode>(User)) {
-      UserBB = PN->getIncomingBlock(UI);
-    }
-
-    // Preincrement use iterator so we don't invalidate it.
-    ++UI;
-
-    // If this user is in the same block as the cast, don't change the cast.
-    if (UserBB == DefBB) continue;
-
-    // If we have already inserted a cast into this block, use it.
-    CastInst *&InsertedCast = InsertedCasts[UserBB];
-
-    if (!InsertedCast) {
-      BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
-      InsertedCast =
-        CastInst::Create(CI->getOpcode(), CI->getOperand(0), CI->getType(), "",
-                         InsertPt);
-      MadeChange = true;
-    }
-
-    // Replace a use of the cast with a use of the new cast.
-    TheUse = InsertedCast;
-    ++NumCastUses;
-  }
-
-  // If we removed all uses, nuke the cast.
-  if (CI->use_empty()) {
-    CI->eraseFromParent();
-    MadeChange = true;
-  }
-
-  return MadeChange;
-}
-
-/// OptimizeCmpExpression - sink the given CmpInst into user blocks to reduce
-/// the number of virtual registers that must be created and coalesced.  This is
-/// a clear win except on targets with multiple condition code registers
-///  (PowerPC), where it might lose; some adjustment may be wanted there.
-///
-/// Return true if any changes are made.
-static bool OptimizeCmpExpression(CmpInst *CI) {
-  BasicBlock *DefBB = CI->getParent();
-
-  /// InsertedCmp - Only insert a cmp in each block once.
-  DenseMap<BasicBlock*, CmpInst*> InsertedCmps;
-
-  bool MadeChange = false;
-  for (Value::use_iterator UI = CI->use_begin(), E = CI->use_end();
-       UI != E; ) {
-    Use &TheUse = UI.getUse();
-    Instruction *User = cast<Instruction>(*UI);
-
-    // Preincrement use iterator so we don't invalidate it.
-    ++UI;
-
-    // Don't bother for PHI nodes.
-    if (isa<PHINode>(User))
-      continue;
-
-    // Figure out which BB this cmp is used in.
-    BasicBlock *UserBB = User->getParent();
-
-    // If this user is in the same block as the cmp, don't change the cmp.
-    if (UserBB == DefBB) continue;
-
-    // If we have already inserted a cmp into this block, use it.
-    CmpInst *&InsertedCmp = InsertedCmps[UserBB];
-
-    if (!InsertedCmp) {
-      BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
-      InsertedCmp =
-        CmpInst::Create(CI->getOpcode(),
-                        CI->getPredicate(),  CI->getOperand(0),
-                        CI->getOperand(1), "", InsertPt);
-      MadeChange = true;
-    }
-
-    // Replace a use of the cmp with a use of the new cmp.
-    TheUse = InsertedCmp;
-    ++NumCmpUses;
-  }
-
-  // If we removed all uses, nuke the cmp.
-  if (CI->use_empty())
-    CI->eraseFromParent();
-
-  return MadeChange;
-}
-
-namespace {
-class CodeGenPrepareFortifiedLibCalls : public SimplifyFortifiedLibCalls {
-protected:
-  void replaceCall(Value *With) {
-    CI->replaceAllUsesWith(With);
-    CI->eraseFromParent();
-  }
-  bool isFoldable(unsigned SizeCIOp, unsigned, bool) const {
-      if (ConstantInt *SizeCI =
-                             dyn_cast<ConstantInt>(CI->getArgOperand(SizeCIOp)))
-        return SizeCI->isAllOnesValue();
-    return false;
-  }
-};
-} // end anonymous namespace
-
-bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
-  BasicBlock *BB = CI->getParent();
-
-  // Lower inline assembly if we can.
-  // If we found an inline asm expession, and if the target knows how to
-  // lower it to normal LLVM code, do so now.
-  if (TLI && isa<InlineAsm>(CI->getCalledValue())) {
-    if (TLI->ExpandInlineAsm(CI)) {
-      // Avoid invalidating the iterator.
-      CurInstIterator = BB->begin();
-      // Avoid processing instructions out of order, which could cause
-      // reuse before a value is defined.
-      SunkAddrs.clear();
-      return true;
-    }
-    // Sink address computing for memory operands into the block.
-    if (OptimizeInlineAsmInst(CI))
-      return true;
-  }
-
-  // Lower all uses of llvm.objectsize.*
-  IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
-  if (II && II->getIntrinsicID() == Intrinsic::objectsize) {
-    bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1);
-    Type *ReturnTy = CI->getType();
-    Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL);
-
-    // Substituting this can cause recursive simplifications, which can
-    // invalidate our iterator.  Use a WeakVH to hold onto it in case this
-    // happens.
-    WeakVH IterHandle(CurInstIterator);
-
-    replaceAndRecursivelySimplify(CI, RetVal, TLI ? TLI->getDataLayout() : 0,
-                                  TLInfo, ModifiedDT ? 0 : DT);
-
-    // If the iterator instruction was recursively deleted, start over at the
-    // start of the block.
-    if (IterHandle != CurInstIterator) {
-      CurInstIterator = BB->begin();
-      SunkAddrs.clear();
-    }
-    return true;
-  }
-
-  if (II && TLI) {
-    SmallVector<Value*, 2> PtrOps;
-    Type *AccessTy;
-    if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy))
-      while (!PtrOps.empty())
-        if (OptimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy))
-          return true;
-  }
-
-  // From here on out we're working with named functions.
-  if (CI->getCalledFunction() == 0) return false;
-
-  // We'll need DataLayout from here on out.
-  const DataLayout *TD = TLI ? TLI->getDataLayout() : 0;
-  if (!TD) return false;
-
-  // Lower all default uses of _chk calls.  This is very similar
-  // to what InstCombineCalls does, but here we are only lowering calls
-  // that have the default "don't know" as the objectsize.  Anything else
-  // should be left alone.
-  CodeGenPrepareFortifiedLibCalls Simplifier;
-  return Simplifier.fold(CI, TD, TLInfo);
-}
-
-/// DupRetToEnableTailCallOpts - Look for opportunities to duplicate return
-/// instructions to the predecessor to enable tail call optimizations. The
-/// case it is currently looking for is:
-/// @code
-/// bb0:
-///   %tmp0 = tail call i32 @f0()
-///   br label %return
-/// bb1:
-///   %tmp1 = tail call i32 @f1()
-///   br label %return
-/// bb2:
-///   %tmp2 = tail call i32 @f2()
-///   br label %return
-/// return:
-///   %retval = phi i32 [ %tmp0, %bb0 ], [ %tmp1, %bb1 ], [ %tmp2, %bb2 ]
-///   ret i32 %retval
-/// @endcode
-///
-/// =>
-///
-/// @code
-/// bb0:
-///   %tmp0 = tail call i32 @f0()
-///   ret i32 %tmp0
-/// bb1:
-///   %tmp1 = tail call i32 @f1()
-///   ret i32 %tmp1
-/// bb2:
-///   %tmp2 = tail call i32 @f2()
-///   ret i32 %tmp2
-/// @endcode
-bool CodeGenPrepare::DupRetToEnableTailCallOpts(BasicBlock *BB) {
-  if (!TLI)
-    return false;
-
-  ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator());
-  if (!RI)
-    return false;
-
-  PHINode *PN = 0;
-  BitCastInst *BCI = 0;
-  Value *V = RI->getReturnValue();
-  if (V) {
-    BCI = dyn_cast<BitCastInst>(V);
-    if (BCI)
-      V = BCI->getOperand(0);
-
-    PN = dyn_cast<PHINode>(V);
-    if (!PN)
-      return false;
-  }
-
-  if (PN && PN->getParent() != BB)
-    return false;
-
-  // It's not safe to eliminate the sign / zero extension of the return value.
-  // See llvm::isInTailCallPosition().
-  const Function *F = BB->getParent();
-  AttributeSet CallerAttrs = F->getAttributes();
-  if (CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt) ||
-      CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt))
-    return false;
-
-  // Make sure there are no instructions between the PHI and return, or that the
-  // return is the first instruction in the block.
-  if (PN) {
-    BasicBlock::iterator BI = BB->begin();
-    do { ++BI; } while (isa<DbgInfoIntrinsic>(BI));
-    if (&*BI == BCI)
-      // Also skip over the bitcast.
-      ++BI;
-    if (&*BI != RI)
-      return false;
-  } else {
-    BasicBlock::iterator BI = BB->begin();
-    while (isa<DbgInfoIntrinsic>(BI)) ++BI;
-    if (&*BI != RI)
-      return false;
-  }
-
-  /// Only dup the ReturnInst if the CallInst is likely to be emitted as a tail
-  /// call.
-  SmallVector<CallInst*, 4> TailCalls;
-  if (PN) {
-    for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) {
-      CallInst *CI = dyn_cast<CallInst>(PN->getIncomingValue(I));
-      // Make sure the phi value is indeed produced by the tail call.
-      if (CI && CI->hasOneUse() && CI->getParent() == PN->getIncomingBlock(I) &&
-          TLI->mayBeEmittedAsTailCall(CI))
-        TailCalls.push_back(CI);
-    }
-  } else {
-    SmallPtrSet<BasicBlock*, 4> VisitedBBs;
-    for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
-      if (!VisitedBBs.insert(*PI))
-        continue;
-
-      BasicBlock::InstListType &InstList = (*PI)->getInstList();
-      BasicBlock::InstListType::reverse_iterator RI = InstList.rbegin();
-      BasicBlock::InstListType::reverse_iterator RE = InstList.rend();
-      do { ++RI; } while (RI != RE && isa<DbgInfoIntrinsic>(&*RI));
-      if (RI == RE)
-        continue;
-
-      CallInst *CI = dyn_cast<CallInst>(&*RI);
-      if (CI && CI->use_empty() && TLI->mayBeEmittedAsTailCall(CI))
-        TailCalls.push_back(CI);
-    }
-  }
-
-  bool Changed = false;
-  for (unsigned i = 0, e = TailCalls.size(); i != e; ++i) {
-    CallInst *CI = TailCalls[i];
-    CallSite CS(CI);
-
-    // Conservatively require the attributes of the call to match those of the
-    // return. Ignore noalias because it doesn't affect the call sequence.
-    AttributeSet CalleeAttrs = CS.getAttributes();
-    if (AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex).
-          removeAttribute(Attribute::NoAlias) !=
-        AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex).
-          removeAttribute(Attribute::NoAlias))
-      continue;
-
-    // Make sure the call instruction is followed by an unconditional branch to
-    // the return block.
-    BasicBlock *CallBB = CI->getParent();
-    BranchInst *BI = dyn_cast<BranchInst>(CallBB->getTerminator());
-    if (!BI || !BI->isUnconditional() || BI->getSuccessor(0) != BB)
-      continue;
-
-    // Duplicate the return into CallBB.
-    (void)FoldReturnIntoUncondBranch(RI, BB, CallBB);
-    ModifiedDT = Changed = true;
-    ++NumRetsDup;
-  }
-
-  // If we eliminated all predecessors of the block, delete the block now.
-  if (Changed && !BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB))
-    BB->eraseFromParent();
-
-  return Changed;
-}
-
-//===----------------------------------------------------------------------===//
-// Memory Optimization
-//===----------------------------------------------------------------------===//
-
-namespace {
-
-/// ExtAddrMode - This is an extended version of TargetLowering::AddrMode
-/// which holds actual Value*'s for register values.
-struct ExtAddrMode : public TargetLowering::AddrMode {
-  Value *BaseReg;
-  Value *ScaledReg;
-  ExtAddrMode() : BaseReg(0), ScaledReg(0) {}
-  void print(raw_ostream &OS) const;
-  void dump() const;
-
-  bool operator==(const ExtAddrMode& O) const {
-    return (BaseReg == O.BaseReg) && (ScaledReg == O.ScaledReg) &&
-           (BaseGV == O.BaseGV) && (BaseOffs == O.BaseOffs) &&
-           (HasBaseReg == O.HasBaseReg) && (Scale == O.Scale);
-  }
-};
-
-#ifndef NDEBUG
-static inline raw_ostream &operator<<(raw_ostream &OS, const ExtAddrMode &AM) {
-  AM.print(OS);
-  return OS;
-}
-#endif
-
-void ExtAddrMode::print(raw_ostream &OS) const {
-  bool NeedPlus = false;
-  OS << "[";
-  if (BaseGV) {
-    OS << (NeedPlus ? " + " : "")
-       << "GV:";
-    WriteAsOperand(OS, BaseGV, /*PrintType=*/false);
-    NeedPlus = true;
-  }
-
-  if (BaseOffs)
-    OS << (NeedPlus ? " + " : "") << BaseOffs, NeedPlus = true;
-
-  if (BaseReg) {
-    OS << (NeedPlus ? " + " : "")
-       << "Base:";
-    WriteAsOperand(OS, BaseReg, /*PrintType=*/false);
-    NeedPlus = true;
-  }
-  if (Scale) {
-    OS << (NeedPlus ? " + " : "")
-       << Scale << "*";
-    WriteAsOperand(OS, ScaledReg, /*PrintType=*/false);
-  }
-
-  OS << ']';
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void ExtAddrMode::dump() const {
-  print(dbgs());
-  dbgs() << '\n';
-}
-#endif
-
-
-/// \brief A helper class for matching addressing modes.
-///
-/// This encapsulates the logic for matching the target-legal addressing modes.
-class AddressingModeMatcher {
-  SmallVectorImpl<Instruction*> &AddrModeInsts;
-  const TargetLowering &TLI;
-
-  /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and
-  /// the memory instruction that we're computing this address for.
-  Type *AccessTy;
-  Instruction *MemoryInst;
-
-  /// AddrMode - This is the addressing mode that we're building up.  This is
-  /// part of the return value of this addressing mode matching stuff.
-  ExtAddrMode &AddrMode;
-
-  /// IgnoreProfitability - This is set to true when we should not do
-  /// profitability checks.  When true, IsProfitableToFoldIntoAddressingMode
-  /// always returns true.
-  bool IgnoreProfitability;
-
-  AddressingModeMatcher(SmallVectorImpl<Instruction*> &AMI,
-                        const TargetLowering &T, Type *AT,
-                        Instruction *MI, ExtAddrMode &AM)
-    : AddrModeInsts(AMI), TLI(T), AccessTy(AT), MemoryInst(MI), AddrMode(AM) {
-    IgnoreProfitability = false;
-  }
-public:
-
-  /// Match - Find the maximal addressing mode that a load/store of V can fold,
-  /// give an access type of AccessTy.  This returns a list of involved
-  /// instructions in AddrModeInsts.
-  static ExtAddrMode Match(Value *V, Type *AccessTy,
-                           Instruction *MemoryInst,
-                           SmallVectorImpl<Instruction*> &AddrModeInsts,
-                           const TargetLowering &TLI) {
-    ExtAddrMode Result;
-
-    bool Success =
-      AddressingModeMatcher(AddrModeInsts, TLI, AccessTy,
-                            MemoryInst, Result).MatchAddr(V, 0);
-    (void)Success; assert(Success && "Couldn't select *anything*?");
-    return Result;
-  }
-private:
-  bool MatchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth);
-  bool MatchAddr(Value *V, unsigned Depth);
-  bool MatchOperationAddr(User *Operation, unsigned Opcode, unsigned Depth);
-  bool IsProfitableToFoldIntoAddressingMode(Instruction *I,
-                                            ExtAddrMode &AMBefore,
-                                            ExtAddrMode &AMAfter);
-  bool ValueAlreadyLiveAtInst(Value *Val, Value *KnownLive1, Value *KnownLive2);
-};
-
-/// MatchScaledValue - Try adding ScaleReg*Scale to the current addressing mode.
-/// Return true and update AddrMode if this addr mode is legal for the target,
-/// false if not.
-bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale,
-                                             unsigned Depth) {
-  // If Scale is 1, then this is the same as adding ScaleReg to the addressing
-  // mode.  Just process that directly.
-  if (Scale == 1)
-    return MatchAddr(ScaleReg, Depth);
-
-  // If the scale is 0, it takes nothing to add this.
-  if (Scale == 0)
-    return true;
-
-  // If we already have a scale of this value, we can add to it, otherwise, we
-  // need an available scale field.
-  if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg)
-    return false;
-
-  ExtAddrMode TestAddrMode = AddrMode;
-
-  // Add scale to turn X*4+X*3 -> X*7.  This could also do things like
-  // [A+B + A*7] -> [B+A*8].
-  TestAddrMode.Scale += Scale;
-  TestAddrMode.ScaledReg = ScaleReg;
-
-  // If the new address isn't legal, bail out.
-  if (!TLI.isLegalAddressingMode(TestAddrMode, AccessTy))
-    return false;
-
-  // It was legal, so commit it.
-  AddrMode = TestAddrMode;
-
-  // Okay, we decided that we can add ScaleReg+Scale to AddrMode.  Check now
-  // to see if ScaleReg is actually X+C.  If so, we can turn this into adding
-  // X*Scale + C*Scale to addr mode.
-  ConstantInt *CI = 0; Value *AddLHS = 0;
-  if (isa<Instruction>(ScaleReg) &&  // not a constant expr.
-      match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI)))) {
-    TestAddrMode.ScaledReg = AddLHS;
-    TestAddrMode.BaseOffs += CI->getSExtValue()*TestAddrMode.Scale;
-
-    // If this addressing mode is legal, commit it and remember that we folded
-    // this instruction.
-    if (TLI.isLegalAddressingMode(TestAddrMode, AccessTy)) {
-      AddrModeInsts.push_back(cast<Instruction>(ScaleReg));
-      AddrMode = TestAddrMode;
-      return true;
-    }
-  }
-
-  // Otherwise, not (x+c)*scale, just return what we have.
-  return true;
-}
-
-/// MightBeFoldableInst - This is a little filter, which returns true if an
-/// addressing computation involving I might be folded into a load/store
-/// accessing it.  This doesn't need to be perfect, but needs to accept at least
-/// the set of instructions that MatchOperationAddr can.
-static bool MightBeFoldableInst(Instruction *I) {
-  switch (I->getOpcode()) {
-  case Instruction::BitCast:
-    // Don't touch identity bitcasts.
-    if (I->getType() == I->getOperand(0)->getType())
-      return false;
-    return I->getType()->isPointerTy() || I->getType()->isIntegerTy();
-  case Instruction::PtrToInt:
-    // PtrToInt is always a noop, as we know that the int type is pointer sized.
-    return true;
-  case Instruction::IntToPtr:
-    // We know the input is intptr_t, so this is foldable.
-    return true;
-  case Instruction::Add:
-    return true;
-  case Instruction::Mul:
-  case Instruction::Shl:
-    // Can only handle X*C and X << C.
-    return isa<ConstantInt>(I->getOperand(1));
-  case Instruction::GetElementPtr:
-    return true;
-  default:
-    return false;
-  }
-}
-
-/// MatchOperationAddr - Given an instruction or constant expr, see if we can
-/// fold the operation into the addressing mode.  If so, update the addressing
-/// mode and return true, otherwise return false without modifying AddrMode.
-bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
-                                               unsigned Depth) {
-  // Avoid exponential behavior on extremely deep expression trees.
-  if (Depth >= 5) return false;
-
-  switch (Opcode) {
-  case Instruction::PtrToInt:
-    // PtrToInt is always a noop, as we know that the int type is pointer sized.
-    return MatchAddr(AddrInst->getOperand(0), Depth);
-  case Instruction::IntToPtr:
-    // This inttoptr is a no-op if the integer type is pointer sized.
-    if (TLI.getValueType(AddrInst->getOperand(0)->getType()) ==
-        TLI.getPointerTy(AddrInst->getType()->getPointerAddressSpace()))
-      return MatchAddr(AddrInst->getOperand(0), Depth);
-    return false;
-  case Instruction::BitCast:
-    // BitCast is always a noop, and we can handle it as long as it is
-    // int->int or pointer->pointer (we don't want int<->fp or something).
-    if ((AddrInst->getOperand(0)->getType()->isPointerTy() ||
-         AddrInst->getOperand(0)->getType()->isIntegerTy()) &&
-        // Don't touch identity bitcasts.  These were probably put here by LSR,
-        // and we don't want to mess around with them.  Assume it knows what it
-        // is doing.
-        AddrInst->getOperand(0)->getType() != AddrInst->getType())
-      return MatchAddr(AddrInst->getOperand(0), Depth);
-    return false;
-  case Instruction::Add: {
-    // Check to see if we can merge in the RHS then the LHS.  If so, we win.
-    ExtAddrMode BackupAddrMode = AddrMode;
-    unsigned OldSize = AddrModeInsts.size();
-    if (MatchAddr(AddrInst->getOperand(1), Depth+1) &&
-        MatchAddr(AddrInst->getOperand(0), Depth+1))
-      return true;
-
-    // Restore the old addr mode info.
-    AddrMode = BackupAddrMode;
-    AddrModeInsts.resize(OldSize);
-
-    // Otherwise this was over-aggressive.  Try merging in the LHS then the RHS.
-    if (MatchAddr(AddrInst->getOperand(0), Depth+1) &&
-        MatchAddr(AddrInst->getOperand(1), Depth+1))
-      return true;
-
-    // Otherwise we definitely can't merge the ADD in.
-    AddrMode = BackupAddrMode;
-    AddrModeInsts.resize(OldSize);
-    break;
-  }
-  //case Instruction::Or:
-  // TODO: We can handle "Or Val, Imm" iff this OR is equivalent to an ADD.
-  //break;
-  case Instruction::Mul:
-  case Instruction::Shl: {
-    // Can only handle X*C and X << C.
-    ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1));
-    if (!RHS) return false;
-    int64_t Scale = RHS->getSExtValue();
-    if (Opcode == Instruction::Shl)
-      Scale = 1LL << Scale;
-
-    return MatchScaledValue(AddrInst->getOperand(0), Scale, Depth);
-  }
-  case Instruction::GetElementPtr: {
-    // Scan the GEP.  We check it if it contains constant offsets and at most
-    // one variable offset.
-    int VariableOperand = -1;
-    unsigned VariableScale = 0;
-
-    int64_t ConstantOffset = 0;
-    const DataLayout *TD = TLI.getDataLayout();
-    gep_type_iterator GTI = gep_type_begin(AddrInst);
-    for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) {
-      if (StructType *STy = dyn_cast<StructType>(*GTI)) {
-        const StructLayout *SL = TD->getStructLayout(STy);
-        unsigned Idx =
-          cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue();
-        ConstantOffset += SL->getElementOffset(Idx);
-      } else {
-        uint64_t TypeSize = TD->getTypeAllocSize(GTI.getIndexedType());
-        if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) {
-          ConstantOffset += CI->getSExtValue()*TypeSize;
-        } else if (TypeSize) {  // Scales of zero don't do anything.
-          // We only allow one variable index at the moment.
-          if (VariableOperand != -1)
-            return false;
-
-          // Remember the variable index.
-          VariableOperand = i;
-          VariableScale = TypeSize;
-        }
-      }
-    }
-
-    // A common case is for the GEP to only do a constant offset.  In this case,
-    // just add it to the disp field and check validity.
-    if (VariableOperand == -1) {
-      AddrMode.BaseOffs += ConstantOffset;
-      if (ConstantOffset == 0 || TLI.isLegalAddressingMode(AddrMode, AccessTy)){
-        // Check to see if we can fold the base pointer in too.
-        if (MatchAddr(AddrInst->getOperand(0), Depth+1))
-          return true;
-      }
-      AddrMode.BaseOffs -= ConstantOffset;
-      return false;
-    }
-
-    // Save the valid addressing mode in case we can't match.
-    ExtAddrMode BackupAddrMode = AddrMode;
-    unsigned OldSize = AddrModeInsts.size();
-
-    // See if the scale and offset amount is valid for this target.
-    AddrMode.BaseOffs += ConstantOffset;
-
-    // Match the base operand of the GEP.
-    if (!MatchAddr(AddrInst->getOperand(0), Depth+1)) {
-      // If it couldn't be matched, just stuff the value in a register.
-      if (AddrMode.HasBaseReg) {
-        AddrMode = BackupAddrMode;
-        AddrModeInsts.resize(OldSize);
-        return false;
-      }
-      AddrMode.HasBaseReg = true;
-      AddrMode.BaseReg = AddrInst->getOperand(0);
-    }
-
-    // Match the remaining variable portion of the GEP.
-    if (!MatchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale,
-                          Depth)) {
-      // If it couldn't be matched, try stuffing the base into a register
-      // instead of matching it, and retrying the match of the scale.
-      AddrMode = BackupAddrMode;
-      AddrModeInsts.resize(OldSize);
-      if (AddrMode.HasBaseReg)
-        return false;
-      AddrMode.HasBaseReg = true;
-      AddrMode.BaseReg = AddrInst->getOperand(0);
-      AddrMode.BaseOffs += ConstantOffset;
-      if (!MatchScaledValue(AddrInst->getOperand(VariableOperand),
-                            VariableScale, Depth)) {
-        // If even that didn't work, bail.
-        AddrMode = BackupAddrMode;
-        AddrModeInsts.resize(OldSize);
-        return false;
-      }
-    }
-
-    return true;
-  }
-  }
-  return false;
-}
-
-/// MatchAddr - If we can, try to add the value of 'Addr' into the current
-/// addressing mode.  If Addr can't be added to AddrMode this returns false and
-/// leaves AddrMode unmodified.  This assumes that Addr is either a pointer type
-/// or intptr_t for the target.
-///
-bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) {
-  if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) {
-    // Fold in immediates if legal for the target.
-    AddrMode.BaseOffs += CI->getSExtValue();
-    if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
-      return true;
-    AddrMode.BaseOffs -= CI->getSExtValue();
-  } else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) {
-    // If this is a global variable, try to fold it into the addressing mode.
-    if (AddrMode.BaseGV == 0) {
-      AddrMode.BaseGV = GV;
-      if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
-        return true;
-      AddrMode.BaseGV = 0;
-    }
-  } else if (Instruction *I = dyn_cast<Instruction>(Addr)) {
-    ExtAddrMode BackupAddrMode = AddrMode;
-    unsigned OldSize = AddrModeInsts.size();
-
-    // Check to see if it is possible to fold this operation.
-    if (MatchOperationAddr(I, I->getOpcode(), Depth)) {
-      // Okay, it's possible to fold this.  Check to see if it is actually
-      // *profitable* to do so.  We use a simple cost model to avoid increasing
-      // register pressure too much.
-      if (I->hasOneUse() ||
-          IsProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) {
-        AddrModeInsts.push_back(I);
-        return true;
-      }
-
-      // It isn't profitable to do this, roll back.
-      //cerr << "NOT FOLDING: " << *I;
-      AddrMode = BackupAddrMode;
-      AddrModeInsts.resize(OldSize);
-    }
-  } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) {
-    if (MatchOperationAddr(CE, CE->getOpcode(), Depth))
-      return true;
-  } else if (isa<ConstantPointerNull>(Addr)) {
-    // Null pointer gets folded without affecting the addressing mode.
-    return true;
-  }
-
-  // Worse case, the target should support [reg] addressing modes. :)
-  if (!AddrMode.HasBaseReg) {
-    AddrMode.HasBaseReg = true;
-    AddrMode.BaseReg = Addr;
-    // Still check for legality in case the target supports [imm] but not [i+r].
-    if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
-      return true;
-    AddrMode.HasBaseReg = false;
-    AddrMode.BaseReg = 0;
-  }
-
-  // If the base register is already taken, see if we can do [r+r].
-  if (AddrMode.Scale == 0) {
-    AddrMode.Scale = 1;
-    AddrMode.ScaledReg = Addr;
-    if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
-      return true;
-    AddrMode.Scale = 0;
-    AddrMode.ScaledReg = 0;
-  }
-  // Couldn't match.
-  return false;
-}
-
-/// IsOperandAMemoryOperand - Check to see if all uses of OpVal by the specified
-/// inline asm call are due to memory operands.  If so, return true, otherwise
-/// return false.
-static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
-                                    const TargetLowering &TLI) {
-  TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints(ImmutableCallSite(CI));
-  for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
-    TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
-
-    // Compute the constraint code and ConstraintType to use.
-    TLI.ComputeConstraintToUse(OpInfo, SDValue());
-
-    // If this asm operand is our Value*, and if it isn't an indirect memory
-    // operand, we can't fold it!
-    if (OpInfo.CallOperandVal == OpVal &&
-        (OpInfo.ConstraintType != TargetLowering::C_Memory ||
-         !OpInfo.isIndirect))
-      return false;
-  }
-
-  return true;
-}
-
-/// FindAllMemoryUses - Recursively walk all the uses of I until we find a
-/// memory use.  If we find an obviously non-foldable instruction, return true.
-/// Add the ultimately found memory instructions to MemoryUses.
-static bool FindAllMemoryUses(Instruction *I,
-                SmallVectorImpl<std::pair<Instruction*,unsigned> > &MemoryUses,
-                              SmallPtrSet<Instruction*, 16> &ConsideredInsts,
-                              const TargetLowering &TLI) {
-  // If we already considered this instruction, we're done.
-  if (!ConsideredInsts.insert(I))
-    return false;
-
-  // If this is an obviously unfoldable instruction, bail out.
-  if (!MightBeFoldableInst(I))
-    return true;
-
-  // Loop over all the uses, recursively processing them.
-  for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
-       UI != E; ++UI) {
-    User *U = *UI;
-
-    if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
-      MemoryUses.push_back(std::make_pair(LI, UI.getOperandNo()));
-      continue;
-    }
-
-    if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
-      unsigned opNo = UI.getOperandNo();
-      if (opNo == 0) return true; // Storing addr, not into addr.
-      MemoryUses.push_back(std::make_pair(SI, opNo));
-      continue;
-    }
-
-    if (CallInst *CI = dyn_cast<CallInst>(U)) {
-      InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue());
-      if (!IA) return true;
-
-      // If this is a memory operand, we're cool, otherwise bail out.
-      if (!IsOperandAMemoryOperand(CI, IA, I, TLI))
-        return true;
-      continue;
-    }
-
-    if (FindAllMemoryUses(cast<Instruction>(U), MemoryUses, ConsideredInsts,
-                          TLI))
-      return true;
-  }
-
-  return false;
-}
-
-/// ValueAlreadyLiveAtInst - Retrn true if Val is already known to be live at
-/// the use site that we're folding it into.  If so, there is no cost to
-/// include it in the addressing mode.  KnownLive1 and KnownLive2 are two values
-/// that we know are live at the instruction already.
-bool AddressingModeMatcher::ValueAlreadyLiveAtInst(Value *Val,Value *KnownLive1,
-                                                   Value *KnownLive2) {
-  // If Val is either of the known-live values, we know it is live!
-  if (Val == 0 || Val == KnownLive1 || Val == KnownLive2)
-    return true;
-
-  // All values other than instructions and arguments (e.g. constants) are live.
-  if (!isa<Instruction>(Val) && !isa<Argument>(Val)) return true;
-
-  // If Val is a constant sized alloca in the entry block, it is live, this is
-  // true because it is just a reference to the stack/frame pointer, which is
-  // live for the whole function.
-  if (AllocaInst *AI = dyn_cast<AllocaInst>(Val))
-    if (AI->isStaticAlloca())
-      return true;
-
-  // Check to see if this value is already used in the memory instruction's
-  // block.  If so, it's already live into the block at the very least, so we
-  // can reasonably fold it.
-  return Val->isUsedInBasicBlock(MemoryInst->getParent());
-}
-
-/// IsProfitableToFoldIntoAddressingMode - It is possible for the addressing
-/// mode of the machine to fold the specified instruction into a load or store
-/// that ultimately uses it.  However, the specified instruction has multiple
-/// uses.  Given this, it may actually increase register pressure to fold it
-/// into the load.  For example, consider this code:
-///
-///     X = ...
-///     Y = X+1
-///     use(Y)   -> nonload/store
-///     Z = Y+1
-///     load Z
-///
-/// In this case, Y has multiple uses, and can be folded into the load of Z
-/// (yielding load [X+2]).  However, doing this will cause both "X" and "X+1" to
-/// be live at the use(Y) line.  If we don't fold Y into load Z, we use one
-/// fewer register.  Since Y can't be folded into "use(Y)" we don't increase the
-/// number of computations either.
-///
-/// Note that this (like most of CodeGenPrepare) is just a rough heuristic.  If
-/// X was live across 'load Z' for other reasons, we actually *would* want to
-/// fold the addressing mode in the Z case.  This would make Y die earlier.
-bool AddressingModeMatcher::
-IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
-                                     ExtAddrMode &AMAfter) {
-  if (IgnoreProfitability) return true;
-
-  // AMBefore is the addressing mode before this instruction was folded into it,
-  // and AMAfter is the addressing mode after the instruction was folded.  Get
-  // the set of registers referenced by AMAfter and subtract out those
-  // referenced by AMBefore: this is the set of values which folding in this
-  // address extends the lifetime of.
-  //
-  // Note that there are only two potential values being referenced here,
-  // BaseReg and ScaleReg (global addresses are always available, as are any
-  // folded immediates).
-  Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg;
-
-  // If the BaseReg or ScaledReg was referenced by the previous addrmode, their
-  // lifetime wasn't extended by adding this instruction.
-  if (ValueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg))
-    BaseReg = 0;
-  if (ValueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg))
-    ScaledReg = 0;
-
-  // If folding this instruction (and it's subexprs) didn't extend any live
-  // ranges, we're ok with it.
-  if (BaseReg == 0 && ScaledReg == 0)
-    return true;
-
-  // If all uses of this instruction are ultimately load/store/inlineasm's,
-  // check to see if their addressing modes will include this instruction.  If
-  // so, we can fold it into all uses, so it doesn't matter if it has multiple
-  // uses.
-  SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
-  SmallPtrSet<Instruction*, 16> ConsideredInsts;
-  if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI))
-    return false;  // Has a non-memory, non-foldable use!
-
-  // Now that we know that all uses of this instruction are part of a chain of
-  // computation involving only operations that could theoretically be folded
-  // into a memory use, loop over each of these uses and see if they could
-  // *actually* fold the instruction.
-  SmallVector<Instruction*, 32> MatchedAddrModeInsts;
-  for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) {
-    Instruction *User = MemoryUses[i].first;
-    unsigned OpNo = MemoryUses[i].second;
-
-    // Get the access type of this use.  If the use isn't a pointer, we don't
-    // know what it accesses.
-    Value *Address = User->getOperand(OpNo);
-    if (!Address->getType()->isPointerTy())
-      return false;
-    Type *AddressAccessTy = Address->getType()->getPointerElementType();
-
-    // Do a match against the root of this address, ignoring profitability. This
-    // will tell us if the addressing mode for the memory operation will
-    // *actually* cover the shared instruction.
-    ExtAddrMode Result;
-    AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, AddressAccessTy,
-                                  MemoryInst, Result);
-    Matcher.IgnoreProfitability = true;
-    bool Success = Matcher.MatchAddr(Address, 0);
-    (void)Success; assert(Success && "Couldn't select *anything*?");
-
-    // If the match didn't cover I, then it won't be shared by it.
-    if (std::find(MatchedAddrModeInsts.begin(), MatchedAddrModeInsts.end(),
-                  I) == MatchedAddrModeInsts.end())
-      return false;
-
-    MatchedAddrModeInsts.clear();
-  }
-
-  return true;
-}
-
-} // end anonymous namespace
-
-/// IsNonLocalValue - Return true if the specified values are defined in a
-/// different basic block than BB.
-static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
-  if (Instruction *I = dyn_cast<Instruction>(V))
-    return I->getParent() != BB;
-  return false;
-}
-
-/// OptimizeMemoryInst - Load and Store Instructions often have
-/// addressing modes that can do significant amounts of computation.  As such,
-/// instruction selection will try to get the load or store to do as much
-/// computation as possible for the program.  The problem is that isel can only
-/// see within a single block.  As such, we sink as much legal addressing mode
-/// stuff into the block as possible.
-///
-/// This method is used to optimize both load/store and inline asms with memory
-/// operands.
-bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
-                                        Type *AccessTy) {
-  Value *Repl = Addr;
-
-  // Try to collapse single-value PHI nodes.  This is necessary to undo
-  // unprofitable PRE transformations.
-  SmallVector<Value*, 8> worklist;
-  SmallPtrSet<Value*, 16> Visited;
-  worklist.push_back(Addr);
-
-  // Use a worklist to iteratively look through PHI nodes, and ensure that
-  // the addressing mode obtained from the non-PHI roots of the graph
-  // are equivalent.
-  Value *Consensus = 0;
-  unsigned NumUsesConsensus = 0;
-  bool IsNumUsesConsensusValid = false;
-  SmallVector<Instruction*, 16> AddrModeInsts;
-  ExtAddrMode AddrMode;
-  while (!worklist.empty()) {
-    Value *V = worklist.back();
-    worklist.pop_back();
-
-    // Break use-def graph loops.
-    if (!Visited.insert(V)) {
-      Consensus = 0;
-      break;
-    }
-
-    // For a PHI node, push all of its incoming values.
-    if (PHINode *P = dyn_cast<PHINode>(V)) {
-      for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i)
-        worklist.push_back(P->getIncomingValue(i));
-      continue;
-    }
-
-    // For non-PHIs, determine the addressing mode being computed.
-    SmallVector<Instruction*, 16> NewAddrModeInsts;
-    ExtAddrMode NewAddrMode =
-      AddressingModeMatcher::Match(V, AccessTy, MemoryInst,
-                                   NewAddrModeInsts, *TLI);
-
-    // This check is broken into two cases with very similar code to avoid using
-    // getNumUses() as much as possible. Some values have a lot of uses, so
-    // calling getNumUses() unconditionally caused a significant compile-time
-    // regression.
-    if (!Consensus) {
-      Consensus = V;
-      AddrMode = NewAddrMode;
-      AddrModeInsts = NewAddrModeInsts;
-      continue;
-    } else if (NewAddrMode == AddrMode) {
-      if (!IsNumUsesConsensusValid) {
-        NumUsesConsensus = Consensus->getNumUses();
-        IsNumUsesConsensusValid = true;
-      }
-
-      // Ensure that the obtained addressing mode is equivalent to that obtained
-      // for all other roots of the PHI traversal.  Also, when choosing one
-      // such root as representative, select the one with the most uses in order
-      // to keep the cost modeling heuristics in AddressingModeMatcher
-      // applicable.
-      unsigned NumUses = V->getNumUses();
-      if (NumUses > NumUsesConsensus) {
-        Consensus = V;
-        NumUsesConsensus = NumUses;
-        AddrModeInsts = NewAddrModeInsts;
-      }
-      continue;
-    }
-
-    Consensus = 0;
-    break;
-  }
-
-  // If the addressing mode couldn't be determined, or if multiple different
-  // ones were determined, bail out now.
-  if (!Consensus) return false;
-
-  // Check to see if any of the instructions supersumed by this addr mode are
-  // non-local to I's BB.
-  bool AnyNonLocal = false;
-  for (unsigned i = 0, e = AddrModeInsts.size(); i != e; ++i) {
-    if (IsNonLocalValue(AddrModeInsts[i], MemoryInst->getParent())) {
-      AnyNonLocal = true;
-      break;
-    }
-  }
-
-  // If all the instructions matched are already in this BB, don't do anything.
-  if (!AnyNonLocal) {
-    DEBUG(dbgs() << "CGP: Found      local addrmode: " << AddrMode << "\n");
-    return false;
-  }
-
-  // Insert this computation right after this user.  Since our caller is
-  // scanning from the top of the BB to the bottom, reuse of the expr are
-  // guaranteed to happen later.
-  IRBuilder<> Builder(MemoryInst);
-
-  // Now that we determined the addressing expression we want to use and know
-  // that we have to sink it into this block.  Check to see if we have already
-  // done this for some other load/store instr in this block.  If so, reuse the
-  // computation.
-  Value *&SunkAddr = SunkAddrs[Addr];
-  if (SunkAddr) {
-    DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode << " for "
-                 << *MemoryInst);
-    if (SunkAddr->getType() != Addr->getType())
-      SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType());
-  } else {
-    DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
-                 << *MemoryInst);
-    Type *IntPtrTy = TLI->getDataLayout()->getIntPtrType(Addr->getType());
-    Value *Result = 0;
-
-    // Start with the base register. Do this first so that subsequent address
-    // matching finds it last, which will prevent it from trying to match it
-    // as the scaled value in case it happens to be a mul. That would be
-    // problematic if we've sunk a different mul for the scale, because then
-    // we'd end up sinking both muls.
-    if (AddrMode.BaseReg) {
-      Value *V = AddrMode.BaseReg;
-      if (V->getType()->isPointerTy())
-        V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr");
-      if (V->getType() != IntPtrTy)
-        V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr");
-      Result = V;
-    }
-
-    // Add the scale value.
-    if (AddrMode.Scale) {
-      Value *V = AddrMode.ScaledReg;
-      if (V->getType() == IntPtrTy) {
-        // done.
-      } else if (V->getType()->isPointerTy()) {
-        V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr");
-      } else if (cast<IntegerType>(IntPtrTy)->getBitWidth() <
-                 cast<IntegerType>(V->getType())->getBitWidth()) {
-        V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr");
-      } else {
-        V = Builder.CreateSExt(V, IntPtrTy, "sunkaddr");
-      }
-      if (AddrMode.Scale != 1)
-        V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale),
-                              "sunkaddr");
-      if (Result)
-        Result = Builder.CreateAdd(Result, V, "sunkaddr");
-      else
-        Result = V;
-    }
-
-    // Add in the BaseGV if present.
-    if (AddrMode.BaseGV) {
-      Value *V = Builder.CreatePtrToInt(AddrMode.BaseGV, IntPtrTy, "sunkaddr");
-      if (Result)
-        Result = Builder.CreateAdd(Result, V, "sunkaddr");
-      else
-        Result = V;
-    }
-
-    // Add in the Base Offset if present.
-    if (AddrMode.BaseOffs) {
-      Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
-      if (Result)
-        Result = Builder.CreateAdd(Result, V, "sunkaddr");
-      else
-        Result = V;
-    }
-
-    if (Result == 0)
-      SunkAddr = Constant::getNullValue(Addr->getType());
-    else
-      SunkAddr = Builder.CreateIntToPtr(Result, Addr->getType(), "sunkaddr");
-  }
-
-  MemoryInst->replaceUsesOfWith(Repl, SunkAddr);
-
-  // If we have no uses, recursively delete the value and all dead instructions
-  // using it.
-  if (Repl->use_empty()) {
-    // This can cause recursive deletion, which can invalidate our iterator.
-    // Use a WeakVH to hold onto it in case this happens.
-    WeakVH IterHandle(CurInstIterator);
-    BasicBlock *BB = CurInstIterator->getParent();
-
-    RecursivelyDeleteTriviallyDeadInstructions(Repl, TLInfo);
-
-    if (IterHandle != CurInstIterator) {
-      // If the iterator instruction was recursively deleted, start over at the
-      // start of the block.
-      CurInstIterator = BB->begin();
-      SunkAddrs.clear();
-    }
-  }
-  ++NumMemoryInsts;
-  return true;
-}
-
-/// OptimizeInlineAsmInst - If there are any memory operands, use
-/// OptimizeMemoryInst to sink their address computing into the block when
-/// possible / profitable.
-bool CodeGenPrepare::OptimizeInlineAsmInst(CallInst *CS) {
-  bool MadeChange = false;
-
-  TargetLowering::AsmOperandInfoVector
-    TargetConstraints = TLI->ParseConstraints(CS);
-  unsigned ArgNo = 0;
-  for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
-    TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
-
-    // Compute the constraint code and ConstraintType to use.
-    TLI->ComputeConstraintToUse(OpInfo, SDValue());
-
-    if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
-        OpInfo.isIndirect) {
-      Value *OpVal = CS->getArgOperand(ArgNo++);
-      MadeChange |= OptimizeMemoryInst(CS, OpVal, OpVal->getType());
-    } else if (OpInfo.Type == InlineAsm::isInput)
-      ArgNo++;
-  }
-
-  return MadeChange;
-}
-
-/// MoveExtToFormExtLoad - Move a zext or sext fed by a load into the same
-/// basic block as the load, unless conditions are unfavorable. This allows
-/// SelectionDAG to fold the extend into the load.
-///
-bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *I) {
-  // Look for a load being extended.
-  LoadInst *LI = dyn_cast<LoadInst>(I->getOperand(0));
-  if (!LI) return false;
-
-  // If they're already in the same block, there's nothing to do.
-  if (LI->getParent() == I->getParent())
-    return false;
-
-  // If the load has other users and the truncate is not free, this probably
-  // isn't worthwhile.
-  if (!LI->hasOneUse() &&
-      TLI && (TLI->isTypeLegal(TLI->getValueType(LI->getType())) ||
-              !TLI->isTypeLegal(TLI->getValueType(I->getType()))) &&
-      !TLI->isTruncateFree(I->getType(), LI->getType()))
-    return false;
-
-  // Check whether the target supports casts folded into loads.
-  unsigned LType;
-  if (isa<ZExtInst>(I))
-    LType = ISD::ZEXTLOAD;
-  else {
-    assert(isa<SExtInst>(I) && "Unexpected ext type!");
-    LType = ISD::SEXTLOAD;
-  }
-  if (TLI && !TLI->isLoadExtLegal(LType, TLI->getValueType(LI->getType())))
-    return false;
-
-  // Move the extend into the same block as the load, so that SelectionDAG
-  // can fold it.
-  I->removeFromParent();
-  I->insertAfter(LI);
-  ++NumExtsMoved;
-  return true;
-}
-
-bool CodeGenPrepare::OptimizeExtUses(Instruction *I) {
-  BasicBlock *DefBB = I->getParent();
-
-  // If the result of a {s|z}ext and its source are both live out, rewrite all
-  // other uses of the source with result of extension.
-  Value *Src = I->getOperand(0);
-  if (Src->hasOneUse())
-    return false;
-
-  // Only do this xform if truncating is free.
-  if (TLI && !TLI->isTruncateFree(I->getType(), Src->getType()))
-    return false;
-
-  // Only safe to perform the optimization if the source is also defined in
-  // this block.
-  if (!isa<Instruction>(Src) || DefBB != cast<Instruction>(Src)->getParent())
-    return false;
-
-  bool DefIsLiveOut = false;
-  for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
-       UI != E; ++UI) {
-    Instruction *User = cast<Instruction>(*UI);
-
-    // Figure out which BB this ext is used in.
-    BasicBlock *UserBB = User->getParent();
-    if (UserBB == DefBB) continue;
-    DefIsLiveOut = true;
-    break;
-  }
-  if (!DefIsLiveOut)
-    return false;
-
-  // Make sure none of the uses are PHI nodes.
-  for (Value::use_iterator UI = Src->use_begin(), E = Src->use_end();
-       UI != E; ++UI) {
-    Instruction *User = cast<Instruction>(*UI);
-    BasicBlock *UserBB = User->getParent();
-    if (UserBB == DefBB) continue;
-    // Be conservative. We don't want this xform to end up introducing
-    // reloads just before load / store instructions.
-    if (isa<PHINode>(User) || isa<LoadInst>(User) || isa<StoreInst>(User))
-      return false;
-  }
-
-  // InsertedTruncs - Only insert one trunc in each block once.
-  DenseMap<BasicBlock*, Instruction*> InsertedTruncs;
-
-  bool MadeChange = false;
-  for (Value::use_iterator UI = Src->use_begin(), E = Src->use_end();
-       UI != E; ++UI) {
-    Use &TheUse = UI.getUse();
-    Instruction *User = cast<Instruction>(*UI);
-
-    // Figure out which BB this ext is used in.
-    BasicBlock *UserBB = User->getParent();
-    if (UserBB == DefBB) continue;
-
-    // Both src and def are live in this block. Rewrite the use.
-    Instruction *&InsertedTrunc = InsertedTruncs[UserBB];
-
-    if (!InsertedTrunc) {
-      BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
-      InsertedTrunc = new TruncInst(I, Src->getType(), "", InsertPt);
-    }
-
-    // Replace a use of the {s|z}ext source with a use of the result.
-    TheUse = InsertedTrunc;
-    ++NumExtUses;
-    MadeChange = true;
-  }
-
-  return MadeChange;
-}
-
-/// isFormingBranchFromSelectProfitable - Returns true if a SelectInst should be
-/// turned into an explicit branch.
-static bool isFormingBranchFromSelectProfitable(SelectInst *SI) {
-  // FIXME: This should use the same heuristics as IfConversion to determine
-  // whether a select is better represented as a branch.  This requires that
-  // branch probability metadata is preserved for the select, which is not the
-  // case currently.
-
-  CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition());
-
-  // If the branch is predicted right, an out of order CPU can avoid blocking on
-  // the compare.  Emit cmovs on compares with a memory operand as branches to
-  // avoid stalls on the load from memory.  If the compare has more than one use
-  // there's probably another cmov or setcc around so it's not worth emitting a
-  // branch.
-  if (!Cmp)
-    return false;
-
-  Value *CmpOp0 = Cmp->getOperand(0);
-  Value *CmpOp1 = Cmp->getOperand(1);
-
-  // We check that the memory operand has one use to avoid uses of the loaded
-  // value directly after the compare, making branches unprofitable.
-  return Cmp->hasOneUse() &&
-         ((isa<LoadInst>(CmpOp0) && CmpOp0->hasOneUse()) ||
-          (isa<LoadInst>(CmpOp1) && CmpOp1->hasOneUse()));
-}
-
-
-/// If we have a SelectInst that will likely profit from branch prediction,
-/// turn it into a branch.
-bool CodeGenPrepare::OptimizeSelectInst(SelectInst *SI) {
-  bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1);
-
-  // Can we convert the 'select' to CF ?
-  if (DisableSelectToBranch || OptSize || !TLI || VectorCond)
-    return false;
-
-  TargetLowering::SelectSupportKind SelectKind;
-  if (VectorCond)
-    SelectKind = TargetLowering::VectorMaskSelect;
-  else if (SI->getType()->isVectorTy())
-    SelectKind = TargetLowering::ScalarCondVectorVal;
-  else
-    SelectKind = TargetLowering::ScalarValSelect;
-
-  // Do we have efficient codegen support for this kind of 'selects' ?
-  if (TLI->isSelectSupported(SelectKind)) {
-    // We have efficient codegen support for the select instruction.
-    // Check if it is profitable to keep this 'select'.
-    if (!TLI->isPredictableSelectExpensive() ||
-        !isFormingBranchFromSelectProfitable(SI))
-      return false;
-  }
-
-  ModifiedDT = true;
-
-  // First, we split the block containing the select into 2 blocks.
-  BasicBlock *StartBlock = SI->getParent();
-  BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(SI));
-  BasicBlock *NextBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
-
-  // Create a new block serving as the landing pad for the branch.
-  BasicBlock *SmallBlock = BasicBlock::Create(SI->getContext(), "select.mid",
-                                             NextBlock->getParent(), NextBlock);
-
-  // Move the unconditional branch from the block with the select in it into our
-  // landing pad block.
-  StartBlock->getTerminator()->eraseFromParent();
-  BranchInst::Create(NextBlock, SmallBlock);
-
-  // Insert the real conditional branch based on the original condition.
-  BranchInst::Create(NextBlock, SmallBlock, SI->getCondition(), SI);
-
-  // The select itself is replaced with a PHI Node.
-  PHINode *PN = PHINode::Create(SI->getType(), 2, "", NextBlock->begin());
-  PN->takeName(SI);
-  PN->addIncoming(SI->getTrueValue(), StartBlock);
-  PN->addIncoming(SI->getFalseValue(), SmallBlock);
-  SI->replaceAllUsesWith(PN);
-  SI->eraseFromParent();
-
-  // Instruct OptimizeBlock to skip to the next block.
-  CurInstIterator = StartBlock->end();
-  ++NumSelectsExpanded;
-  return true;
-}
-
-bool CodeGenPrepare::OptimizeInst(Instruction *I) {
-  if (PHINode *P = dyn_cast<PHINode>(I)) {
-    // It is possible for very late stage optimizations (such as SimplifyCFG)
-    // to introduce PHI nodes too late to be cleaned up.  If we detect such a
-    // trivial PHI, go ahead and zap it here.
-    if (Value *V = SimplifyInstruction(P, TLI ? TLI->getDataLayout() : 0,
-                                       TLInfo, DT)) {
-      P->replaceAllUsesWith(V);
-      P->eraseFromParent();
-      ++NumPHIsElim;
-      return true;
-    }
-    return false;
-  }
-
-  if (CastInst *CI = dyn_cast<CastInst>(I)) {
-    // If the source of the cast is a constant, then this should have
-    // already been constant folded.  The only reason NOT to constant fold
-    // it is if something (e.g. LSR) was careful to place the constant
-    // evaluation in a block other than then one that uses it (e.g. to hoist
-    // the address of globals out of a loop).  If this is the case, we don't
-    // want to forward-subst the cast.
-    if (isa<Constant>(CI->getOperand(0)))
-      return false;
-
-    if (TLI && OptimizeNoopCopyExpression(CI, *TLI))
-      return true;
-
-    if (isa<ZExtInst>(I) || isa<SExtInst>(I)) {
-      bool MadeChange = MoveExtToFormExtLoad(I);
-      return MadeChange | OptimizeExtUses(I);
-    }
-    return false;
-  }
-
-  if (CmpInst *CI = dyn_cast<CmpInst>(I))
-    return OptimizeCmpExpression(CI);
-
-  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
-    if (TLI)
-      return OptimizeMemoryInst(I, I->getOperand(0), LI->getType());
-    return false;
-  }
-
-  if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
-    if (TLI)
-      return OptimizeMemoryInst(I, SI->getOperand(1),
-                                SI->getOperand(0)->getType());
-    return false;
-  }
-
-  if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
-    if (GEPI->hasAllZeroIndices()) {
-      /// The GEP operand must be a pointer, so must its result -> BitCast
-      Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(),
-                                        GEPI->getName(), GEPI);
-      GEPI->replaceAllUsesWith(NC);
-      GEPI->eraseFromParent();
-      ++NumGEPsElim;
-      OptimizeInst(NC);
-      return true;
-    }
-    return false;
-  }
-
-  if (CallInst *CI = dyn_cast<CallInst>(I))
-    return OptimizeCallInst(CI);
-
-  if (SelectInst *SI = dyn_cast<SelectInst>(I))
-    return OptimizeSelectInst(SI);
-
-  return false;
-}
-
-// In this pass we look for GEP and cast instructions that are used
-// across basic blocks and rewrite them to improve basic-block-at-a-time
-// selection.
-bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) {
-  SunkAddrs.clear();
-  bool MadeChange = false;
-
-  CurInstIterator = BB.begin();
-  while (CurInstIterator != BB.end())
-    MadeChange |= OptimizeInst(CurInstIterator++);
-
-  MadeChange |= DupRetToEnableTailCallOpts(&BB);
-
-  return MadeChange;
-}
-
-// llvm.dbg.value is far away from the value then iSel may not be able
-// handle it properly. iSel will drop llvm.dbg.value if it can not
-// find a node corresponding to the value.
-bool CodeGenPrepare::PlaceDbgValues(Function &F) {
-  bool MadeChange = false;
-  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
-    Instruction *PrevNonDbgInst = NULL;
-    for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
-      Instruction *Insn = BI; ++BI;
-      DbgValueInst *DVI = dyn_cast<DbgValueInst>(Insn);
-      if (!DVI) {
-        PrevNonDbgInst = Insn;
-        continue;
-      }
-
-      Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue());
-      if (VI && VI != PrevNonDbgInst && !VI->isTerminator()) {
-        DEBUG(dbgs() << "Moving Debug Value before :\n" << *DVI << ' ' << *VI);
-        DVI->removeFromParent();
-        if (isa<PHINode>(VI))
-          DVI->insertBefore(VI->getParent()->getFirstInsertionPt());
-        else
-          DVI->insertAfter(VI);
-        MadeChange = true;
-        ++NumDbgValueMoved;
-      }
-    }
-  }
-  return MadeChange;
-}
diff --git a/lib/Transforms/Scalar/ConstantHoisting.cpp b/lib/Transforms/Scalar/ConstantHoisting.cpp
new file mode 100644
index 0000000..57a1521
--- /dev/null
+++ b/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -0,0 +1,590 @@
+//===- ConstantHoisting.cpp - Prepare code for expensive constants --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass identifies expensive constants to hoist and coalesces them to
+// better prepare it for SelectionDAG-based code generation. This works around
+// the limitations of the basic-block-at-a-time approach.
+//
+// First it scans all instructions for integer constants and calculates its
+// cost. If the constant can be folded into the instruction (the cost is
+// TCC_Free) or the cost is just a simple operation (TCC_BASIC), then we don't
+// consider it expensive and leave it alone. This is the default behavior and
+// the default implementation of getIntImmCost will always return TCC_Free.
+//
+// If the cost is more than TCC_BASIC, then the integer constant can't be folded
+// into the instruction and it might be beneficial to hoist the constant.
+// Similar constants are coalesced to reduce register pressure and
+// materialization code.
+//
+// When a constant is hoisted, it is also hidden behind a bitcast to force it to
+// be live-out of the basic block. Otherwise the constant would be just
+// duplicated and each basic block would have its own copy in the SelectionDAG.
+// The SelectionDAG recognizes such constants as opaque and doesn't perform
+// certain transformations on them, which would create a new expensive constant.
+//
+// This optimization is only applied to integer constants in instructions and
+// simple (this means not nested) constant cast expressions. For example:
+// %0 = load i64* inttoptr (i64 big_constant to i64*)
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "consthoist"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+STATISTIC(NumConstantsHoisted, "Number of constants hoisted");
+STATISTIC(NumConstantsRebased, "Number of constants rebased");
+
+namespace {
+struct ConstantUser;
+struct RebasedConstantInfo;
+
+typedef SmallVector<ConstantUser, 8> ConstantUseListType;
+typedef SmallVector<RebasedConstantInfo, 4> RebasedConstantListType;
+
+/// \brief Keeps track of the user of a constant and the operand index where the
+/// constant is used.
+struct ConstantUser {
+  Instruction *Inst;
+  unsigned OpndIdx;
+
+  ConstantUser(Instruction *Inst, unsigned Idx) : Inst(Inst), OpndIdx(Idx) { }
+};
+
+/// \brief Keeps track of a constant candidate and its uses.
+struct ConstantCandidate {
+  ConstantUseListType Uses;
+  ConstantInt *ConstInt;
+  unsigned CumulativeCost;
+
+  ConstantCandidate(ConstantInt *ConstInt)
+    : ConstInt(ConstInt), CumulativeCost(0) { }
+
+  /// \brief Add the user to the use list and update the cost.
+  void addUser(Instruction *Inst, unsigned Idx, unsigned Cost) {
+    CumulativeCost += Cost;
+    Uses.push_back(ConstantUser(Inst, Idx));
+  }
+};
+
+/// \brief This represents a constant that has been rebased with respect to a
+/// base constant. The difference to the base constant is recorded in Offset.
+struct RebasedConstantInfo {
+  ConstantUseListType Uses;
+  Constant *Offset;
+
+  RebasedConstantInfo(ConstantUseListType &&Uses, Constant *Offset)
+    : Uses(Uses), Offset(Offset) { }
+};
+
+/// \brief A base constant and all its rebased constants.
+struct ConstantInfo {
+  ConstantInt *BaseConstant;
+  RebasedConstantListType RebasedConstants;
+};
+
+/// \brief The constant hoisting pass.
+class ConstantHoisting : public FunctionPass {
+  typedef DenseMap<ConstantInt *, unsigned> ConstCandMapType;
+  typedef std::vector<ConstantCandidate> ConstCandVecType;
+
+  const TargetTransformInfo *TTI;
+  DominatorTree *DT;
+  BasicBlock *Entry;
+
+  /// Keeps track of constant candidates found in the function.
+  ConstCandVecType ConstCandVec;
+
+  /// Keep track of cast instructions we already cloned.
+  SmallDenseMap<Instruction *, Instruction *> ClonedCastMap;
+
+  /// These are the final constants we decided to hoist.
+  SmallVector<ConstantInfo, 8> ConstantVec;
+public:
+  static char ID; // Pass identification, replacement for typeid
+  ConstantHoisting() : FunctionPass(ID), TTI(0), DT(0), Entry(0) {
+    initializeConstantHoistingPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &Fn) override;
+
+  const char *getPassName() const override { return "Constant Hoisting"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<TargetTransformInfo>();
+  }
+
+private:
+  /// \brief Initialize the pass.
+  void setup(Function &Fn) {
+    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    TTI = &getAnalysis<TargetTransformInfo>();
+    Entry = &Fn.getEntryBlock();
+  }
+
+  /// \brief Cleanup.
+  void cleanup() {
+    ConstantVec.clear();
+    ClonedCastMap.clear();
+    ConstCandVec.clear();
+
+    TTI = nullptr;
+    DT = nullptr;
+    Entry = nullptr;
+  }
+
+  Instruction *findMatInsertPt(Instruction *Inst, unsigned Idx = ~0U) const;
+  Instruction *findConstantInsertionPoint(const ConstantInfo &ConstInfo) const;
+  void collectConstantCandidates(ConstCandMapType &ConstCandMap,
+                                 Instruction *Inst, unsigned Idx,
+                                 ConstantInt *ConstInt);
+  void collectConstantCandidates(ConstCandMapType &ConstCandMap,
+                                 Instruction *Inst);
+  void collectConstantCandidates(Function &Fn);
+  void findAndMakeBaseConstant(ConstCandVecType::iterator S,
+                               ConstCandVecType::iterator E);
+  void findBaseConstants();
+  void emitBaseConstants(Instruction *Base, Constant *Offset,
+                         const ConstantUser &ConstUser);
+  bool emitBaseConstants();
+  void deleteDeadCastInst() const;
+  bool optimizeConstants(Function &Fn);
+};
+}
+
+char ConstantHoisting::ID = 0;
+INITIALIZE_PASS_BEGIN(ConstantHoisting, "consthoist", "Constant Hoisting",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_END(ConstantHoisting, "consthoist", "Constant Hoisting",
+                    false, false)
+
+FunctionPass *llvm::createConstantHoistingPass() {
+  return new ConstantHoisting();
+}
+
+/// \brief Perform the constant hoisting optimization for the given function.
+bool ConstantHoisting::runOnFunction(Function &Fn) {
+  DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n");
+  DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n');
+
+  setup(Fn);
+
+  bool MadeChange = optimizeConstants(Fn);
+
+  if (MadeChange) {
+    DEBUG(dbgs() << "********** Function after Constant Hoisting: "
+                 << Fn.getName() << '\n');
+    DEBUG(dbgs() << Fn);
+  }
+  DEBUG(dbgs() << "********** End Constant Hoisting **********\n");
+
+  cleanup();
+
+  return MadeChange;
+}
+
+
+/// \brief Find the constant materialization insertion point.
+Instruction *ConstantHoisting::findMatInsertPt(Instruction *Inst,
+                                               unsigned Idx) const {
+  // The simple and common case.
+  if (!isa<PHINode>(Inst) && !isa<LandingPadInst>(Inst))
+    return Inst;
+
+  // We can't insert directly before a phi node or landing pad. Insert before
+  // the terminator of the incoming or dominating block.
+  assert(Entry != Inst->getParent() && "PHI or landing pad in entry block!");
+  if (Idx != ~0U && isa<PHINode>(Inst))
+    return cast<PHINode>(Inst)->getIncomingBlock(Idx)->getTerminator();
+
+  BasicBlock *IDom = DT->getNode(Inst->getParent())->getIDom()->getBlock();
+  return IDom->getTerminator();
+}
+
+/// \brief Find an insertion point that dominates all uses.
+Instruction *ConstantHoisting::
+findConstantInsertionPoint(const ConstantInfo &ConstInfo) const {
+  assert(!ConstInfo.RebasedConstants.empty() && "Invalid constant info entry.");
+  // Collect all basic blocks.
+  SmallPtrSet<BasicBlock *, 8> BBs;
+  for (auto const &RCI : ConstInfo.RebasedConstants)
+    for (auto const &U : RCI.Uses)
+      BBs.insert(U.Inst->getParent());
+
+  if (BBs.count(Entry))
+    return &Entry->front();
+
+  while (BBs.size() >= 2) {
+    BasicBlock *BB, *BB1, *BB2;
+    BB1 = *BBs.begin();
+    BB2 = *std::next(BBs.begin());
+    BB = DT->findNearestCommonDominator(BB1, BB2);
+    if (BB == Entry)
+      return &Entry->front();
+    BBs.erase(BB1);
+    BBs.erase(BB2);
+    BBs.insert(BB);
+  }
+  assert((BBs.size() == 1) && "Expected only one element.");
+  Instruction &FirstInst = (*BBs.begin())->front();
+  return findMatInsertPt(&FirstInst);
+}
+
+
+/// \brief Record constant integer ConstInt for instruction Inst at operand
+/// index Idx.
+///
+/// The operand at index Idx is not necessarily the constant integer itself. It
+/// could also be a cast instruction or a constant expression that uses the
+// constant integer.
+void ConstantHoisting::collectConstantCandidates(ConstCandMapType &ConstCandMap,
+                                                 Instruction *Inst,
+                                                 unsigned Idx,
+                                                 ConstantInt *ConstInt) {
+  unsigned Cost;
+  // Ask the target about the cost of materializing the constant for the given
+  // instruction and operand index.
+  if (auto IntrInst = dyn_cast<IntrinsicInst>(Inst))
+    Cost = TTI->getIntImmCost(IntrInst->getIntrinsicID(), Idx,
+                              ConstInt->getValue(), ConstInt->getType());
+  else
+    Cost = TTI->getIntImmCost(Inst->getOpcode(), Idx, ConstInt->getValue(),
+                              ConstInt->getType());
+
+  // Ignore cheap integer constants.
+  if (Cost > TargetTransformInfo::TCC_Basic) {
+    ConstCandMapType::iterator Itr;
+    bool Inserted;
+    std::tie(Itr, Inserted) = ConstCandMap.insert(std::make_pair(ConstInt, 0));
+    if (Inserted) {
+      ConstCandVec.push_back(ConstantCandidate(ConstInt));
+      Itr->second = ConstCandVec.size() - 1;
+    }
+    ConstCandVec[Itr->second].addUser(Inst, Idx, Cost);
+    DEBUG(if (isa<ConstantInt>(Inst->getOperand(Idx)))
+            dbgs() << "Collect constant " << *ConstInt << " from " << *Inst
+                   << " with cost " << Cost << '\n';
+          else
+          dbgs() << "Collect constant " << *ConstInt << " indirectly from "
+                 << *Inst << " via " << *Inst->getOperand(Idx) << " with cost "
+                 << Cost << '\n';
+    );
+  }
+}
+
+/// \brief Scan the instruction for expensive integer constants and record them
+/// in the constant candidate vector.
+void ConstantHoisting::collectConstantCandidates(ConstCandMapType &ConstCandMap,
+                                                 Instruction *Inst) {
+  // Skip all cast instructions. They are visited indirectly later on.
+  if (Inst->isCast())
+    return;
+
+  // Can't handle inline asm. Skip it.
+  if (auto Call = dyn_cast<CallInst>(Inst))
+    if (isa<InlineAsm>(Call->getCalledValue()))
+      return;
+
+  // Scan all operands.
+  for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) {
+    Value *Opnd = Inst->getOperand(Idx);
+
+    // Visit constant integers.
+    if (auto ConstInt = dyn_cast<ConstantInt>(Opnd)) {
+      collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
+      continue;
+    }
+
+    // Visit cast instructions that have constant integers.
+    if (auto CastInst = dyn_cast<Instruction>(Opnd)) {
+      // Only visit cast instructions, which have been skipped. All other
+      // instructions should have already been visited.
+      if (!CastInst->isCast())
+        continue;
+
+      if (auto *ConstInt = dyn_cast<ConstantInt>(CastInst->getOperand(0))) {
+        // Pretend the constant is directly used by the instruction and ignore
+        // the cast instruction.
+        collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
+        continue;
+      }
+    }
+
+    // Visit constant expressions that have constant integers.
+    if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) {
+      // Only visit constant cast expressions.
+      if (!ConstExpr->isCast())
+        continue;
+
+      if (auto ConstInt = dyn_cast<ConstantInt>(ConstExpr->getOperand(0))) {
+        // Pretend the constant is directly used by the instruction and ignore
+        // the constant expression.
+        collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
+        continue;
+      }
+    }
+  } // end of for all operands
+}
+
+/// \brief Collect all integer constants in the function that cannot be folded
+/// into an instruction itself.
+void ConstantHoisting::collectConstantCandidates(Function &Fn) {
+  ConstCandMapType ConstCandMap;
+  for (Function::iterator BB : Fn)
+    for (BasicBlock::iterator Inst : *BB)
+      collectConstantCandidates(ConstCandMap, Inst);
+}
+
+/// \brief Find the base constant within the given range and rebase all other
+/// constants with respect to the base constant.
+void ConstantHoisting::findAndMakeBaseConstant(ConstCandVecType::iterator S,
+                                               ConstCandVecType::iterator E) {
+  auto MaxCostItr = S;
+  unsigned NumUses = 0;
+  // Use the constant that has the maximum cost as base constant.
+  for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
+    NumUses += ConstCand->Uses.size();
+    if (ConstCand->CumulativeCost > MaxCostItr->CumulativeCost)
+      MaxCostItr = ConstCand;
+  }
+
+  // Don't hoist constants that have only one use.
+  if (NumUses <= 1)
+    return;
+
+  ConstantInfo ConstInfo;
+  ConstInfo.BaseConstant = MaxCostItr->ConstInt;
+  Type *Ty = ConstInfo.BaseConstant->getType();
+
+  // Rebase the constants with respect to the base constant.
+  for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
+    APInt Diff = ConstCand->ConstInt->getValue() -
+                 ConstInfo.BaseConstant->getValue();
+    Constant *Offset = Diff == 0 ? nullptr : ConstantInt::get(Ty, Diff);
+    ConstInfo.RebasedConstants.push_back(
+      RebasedConstantInfo(std::move(ConstCand->Uses), Offset));
+  }
+  ConstantVec.push_back(ConstInfo);
+}
+
+/// \brief Finds and combines constant candidates that can be easily
+/// rematerialized with an add from a common base constant.
+void ConstantHoisting::findBaseConstants() {
+  // Sort the constants by value and type. This invalidates the mapping!
+  std::sort(ConstCandVec.begin(), ConstCandVec.end(),
+            [](const ConstantCandidate &LHS, const ConstantCandidate &RHS) {
+    if (LHS.ConstInt->getType() != RHS.ConstInt->getType())
+      return LHS.ConstInt->getType()->getBitWidth() <
+             RHS.ConstInt->getType()->getBitWidth();
+    return LHS.ConstInt->getValue().ult(RHS.ConstInt->getValue());
+  });
+
+  // Simple linear scan through the sorted constant candidate vector for viable
+  // merge candidates.
+  auto MinValItr = ConstCandVec.begin();
+  for (auto CC = std::next(ConstCandVec.begin()), E = ConstCandVec.end();
+       CC != E; ++CC) {
+    if (MinValItr->ConstInt->getType() == CC->ConstInt->getType()) {
+      // Check if the constant is in range of an add with immediate.
+      APInt Diff = CC->ConstInt->getValue() - MinValItr->ConstInt->getValue();
+      if ((Diff.getBitWidth() <= 64) &&
+          TTI->isLegalAddImmediate(Diff.getSExtValue()))
+        continue;
+    }
+    // We either have now a different constant type or the constant is not in
+    // range of an add with immediate anymore.
+    findAndMakeBaseConstant(MinValItr, CC);
+    // Start a new base constant search.
+    MinValItr = CC;
+  }
+  // Finalize the last base constant search.
+  findAndMakeBaseConstant(MinValItr, ConstCandVec.end());
+}
+
+/// \brief Updates the operand at Idx in instruction Inst with the result of
+///        instruction Mat. If the instruction is a PHI node then special
+///        handling for duplicate values form the same incomming basic block is
+///        required.
+/// \return The update will always succeed, but the return value indicated if
+///         Mat was used for the update or not.
+static bool updateOperand(Instruction *Inst, unsigned Idx, Instruction *Mat) {
+  if (auto PHI = dyn_cast<PHINode>(Inst)) {
+    // Check if any previous operand of the PHI node has the same incoming basic
+    // block. This is a very odd case that happens when the incoming basic block
+    // has a switch statement. In this case use the same value as the previous
+    // operand(s), otherwise we will fail verification due to different values.
+    // The values are actually the same, but the variable names are different
+    // and the verifier doesn't like that.
+    BasicBlock *IncomingBB = PHI->getIncomingBlock(Idx);
+    for (unsigned i = 0; i < Idx; ++i) {
+      if (PHI->getIncomingBlock(i) == IncomingBB) {
+        Value *IncomingVal = PHI->getIncomingValue(i);
+        Inst->setOperand(Idx, IncomingVal);
+        return false;
+      }
+    }
+  }
+
+  Inst->setOperand(Idx, Mat);
+  return true;
+}
+
+/// \brief Emit materialization code for all rebased constants and update their
+/// users.
+void ConstantHoisting::emitBaseConstants(Instruction *Base, Constant *Offset,
+                                         const ConstantUser &ConstUser) {
+  Instruction *Mat = Base;
+  if (Offset) {
+    Instruction *InsertionPt = findMatInsertPt(ConstUser.Inst,
+                                               ConstUser.OpndIdx);
+    Mat = BinaryOperator::Create(Instruction::Add, Base, Offset,
+                                 "const_mat", InsertionPt);
+
+    DEBUG(dbgs() << "Materialize constant (" << *Base->getOperand(0)
+                 << " + " << *Offset << ") in BB "
+                 << Mat->getParent()->getName() << '\n' << *Mat << '\n');
+    Mat->setDebugLoc(ConstUser.Inst->getDebugLoc());
+  }
+  Value *Opnd = ConstUser.Inst->getOperand(ConstUser.OpndIdx);
+
+  // Visit constant integer.
+  if (isa<ConstantInt>(Opnd)) {
+    DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+    if (!updateOperand(ConstUser.Inst, ConstUser.OpndIdx, Mat) && Offset)
+      Mat->eraseFromParent();
+    DEBUG(dbgs() << "To    : " << *ConstUser.Inst << '\n');
+    return;
+  }
+
+  // Visit cast instruction.
+  if (auto CastInst = dyn_cast<Instruction>(Opnd)) {
+    assert(CastInst->isCast() && "Expected an cast instruction!");
+    // Check if we already have visited this cast instruction before to avoid
+    // unnecessary cloning.
+    Instruction *&ClonedCastInst = ClonedCastMap[CastInst];
+    if (!ClonedCastInst) {
+      ClonedCastInst = CastInst->clone();
+      ClonedCastInst->setOperand(0, Mat);
+      ClonedCastInst->insertAfter(CastInst);
+      // Use the same debug location as the original cast instruction.
+      ClonedCastInst->setDebugLoc(CastInst->getDebugLoc());
+      DEBUG(dbgs() << "Clone instruction: " << *ClonedCastInst << '\n'
+                   << "To               : " << *CastInst << '\n');
+    }
+
+    DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+    updateOperand(ConstUser.Inst, ConstUser.OpndIdx, ClonedCastInst);
+    DEBUG(dbgs() << "To    : " << *ConstUser.Inst << '\n');
+    return;
+  }
+
+  // Visit constant expression.
+  if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) {
+    Instruction *ConstExprInst = ConstExpr->getAsInstruction();
+    ConstExprInst->setOperand(0, Mat);
+    ConstExprInst->insertBefore(findMatInsertPt(ConstUser.Inst,
+                                                ConstUser.OpndIdx));
+
+    // Use the same debug location as the instruction we are about to update.
+    ConstExprInst->setDebugLoc(ConstUser.Inst->getDebugLoc());
+
+    DEBUG(dbgs() << "Create instruction: " << *ConstExprInst << '\n'
+                 << "From              : " << *ConstExpr << '\n');
+    DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+    if (!updateOperand(ConstUser.Inst, ConstUser.OpndIdx, ConstExprInst)) {
+      ConstExprInst->eraseFromParent();
+      if (Offset)
+        Mat->eraseFromParent();
+    }
+    DEBUG(dbgs() << "To    : " << *ConstUser.Inst << '\n');
+    return;
+  }
+}
+
+/// \brief Hoist and hide the base constant behind a bitcast and emit
+/// materialization code for derived constants.
+bool ConstantHoisting::emitBaseConstants() {
+  bool MadeChange = false;
+  for (auto const &ConstInfo : ConstantVec) {
+    // Hoist and hide the base constant behind a bitcast.
+    Instruction *IP = findConstantInsertionPoint(ConstInfo);
+    IntegerType *Ty = ConstInfo.BaseConstant->getType();
+    Instruction *Base =
+      new BitCastInst(ConstInfo.BaseConstant, Ty, "const", IP);
+    DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseConstant << ") to BB "
+                 << IP->getParent()->getName() << '\n' << *Base << '\n');
+    NumConstantsHoisted++;
+
+    // Emit materialization code for all rebased constants.
+    for (auto const &RCI : ConstInfo.RebasedConstants) {
+      NumConstantsRebased++;
+      for (auto const &U : RCI.Uses)
+        emitBaseConstants(Base, RCI.Offset, U);
+    }
+
+    // Use the same debug location as the last user of the constant.
+    assert(!Base->use_empty() && "The use list is empty!?");
+    assert(isa<Instruction>(Base->user_back()) &&
+           "All uses should be instructions.");
+    Base->setDebugLoc(cast<Instruction>(Base->user_back())->getDebugLoc());
+
+    // Correct for base constant, which we counted above too.
+    NumConstantsRebased--;
+    MadeChange = true;
+  }
+  return MadeChange;
+}
+
+/// \brief Check all cast instructions we made a copy of and remove them if they
+/// have no more users.
+void ConstantHoisting::deleteDeadCastInst() const {
+  for (auto const &I : ClonedCastMap)
+    if (I.first->use_empty())
+      I.first->eraseFromParent();
+}
+
+/// \brief Optimize expensive integer constants in the given function.
+bool ConstantHoisting::optimizeConstants(Function &Fn) {
+  // Collect all constant candidates.
+  collectConstantCandidates(Fn);
+
+  // There are no constant candidates to worry about.
+  if (ConstCandVec.empty())
+    return false;
+
+  // Combine constants that can be easily materialized with an add from a common
+  // base constant.
+  findBaseConstants();
+
+  // There are no constants to emit.
+  if (ConstantVec.empty())
+    return false;
+
+  // Finally hoist the base constant and emit materialization code for dependent
+  // constants.
+  bool MadeChange = emitBaseConstants();
+
+  // Cleanup dead instructions.
+  deleteDeadCastInst();
+
+  return MadeChange;
+}
diff --git a/lib/Transforms/Scalar/ConstantProp.cpp b/lib/Transforms/Scalar/ConstantProp.cpp
index d5a96ec..7045b36 100644
--- a/lib/Transforms/Scalar/ConstantProp.cpp
+++ b/lib/Transforms/Scalar/ConstantProp.cpp
@@ -24,9 +24,9 @@
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/Pass.h"
-#include "llvm/Support/InstIterator.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 #include <set>
 using namespace llvm;
@@ -40,9 +40,9 @@ namespace {
       initializeConstantPropagationPass(*PassRegistry::getPassRegistry());
     }
 
-    bool runOnFunction(Function &F);
+    bool runOnFunction(Function &F) override;
 
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesCFG();
       AU.addRequired<TargetLibraryInfo>();
     }
@@ -67,7 +67,8 @@ bool ConstantPropagation::runOnFunction(Function &F) {
       WorkList.insert(&*i);
   }
   bool Changed = false;
-  DataLayout *TD = getAnalysisIfAvailable<DataLayout>();
+  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+  const DataLayout *DL = DLP ? &DLP->getDataLayout() : 0;
   TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
 
   while (!WorkList.empty()) {
@@ -75,12 +76,11 @@ bool ConstantPropagation::runOnFunction(Function &F) {
     WorkList.erase(WorkList.begin());    // Get an element from the worklist...
 
     if (!I->use_empty())                 // Don't muck with dead instructions...
-      if (Constant *C = ConstantFoldInstruction(I, TD, TLI)) {
+      if (Constant *C = ConstantFoldInstruction(I, DL, TLI)) {
         // Add all of the users of this instruction to the worklist, they might
         // be constant propagatable now...
-        for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
-             UI != UE; ++UI)
-          WorkList.insert(cast<Instruction>(*UI));
+        for (User *U : I->users())
+          WorkList.insert(cast<Instruction>(U));
 
         // Replace all of the uses of a variable with uses of the constant.
         I->replaceAllUsesWith(C);
diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 995782e..0490767 100644
--- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -16,11 +16,11 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/IR/CFG.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -48,9 +48,9 @@ namespace {
      initializeCorrelatedValuePropagationPass(*PassRegistry::getPassRegistry());
     }
 
-    bool runOnFunction(Function &F);
+    bool runOnFunction(Function &F) override;
 
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<LazyValueInfo>();
     }
   };
@@ -281,6 +281,9 @@ bool CorrelatedValuePropagation::processSwitch(SwitchInst *SI) {
 }
 
 bool CorrelatedValuePropagation::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
   LVI = &getAnalysis<LazyValueInfo>();
 
   bool FnChanged = false;
diff --git a/lib/Transforms/Scalar/DCE.cpp b/lib/Transforms/Scalar/DCE.cpp
index e8a090a..8377fd9 100644
--- a/lib/Transforms/Scalar/DCE.cpp
+++ b/lib/Transforms/Scalar/DCE.cpp
@@ -19,9 +19,9 @@
 #define DEBUG_TYPE "dce"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/Pass.h"
-#include "llvm/Support/InstIterator.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
@@ -38,7 +38,9 @@ namespace {
     DeadInstElimination() : BasicBlockPass(ID) {
       initializeDeadInstEliminationPass(*PassRegistry::getPassRegistry());
     }
-    virtual bool runOnBasicBlock(BasicBlock &BB) {
+    bool runOnBasicBlock(BasicBlock &BB) override {
+      if (skipOptnoneFunction(BB))
+        return false;
       TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
       bool Changed = false;
       for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) {
@@ -52,7 +54,7 @@ namespace {
       return Changed;
     }
 
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesCFG();
     }
   };
@@ -77,9 +79,9 @@ namespace {
       initializeDCEPass(*PassRegistry::getPassRegistry());
     }
 
-    virtual bool runOnFunction(Function &F);
+    bool runOnFunction(Function &F) override;
 
-     virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesCFG();
     }
  };
@@ -89,6 +91,9 @@ char DCE::ID = 0;
 INITIALIZE_PASS(DCE, "dce", "Dead Code Elimination", false, false)
 
 bool DCE::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
   TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
 
   // Start out with all of the instructions in the worklist...
diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 57432c7..f54c00d 100644
--- a/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -22,12 +22,12 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/CaptureTracking.h"
-#include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/MemoryDependenceAnalysis.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Instructions.h"
@@ -53,10 +53,13 @@ namespace {
       initializeDSEPass(*PassRegistry::getPassRegistry());
     }
 
-    virtual bool runOnFunction(Function &F) {
+    bool runOnFunction(Function &F) override {
+      if (skipOptnoneFunction(F))
+        return false;
+
       AA = &getAnalysis<AliasAnalysis>();
       MD = &getAnalysis<MemoryDependenceAnalysis>();
-      DT = &getAnalysis<DominatorTree>();
+      DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
       TLI = AA->getTargetLibraryInfo();
 
       bool Changed = false;
@@ -76,13 +79,13 @@ namespace {
     void RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc,
                                SmallSetVector<Value*, 16> &DeadStackObjects);
 
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesCFG();
-      AU.addRequired<DominatorTree>();
+      AU.addRequired<DominatorTreeWrapperPass>();
       AU.addRequired<AliasAnalysis>();
       AU.addRequired<MemoryDependenceAnalysis>();
       AU.addPreserved<AliasAnalysis>();
-      AU.addPreserved<DominatorTree>();
+      AU.addPreserved<DominatorTreeWrapperPass>();
       AU.addPreserved<MemoryDependenceAnalysis>();
     }
   };
@@ -90,7 +93,7 @@ namespace {
 
 char DSE::ID = 0;
 INITIALIZE_PASS_BEGIN(DSE, "dse", "Dead Store Elimination", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_PASS_END(DSE, "dse", "Dead Store Elimination", false, false)
@@ -190,6 +193,7 @@ static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo *TLI) {
 /// describe the memory operations for this instruction.
 static AliasAnalysis::Location
 getLocForWrite(Instruction *Inst, AliasAnalysis &AA) {
+  const DataLayout *DL = AA.getDataLayout();
   if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
     return AA.getLocation(SI);
 
@@ -199,7 +203,7 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) {
     // If we don't have target data around, an unknown size in Location means
     // that we should use the size of the pointee type.  This isn't valid for
     // memset/memcpy, which writes more than an i8.
-    if (Loc.Size == AliasAnalysis::UnknownSize && AA.getDataLayout() == 0)
+    if (Loc.Size == AliasAnalysis::UnknownSize && DL == 0)
       return AliasAnalysis::Location();
     return Loc;
   }
@@ -213,7 +217,7 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) {
     // If we don't have target data around, an unknown size in Location means
     // that we should use the size of the pointee type.  This isn't valid for
     // init.trampoline, which writes more than an i8.
-    if (AA.getDataLayout() == 0) return AliasAnalysis::Location();
+    if (DL == 0) return AliasAnalysis::Location();
 
     // FIXME: We don't know the size of the trampoline, so we can't really
     // handle it here.
@@ -341,6 +345,7 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later,
                                    AliasAnalysis &AA,
                                    int64_t &EarlierOff,
                                    int64_t &LaterOff) {
+  const DataLayout *DL = AA.getDataLayout();
   const Value *P1 = Earlier.Ptr->stripPointerCasts();
   const Value *P2 = Later.Ptr->stripPointerCasts();
 
@@ -354,8 +359,7 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later,
       // If we have no DataLayout information around, then the size of the store
       // is inferrable from the pointee type.  If they are the same type, then
       // we know that the store is safe.
-      if (AA.getDataLayout() == 0 &&
-          Later.Ptr->getType() == Earlier.Ptr->getType())
+      if (DL == 0 && Later.Ptr->getType() == Earlier.Ptr->getType())
         return OverwriteComplete;
 
       return OverwriteUnknown;
@@ -369,17 +373,14 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later,
   // Otherwise, we have to have size information, and the later store has to be
   // larger than the earlier one.
   if (Later.Size == AliasAnalysis::UnknownSize ||
-      Earlier.Size == AliasAnalysis::UnknownSize ||
-      AA.getDataLayout() == 0)
+      Earlier.Size == AliasAnalysis::UnknownSize || DL == 0)
     return OverwriteUnknown;
 
   // Check to see if the later store is to the entire object (either a global,
-  // an alloca, or a byval argument).  If so, then it clearly overwrites any
-  // other store to the same object.
-  const DataLayout *TD = AA.getDataLayout();
-
-  const Value *UO1 = GetUnderlyingObject(P1, TD),
-              *UO2 = GetUnderlyingObject(P2, TD);
+  // an alloca, or a byval/inalloca argument).  If so, then it clearly
+  // overwrites any other store to the same object.
+  const Value *UO1 = GetUnderlyingObject(P1, DL),
+              *UO2 = GetUnderlyingObject(P2, DL);
 
   // If we can't resolve the same pointers to the same object, then we can't
   // analyze them at all.
@@ -397,8 +398,8 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later,
   // pointers are equal, then we can reason about the two stores.
   EarlierOff = 0;
   LaterOff = 0;
-  const Value *BP1 = GetPointerBaseWithConstantOffset(P1, EarlierOff, TD);
-  const Value *BP2 = GetPointerBaseWithConstantOffset(P2, LaterOff, TD);
+  const Value *BP1 = GetPointerBaseWithConstantOffset(P1, EarlierOff, DL);
+  const Value *BP2 = GetPointerBaseWithConstantOffset(P2, LaterOff, DL);
 
   // If the base pointers still differ, we have two completely different stores.
   if (BP1 != BP2)
@@ -679,7 +680,7 @@ bool DSE::HandleFree(CallInst *F) {
       if (!AA->isMustAlias(F->getArgOperand(0), DepPointer))
         break;
 
-      Instruction *Next = llvm::next(BasicBlock::iterator(Dependency));
+      Instruction *Next = std::next(BasicBlock::iterator(Dependency));
 
       // DCE instructions only used to calculate that store
       DeleteDeadInstruction(Dependency, *MD, TLI);
@@ -701,22 +702,6 @@ bool DSE::HandleFree(CallInst *F) {
   return MadeChange;
 }
 
-namespace {
-  struct CouldRef {
-    typedef Value *argument_type;
-    const CallSite CS;
-    AliasAnalysis *AA;
-
-    bool operator()(Value *I) {
-      // See if the call site touches the value.
-      AliasAnalysis::ModRefResult A =
-        AA->getModRefInfo(CS, I, getPointerSize(I, *AA));
-
-      return A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref;
-    }
-  };
-}
-
 /// handleEndBlock - Remove dead stores to stack-allocated locations in the
 /// function end block.  Ex:
 /// %A = alloca i32
@@ -742,11 +727,11 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
       DeadStackObjects.insert(I);
   }
 
-  // Treat byval arguments the same, stores to them are dead at the end of the
-  // function.
+  // Treat byval or inalloca arguments the same, stores to them are dead at the
+  // end of the function.
   for (Function::arg_iterator AI = BB.getParent()->arg_begin(),
        AE = BB.getParent()->arg_end(); AI != AE; ++AI)
-    if (AI->hasByValAttr())
+    if (AI->hasByValOrInAllocaAttr())
       DeadStackObjects.insert(AI);
 
   // Scan the basic block backwards
@@ -776,7 +761,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
               for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(),
                    E = Pointers.end(); I != E; ++I) {
                 dbgs() << **I;
-                if (llvm::next(I) != E)
+                if (std::next(I) != E)
                   dbgs() << ", ";
               }
               dbgs() << '\n');
@@ -818,8 +803,13 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
 
       // If the call might load from any of our allocas, then any store above
       // the call is live.
-      CouldRef Pred = { CS, AA };
-      DeadStackObjects.remove_if(Pred);
+      DeadStackObjects.remove_if([&](Value *I) {
+        // See if the call site touches the value.
+        AliasAnalysis::ModRefResult A =
+            AA->getModRefInfo(CS, I, getPointerSize(I, *AA));
+
+        return A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref;
+      });
 
       // If all of the allocas were clobbered by the call then we're not going
       // to find anything else to process.
@@ -862,20 +852,6 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
   return MadeChange;
 }
 
-namespace {
-  struct CouldAlias {
-    typedef Value *argument_type;
-    const AliasAnalysis::Location &LoadedLoc;
-    AliasAnalysis *AA;
-
-    bool operator()(Value *I) {
-      // See if the loaded location could alias the stack location.
-      AliasAnalysis::Location StackLoc(I, getPointerSize(I, *AA));
-      return !AA->isNoAlias(StackLoc, LoadedLoc);
-    }
-  };
-}
-
 /// RemoveAccessedObjects - Check to see if the specified location may alias any
 /// of the stack objects in the DeadStackObjects set.  If so, they become live
 /// because the location is being loaded.
@@ -895,6 +871,9 @@ void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc,
   }
 
   // Remove objects that could alias LoadedLoc.
-  CouldAlias Pred = { LoadedLoc, AA };
-  DeadStackObjects.remove_if(Pred);
+  DeadStackObjects.remove_if([&](Value *I) {
+    // See if the loaded location could alias the stack location.
+    AliasAnalysis::Location StackLoc(I, getPointerSize(I, *AA));
+    return !AA->isNoAlias(StackLoc, LoadedLoc);
+  });
 }
diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp
index 5266894..af2c3d1 100644
--- a/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -17,16 +17,16 @@
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/ScopedHashTable.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/RecyclingAllocator.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/Local.h"
-#include <deque>
+#include <vector>
 using namespace llvm;
 
 STATISTIC(NumSimplify, "Number of instructions simplified or DCE'd");
@@ -262,7 +262,7 @@ namespace {
 /// cases.
 class EarlyCSE : public FunctionPass {
 public:
-  const DataLayout *TD;
+  const DataLayout *DL;
   const TargetLibraryInfo *TLI;
   DominatorTree *DT;
   typedef RecyclingAllocator<BumpPtrAllocator,
@@ -303,7 +303,7 @@ public:
     initializeEarlyCSEPass(*PassRegistry::getPassRegistry());
   }
 
-  bool runOnFunction(Function &F);
+  bool runOnFunction(Function &F) override;
 
 private:
 
@@ -376,8 +376,8 @@ private:
   bool processNode(DomTreeNode *Node);
 
   // This transformation requires dominator postdominator info
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-    AU.addRequired<DominatorTree>();
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DominatorTreeWrapperPass>();
     AU.addRequired<TargetLibraryInfo>();
     AU.setPreservesCFG();
   }
@@ -392,7 +392,7 @@ FunctionPass *llvm::createEarlyCSEPass() {
 }
 
 INITIALIZE_PASS_BEGIN(EarlyCSE, "early-cse", "Early CSE", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
 INITIALIZE_PASS_END(EarlyCSE, "early-cse", "Early CSE", false, false)
 
@@ -432,7 +432,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
 
     // If the instruction can be simplified (e.g. X+0 = X) then replace it with
     // its simpler value.
-    if (Value *V = SimplifyInstruction(Inst, TD, TLI, DT)) {
+    if (Value *V = SimplifyInstruction(Inst, DL, TLI, DT)) {
       DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << "  to: " << *V << '\n');
       Inst->replaceAllUsesWith(V);
       Inst->eraseFromParent();
@@ -552,11 +552,15 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
 
 
 bool EarlyCSE::runOnFunction(Function &F) {
-  std::deque<StackNode *> nodesToProcess;
+  if (skipOptnoneFunction(F))
+    return false;
 
-  TD = getAnalysisIfAvailable<DataLayout>();
+  std::vector<StackNode *> nodesToProcess;
+
+  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+  DL = DLP ? &DLP->getDataLayout() : 0;
   TLI = &getAnalysis<TargetLibraryInfo>();
-  DT = &getAnalysis<DominatorTree>();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
 
   // Tables that the pass uses when walking the domtree.
   ScopedHTType AVTable;
@@ -570,7 +574,7 @@ bool EarlyCSE::runOnFunction(Function &F) {
   bool Changed = false;
 
   // Process the root node.
-  nodesToProcess.push_front(
+  nodesToProcess.push_back(
       new StackNode(AvailableValues, AvailableLoads, AvailableCalls,
                     CurrentGeneration, DT->getRootNode(),
                     DT->getRootNode()->begin(),
@@ -583,7 +587,7 @@ bool EarlyCSE::runOnFunction(Function &F) {
   while (!nodesToProcess.empty()) {
     // Grab the first item off the stack. Set the current generation, remove
     // the node from the stack, and process it.
-    StackNode *NodeToProcess = nodesToProcess.front();
+    StackNode *NodeToProcess = nodesToProcess.back();
 
     // Initialize class members.
     CurrentGeneration = NodeToProcess->currentGeneration();
@@ -597,7 +601,7 @@ bool EarlyCSE::runOnFunction(Function &F) {
     } else if (NodeToProcess->childIter() != NodeToProcess->end()) {
       // Push the next child onto the stack.
       DomTreeNode *child = NodeToProcess->nextChild();
-      nodesToProcess.push_front(
+      nodesToProcess.push_back(
           new StackNode(AvailableValues,
                         AvailableLoads,
                         AvailableCalls,
@@ -607,7 +611,7 @@ bool EarlyCSE::runOnFunction(Function &F) {
       // It has been processed, and there are no more children to process,
       // so delete it and pop it off the stack.
       delete NodeToProcess;
-      nodesToProcess.pop_front();
+      nodesToProcess.pop_back();
     }
   } // while (!nodes...)
 
diff --git a/lib/Transforms/Scalar/FlattenCFGPass.cpp b/lib/Transforms/Scalar/FlattenCFGPass.cpp
index e7de07f..e7f2564 100644
--- a/lib/Transforms/Scalar/FlattenCFGPass.cpp
+++ b/lib/Transforms/Scalar/FlattenCFGPass.cpp
@@ -14,8 +14,8 @@
 #define DEBUG_TYPE "flattencfg"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/IR/CFG.h"
 #include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
 #include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
 
@@ -26,9 +26,9 @@ public:
   FlattenCFGPass() : FunctionPass(ID) {
     initializeFlattenCFGPassPass(*PassRegistry::getPassRegistry());
   }
-  bool runOnFunction(Function &F);
+  bool runOnFunction(Function &F) override;
 
-  void getAnalysisUsage(AnalysisUsage &AU) const {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AliasAnalysis>();
   }
 
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index 6af269d..33c387c 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -20,30 +20,29 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/MemoryDependenceAnalysis.h"
 #include "llvm/Analysis/PHITransAddr.h"
 #include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Assembly/Writer.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/PatternMatch.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -587,7 +586,7 @@ namespace {
     bool NoLoads;
     MemoryDependenceAnalysis *MD;
     DominatorTree *DT;
-    const DataLayout *TD;
+    const DataLayout *DL;
     const TargetLibraryInfo *TLI;
     SetVector<BasicBlock *> DeadBlocks;
 
@@ -616,7 +615,7 @@ namespace {
       initializeGVNPass(*PassRegistry::getPassRegistry());
     }
 
-    bool runOnFunction(Function &F);
+    bool runOnFunction(Function &F) override;
 
     /// markInstructionForDeletion - This removes the specified instruction from
     /// our various maps and marks it for deletion.
@@ -625,7 +624,7 @@ namespace {
       InstrsToErase.push_back(I);
     }
 
-    const DataLayout *getDataLayout() const { return TD; }
+    const DataLayout *getDataLayout() const { return DL; }
     DominatorTree &getDominatorTree() const { return *DT; }
     AliasAnalysis *getAliasAnalysis() const { return VN.getAliasAnalysis(); }
     MemoryDependenceAnalysis &getMemDep() const { return *MD; }
@@ -677,14 +676,14 @@ namespace {
     SmallVector<std::pair<TerminatorInst*, unsigned>, 4> toSplit;
 
     // This transformation requires dominator postdominator info
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.addRequired<DominatorTree>();
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<DominatorTreeWrapperPass>();
       AU.addRequired<TargetLibraryInfo>();
       if (!NoLoads)
         AU.addRequired<MemoryDependenceAnalysis>();
       AU.addRequired<AliasAnalysis>();
 
-      AU.addPreserved<DominatorTree>();
+      AU.addPreserved<DominatorTreeWrapperPass>();
       AU.addPreserved<AliasAnalysis>();
     }
 
@@ -727,7 +726,7 @@ FunctionPass *llvm::createGVNPass(bool NoLoads) {
 
 INITIALIZE_PASS_BEGIN(GVN, "gvn", "Global Value Numbering", false, false)
 INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
-INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_PASS_END(GVN, "gvn", "Global Value Numbering", false, false)
@@ -818,8 +817,7 @@ SpeculationFailure:
     // Mark as unavailable.
     EntryVal = 0;
 
-    for (succ_iterator I = succ_begin(Entry), E = succ_end(Entry); I != E; ++I)
-      BBWorklist.push_back(*I);
+    BBWorklist.append(succ_begin(Entry), succ_end(Entry));
   } while (!BBWorklist.empty());
 
   return false;
@@ -830,7 +828,7 @@ SpeculationFailure:
 /// CoerceAvailableValueToLoadType will succeed.
 static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal,
                                             Type *LoadTy,
-                                            const DataLayout &TD) {
+                                            const DataLayout &DL) {
   // If the loaded or stored value is an first class array or struct, don't try
   // to transform them.  We need to be able to bitcast to integer.
   if (LoadTy->isStructTy() || LoadTy->isArrayTy() ||
@@ -839,8 +837,8 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal,
     return false;
 
   // The store has to be at least as big as the load.
-  if (TD.getTypeSizeInBits(StoredVal->getType()) <
-        TD.getTypeSizeInBits(LoadTy))
+  if (DL.getTypeSizeInBits(StoredVal->getType()) <
+        DL.getTypeSizeInBits(LoadTy))
     return false;
 
   return true;
@@ -855,15 +853,15 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal,
 static Value *CoerceAvailableValueToLoadType(Value *StoredVal,
                                              Type *LoadedTy,
                                              Instruction *InsertPt,
-                                             const DataLayout &TD) {
-  if (!CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, TD))
+                                             const DataLayout &DL) {
+  if (!CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL))
     return 0;
 
   // If this is already the right type, just return it.
   Type *StoredValTy = StoredVal->getType();
 
-  uint64_t StoreSize = TD.getTypeSizeInBits(StoredValTy);
-  uint64_t LoadSize = TD.getTypeSizeInBits(LoadedTy);
+  uint64_t StoreSize = DL.getTypeSizeInBits(StoredValTy);
+  uint64_t LoadSize = DL.getTypeSizeInBits(LoadedTy);
 
   // If the store and reload are the same size, we can always reuse it.
   if (StoreSize == LoadSize) {
@@ -874,13 +872,13 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal,
 
     // Convert source pointers to integers, which can be bitcast.
     if (StoredValTy->getScalarType()->isPointerTy()) {
-      StoredValTy = TD.getIntPtrType(StoredValTy);
+      StoredValTy = DL.getIntPtrType(StoredValTy);
       StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt);
     }
 
     Type *TypeToCastTo = LoadedTy;
     if (TypeToCastTo->getScalarType()->isPointerTy())
-      TypeToCastTo = TD.getIntPtrType(TypeToCastTo);
+      TypeToCastTo = DL.getIntPtrType(TypeToCastTo);
 
     if (StoredValTy != TypeToCastTo)
       StoredVal = new BitCastInst(StoredVal, TypeToCastTo, "", InsertPt);
@@ -899,7 +897,7 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal,
 
   // Convert source pointers to integers, which can be manipulated.
   if (StoredValTy->getScalarType()->isPointerTy()) {
-    StoredValTy = TD.getIntPtrType(StoredValTy);
+    StoredValTy = DL.getIntPtrType(StoredValTy);
     StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt);
   }
 
@@ -911,7 +909,7 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal,
 
   // If this is a big-endian system, we need to shift the value down to the low
   // bits so that a truncate will work.
-  if (TD.isBigEndian()) {
+  if (DL.isBigEndian()) {
     Constant *Val = ConstantInt::get(StoredVal->getType(), StoreSize-LoadSize);
     StoredVal = BinaryOperator::CreateLShr(StoredVal, Val, "tmp", InsertPt);
   }
@@ -942,15 +940,15 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal,
 static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
                                           Value *WritePtr,
                                           uint64_t WriteSizeInBits,
-                                          const DataLayout &TD) {
+                                          const DataLayout &DL) {
   // If the loaded or stored value is a first class array or struct, don't try
   // to transform them.  We need to be able to bitcast to integer.
   if (LoadTy->isStructTy() || LoadTy->isArrayTy())
     return -1;
 
   int64_t StoreOffset = 0, LoadOffset = 0;
-  Value *StoreBase = GetPointerBaseWithConstantOffset(WritePtr,StoreOffset,&TD);
-  Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, &TD);
+  Value *StoreBase = GetPointerBaseWithConstantOffset(WritePtr,StoreOffset,&DL);
+  Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, &DL);
   if (StoreBase != LoadBase)
     return -1;
 
@@ -972,7 +970,7 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
   // If the load and store don't overlap at all, the store doesn't provide
   // anything to the load.  In this case, they really don't alias at all, AA
   // must have gotten confused.
-  uint64_t LoadSize = TD.getTypeSizeInBits(LoadTy);
+  uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy);
 
   if ((WriteSizeInBits & 7) | (LoadSize & 7))
     return -1;
@@ -1015,51 +1013,51 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
 /// memdep query of a load that ends up being a clobbering store.
 static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
                                           StoreInst *DepSI,
-                                          const DataLayout &TD) {
+                                          const DataLayout &DL) {
   // Cannot handle reading from store of first-class aggregate yet.
   if (DepSI->getValueOperand()->getType()->isStructTy() ||
       DepSI->getValueOperand()->getType()->isArrayTy())
     return -1;
 
   Value *StorePtr = DepSI->getPointerOperand();
-  uint64_t StoreSize =TD.getTypeSizeInBits(DepSI->getValueOperand()->getType());
+  uint64_t StoreSize =DL.getTypeSizeInBits(DepSI->getValueOperand()->getType());
   return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr,
-                                        StorePtr, StoreSize, TD);
+                                        StorePtr, StoreSize, DL);
 }
 
 /// AnalyzeLoadFromClobberingLoad - This function is called when we have a
 /// memdep query of a load that ends up being clobbered by another load.  See if
 /// the other load can feed into the second load.
 static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr,
-                                         LoadInst *DepLI, const DataLayout &TD){
+                                         LoadInst *DepLI, const DataLayout &DL){
   // Cannot handle reading from store of first-class aggregate yet.
   if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
     return -1;
 
   Value *DepPtr = DepLI->getPointerOperand();
-  uint64_t DepSize = TD.getTypeSizeInBits(DepLI->getType());
-  int R = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, TD);
+  uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType());
+  int R = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, DL);
   if (R != -1) return R;
 
   // If we have a load/load clobber an DepLI can be widened to cover this load,
   // then we should widen it!
   int64_t LoadOffs = 0;
   const Value *LoadBase =
-    GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, &TD);
-  unsigned LoadSize = TD.getTypeStoreSize(LoadTy);
+    GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, &DL);
+  unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
 
   unsigned Size = MemoryDependenceAnalysis::
-    getLoadLoadClobberFullWidthSize(LoadBase, LoadOffs, LoadSize, DepLI, TD);
+    getLoadLoadClobberFullWidthSize(LoadBase, LoadOffs, LoadSize, DepLI, DL);
   if (Size == 0) return -1;
 
-  return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size*8, TD);
+  return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size*8, DL);
 }
 
 
 
 static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
                                             MemIntrinsic *MI,
-                                            const DataLayout &TD) {
+                                            const DataLayout &DL) {
   // If the mem operation is a non-constant size, we can't handle it.
   ConstantInt *SizeCst = dyn_cast<ConstantInt>(MI->getLength());
   if (SizeCst == 0) return -1;
@@ -1069,7 +1067,7 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
   // of the memset..
   if (MI->getIntrinsicID() == Intrinsic::memset)
     return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
-                                          MemSizeInBits, TD);
+                                          MemSizeInBits, DL);
 
   // If we have a memcpy/memmove, the only case we can handle is if this is a
   // copy from constant memory.  In that case, we can read directly from the
@@ -1079,12 +1077,12 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
   Constant *Src = dyn_cast<Constant>(MTI->getSource());
   if (Src == 0) return -1;
 
-  GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, &TD));
+  GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, &DL));
   if (GV == 0 || !GV->isConstant()) return -1;
 
   // See if the access is within the bounds of the transfer.
   int Offset = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr,
-                                              MI->getDest(), MemSizeInBits, TD);
+                                              MI->getDest(), MemSizeInBits, DL);
   if (Offset == -1)
     return Offset;
 
@@ -1097,7 +1095,7 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
     ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
   Src = ConstantExpr::getGetElementPtr(Src, OffsetCst);
   Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
-  if (ConstantFoldLoadFromConstPtr(Src, &TD))
+  if (ConstantFoldLoadFromConstPtr(Src, &DL))
     return Offset;
   return -1;
 }
@@ -1110,11 +1108,11 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
 /// before we give up.
 static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset,
                                    Type *LoadTy,
-                                   Instruction *InsertPt, const DataLayout &TD){
+                                   Instruction *InsertPt, const DataLayout &DL){
   LLVMContext &Ctx = SrcVal->getType()->getContext();
 
-  uint64_t StoreSize = (TD.getTypeSizeInBits(SrcVal->getType()) + 7) / 8;
-  uint64_t LoadSize = (TD.getTypeSizeInBits(LoadTy) + 7) / 8;
+  uint64_t StoreSize = (DL.getTypeSizeInBits(SrcVal->getType()) + 7) / 8;
+  uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy) + 7) / 8;
 
   IRBuilder<> Builder(InsertPt->getParent(), InsertPt);
 
@@ -1122,13 +1120,13 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset,
   // to an integer type to start with.
   if (SrcVal->getType()->getScalarType()->isPointerTy())
     SrcVal = Builder.CreatePtrToInt(SrcVal,
-        TD.getIntPtrType(SrcVal->getType()));
+        DL.getIntPtrType(SrcVal->getType()));
   if (!SrcVal->getType()->isIntegerTy())
     SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8));
 
   // Shift the bits to the least significant depending on endianness.
   unsigned ShiftAmt;
-  if (TD.isLittleEndian())
+  if (DL.isLittleEndian())
     ShiftAmt = Offset*8;
   else
     ShiftAmt = (StoreSize-LoadSize-Offset)*8;
@@ -1139,7 +1137,7 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset,
   if (LoadSize != StoreSize)
     SrcVal = Builder.CreateTrunc(SrcVal, IntegerType::get(Ctx, LoadSize*8));
 
-  return CoerceAvailableValueToLoadType(SrcVal, LoadTy, InsertPt, TD);
+  return CoerceAvailableValueToLoadType(SrcVal, LoadTy, InsertPt, DL);
 }
 
 /// GetLoadValueForLoad - This function is called when we have a
@@ -1150,11 +1148,11 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset,
 static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
                                   Type *LoadTy, Instruction *InsertPt,
                                   GVN &gvn) {
-  const DataLayout &TD = *gvn.getDataLayout();
+  const DataLayout &DL = *gvn.getDataLayout();
   // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to
   // widen SrcVal out to a larger load.
-  unsigned SrcValSize = TD.getTypeStoreSize(SrcVal->getType());
-  unsigned LoadSize = TD.getTypeStoreSize(LoadTy);
+  unsigned SrcValSize = DL.getTypeStoreSize(SrcVal->getType());
+  unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
   if (Offset+LoadSize > SrcValSize) {
     assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!");
     assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load");
@@ -1186,7 +1184,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
     // Replace uses of the original load with the wider load.  On a big endian
     // system, we need to shift down to get the relevant bits.
     Value *RV = NewLoad;
-    if (TD.isBigEndian())
+    if (DL.isBigEndian())
       RV = Builder.CreateLShr(RV,
                     NewLoadSize*8-SrcVal->getType()->getPrimitiveSizeInBits());
     RV = Builder.CreateTrunc(RV, SrcVal->getType());
@@ -1201,7 +1199,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
     SrcVal = NewLoad;
   }
 
-  return GetStoreValueForLoad(SrcVal, Offset, LoadTy, InsertPt, TD);
+  return GetStoreValueForLoad(SrcVal, Offset, LoadTy, InsertPt, DL);
 }
 
 
@@ -1209,9 +1207,9 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
 /// memdep query of a load that ends up being a clobbering mem intrinsic.
 static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
                                      Type *LoadTy, Instruction *InsertPt,
-                                     const DataLayout &TD){
+                                     const DataLayout &DL){
   LLVMContext &Ctx = LoadTy->getContext();
-  uint64_t LoadSize = TD.getTypeSizeInBits(LoadTy)/8;
+  uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy)/8;
 
   IRBuilder<> Builder(InsertPt->getParent(), InsertPt);
 
@@ -1242,7 +1240,7 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
       ++NumBytesSet;
     }
 
-    return CoerceAvailableValueToLoadType(Val, LoadTy, InsertPt, TD);
+    return CoerceAvailableValueToLoadType(Val, LoadTy, InsertPt, DL);
   }
 
   // Otherwise, this is a memcpy/memmove from a constant global.
@@ -1258,7 +1256,7 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
     ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
   Src = ConstantExpr::getGetElementPtr(Src, OffsetCst);
   Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
-  return ConstantFoldLoadFromConstPtr(Src, &TD);
+  return ConstantFoldLoadFromConstPtr(Src, &DL);
 }
 
 
@@ -1324,10 +1322,10 @@ Value *AvailableValueInBlock::MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) c
   if (isSimpleValue()) {
     Res = getSimpleValue();
     if (Res->getType() != LoadTy) {
-      const DataLayout *TD = gvn.getDataLayout();
-      assert(TD && "Need target data to handle type mismatch case");
+      const DataLayout *DL = gvn.getDataLayout();
+      assert(DL && "Need target data to handle type mismatch case");
       Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(),
-                                 *TD);
+                                 *DL);
   
       DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << "  "
                    << *getSimpleValue() << '\n'
@@ -1346,10 +1344,10 @@ Value *AvailableValueInBlock::MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) c
                    << *Res << '\n' << "\n\n\n");
     }
   } else if (isMemIntrinValue()) {
-    const DataLayout *TD = gvn.getDataLayout();
-    assert(TD && "Need target data to handle type mismatch case");
+    const DataLayout *DL = gvn.getDataLayout();
+    assert(DL && "Need target data to handle type mismatch case");
     Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset,
-                                 LoadTy, BB->getTerminator(), *TD);
+                                 LoadTy, BB->getTerminator(), *DL);
     DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
                  << "  " << *getMemIntrinValue() << '\n'
                  << *Res << '\n' << "\n\n\n");
@@ -1402,9 +1400,9 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,
       // read by the load, we can extract the bits we need for the load from the
       // stored value.
       if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInfo.getInst())) {
-        if (TD && Address) {
+        if (DL && Address) {
           int Offset = AnalyzeLoadFromClobberingStore(LI->getType(), Address,
-                                                      DepSI, *TD);
+                                                      DepSI, *DL);
           if (Offset != -1) {
             ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB,
                                                        DepSI->getValueOperand(),
@@ -1421,10 +1419,10 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,
       if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInfo.getInst())) {
         // If this is a clobber and L is the first instruction in its block, then
         // we have the first instruction in the entry block.
-        if (DepLI != LI && Address && TD) {
+        if (DepLI != LI && Address && DL) {
           int Offset = AnalyzeLoadFromClobberingLoad(LI->getType(),
                                                      LI->getPointerOperand(),
-                                                     DepLI, *TD);
+                                                     DepLI, *DL);
 
           if (Offset != -1) {
             ValuesPerBlock.push_back(AvailableValueInBlock::getLoad(DepBB,DepLI,
@@ -1437,9 +1435,9 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,
       // If the clobbering value is a memset/memcpy/memmove, see if we can
       // forward a value on from it.
       if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInfo.getInst())) {
-        if (TD && Address) {
+        if (DL && Address) {
           int Offset = AnalyzeLoadFromClobberingMemInst(LI->getType(), Address,
-                                                        DepMI, *TD);
+                                                        DepMI, *DL);
           if (Offset != -1) {
             ValuesPerBlock.push_back(AvailableValueInBlock::getMI(DepBB, DepMI,
                                                                   Offset));
@@ -1471,8 +1469,8 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,
       if (S->getValueOperand()->getType() != LI->getType()) {
         // If the stored value is larger or equal to the loaded value, we can
         // reuse it.
-        if (TD == 0 || !CanCoerceMustAliasedValueToLoad(S->getValueOperand(),
-                                                        LI->getType(), *TD)) {
+        if (DL == 0 || !CanCoerceMustAliasedValueToLoad(S->getValueOperand(),
+                                                        LI->getType(), *DL)) {
           UnavailableBlocks.push_back(DepBB);
           continue;
         }
@@ -1488,7 +1486,7 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,
       if (LD->getType() != LI->getType()) {
         // If the stored value is larger or equal to the loaded value, we can
         // reuse it.
-        if (TD == 0 || !CanCoerceMustAliasedValueToLoad(LD, LI->getType(),*TD)){
+        if (DL == 0 || !CanCoerceMustAliasedValueToLoad(LD, LI->getType(),*DL)){
           UnavailableBlocks.push_back(DepBB);
           continue;
         }
@@ -1611,7 +1609,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
     // If all preds have a single successor, then we know it is safe to insert
     // the load on the pred (?!?), so we can insert code to materialize the
     // pointer if it is not available.
-    PHITransAddr Address(LI->getPointerOperand(), TD);
+    PHITransAddr Address(LI->getPointerOperand(), DL);
     Value *LoadPtr = 0;
     LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred,
                                                 *DT, NewInsts);
@@ -1712,7 +1710,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
       !Deps[0].getResult().isDef() && !Deps[0].getResult().isClobber()) {
     DEBUG(
       dbgs() << "GVN: non-local load ";
-      WriteAsOperand(dbgs(), LI);
+      LI->printAsOperand(dbgs());
       dbgs() << " has unknown dependencies\n";
     );
     return false;
@@ -1789,7 +1787,7 @@ static void patchReplacementInstruction(Instruction *I, Value *Repl) {
         ReplInst->setMetadata(Kind, MDNode::getMostGenericRange(IMD, ReplMD));
         break;
       case LLVMContext::MD_prof:
-        llvm_unreachable("MD_prof in a non terminator instruction");
+        llvm_unreachable("MD_prof in a non-terminator instruction");
         break;
       case LLVMContext::MD_fpmath:
         ReplInst->setMetadata(Kind, MDNode::getMostGenericFPMath(IMD, ReplMD));
@@ -1823,7 +1821,7 @@ bool GVN::processLoad(LoadInst *L) {
 
   // If we have a clobber and target data is around, see if this is a clobber
   // that we can fix up through code synthesis.
-  if (Dep.isClobber() && TD) {
+  if (Dep.isClobber() && DL) {
     // Check to see if we have something like this:
     //   store i32 123, i32* %P
     //   %A = bitcast i32* %P to i8*
@@ -1838,10 +1836,10 @@ bool GVN::processLoad(LoadInst *L) {
     if (StoreInst *DepSI = dyn_cast<StoreInst>(Dep.getInst())) {
       int Offset = AnalyzeLoadFromClobberingStore(L->getType(),
                                                   L->getPointerOperand(),
-                                                  DepSI, *TD);
+                                                  DepSI, *DL);
       if (Offset != -1)
         AvailVal = GetStoreValueForLoad(DepSI->getValueOperand(), Offset,
-                                        L->getType(), L, *TD);
+                                        L->getType(), L, *DL);
     }
 
     // Check to see if we have something like this:
@@ -1856,7 +1854,7 @@ bool GVN::processLoad(LoadInst *L) {
 
       int Offset = AnalyzeLoadFromClobberingLoad(L->getType(),
                                                  L->getPointerOperand(),
-                                                 DepLI, *TD);
+                                                 DepLI, *DL);
       if (Offset != -1)
         AvailVal = GetLoadValueForLoad(DepLI, Offset, L->getType(), L, *this);
     }
@@ -1866,9 +1864,9 @@ bool GVN::processLoad(LoadInst *L) {
     if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(Dep.getInst())) {
       int Offset = AnalyzeLoadFromClobberingMemInst(L->getType(),
                                                     L->getPointerOperand(),
-                                                    DepMI, *TD);
+                                                    DepMI, *DL);
       if (Offset != -1)
-        AvailVal = GetMemInstValueForLoad(DepMI, Offset, L->getType(), L, *TD);
+        AvailVal = GetMemInstValueForLoad(DepMI, Offset, L->getType(), L, *DL);
     }
 
     if (AvailVal) {
@@ -1890,7 +1888,7 @@ bool GVN::processLoad(LoadInst *L) {
     DEBUG(
       // fast print dep, using operator<< on instruction is too slow.
       dbgs() << "GVN: load ";
-      WriteAsOperand(dbgs(), L);
+      L->printAsOperand(dbgs());
       Instruction *I = Dep.getInst();
       dbgs() << " is clobbered by " << *I << '\n';
     );
@@ -1905,7 +1903,7 @@ bool GVN::processLoad(LoadInst *L) {
     DEBUG(
       // fast print dep, using operator<< on instruction is too slow.
       dbgs() << "GVN: load ";
-      WriteAsOperand(dbgs(), L);
+      L->printAsOperand(dbgs());
       dbgs() << " has unknown dependence\n";
     );
     return false;
@@ -1919,9 +1917,9 @@ bool GVN::processLoad(LoadInst *L) {
     // actually have the same type.  See if we know how to reuse the stored
     // value (depending on its type).
     if (StoredVal->getType() != L->getType()) {
-      if (TD) {
+      if (DL) {
         StoredVal = CoerceAvailableValueToLoadType(StoredVal, L->getType(),
-                                                   L, *TD);
+                                                   L, *DL);
         if (StoredVal == 0)
           return false;
 
@@ -1948,9 +1946,9 @@ bool GVN::processLoad(LoadInst *L) {
     // the same type.  See if we know how to reuse the previously loaded value
     // (depending on its type).
     if (DepLI->getType() != L->getType()) {
-      if (TD) {
+      if (DL) {
         AvailableVal = CoerceAvailableValueToLoadType(DepLI, L->getType(),
-                                                      L, *TD);
+                                                      L, *DL);
         if (AvailableVal == 0)
           return false;
 
@@ -2030,7 +2028,7 @@ unsigned GVN::replaceAllDominatedUsesWith(Value *From, Value *To,
   unsigned Count = 0;
   for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
        UI != UE; ) {
-    Use &U = (UI++).getUse();
+    Use &U = *UI++;
 
     if (DT->dominates(Root, U)) {
       U.set(To);
@@ -2202,7 +2200,7 @@ bool GVN::processInstruction(Instruction *I) {
   // to value numbering it.  Value numbering often exposes redundancies, for
   // example if it determines that %y is equal to %x then the instruction
   // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify.
-  if (Value *V = SimplifyInstruction(I, TD, TLI, DT)) {
+  if (Value *V = SimplifyInstruction(I, DL, TLI, DT)) {
     I->replaceAllUsesWith(V);
     if (MD && V->getType()->getScalarType()->isPointerTy())
       MD->invalidateCachedPointerInfo(V);
@@ -2314,10 +2312,14 @@ bool GVN::processInstruction(Instruction *I) {
 
 /// runOnFunction - This is the main transformation entry point for a function.
 bool GVN::runOnFunction(Function& F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
   if (!NoLoads)
     MD = &getAnalysis<MemoryDependenceAnalysis>();
-  DT = &getAnalysis<DominatorTree>();
-  TD = getAnalysisIfAvailable<DataLayout>();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+  DL = DLP ? &DLP->getDataLayout() : 0;
   TLI = &getAnalysis<TargetLibraryInfo>();
   VN.setAliasAnalysis(&getAnalysis<AliasAnalysis>());
   VN.setMemDep(MD);
diff --git a/lib/Transforms/Scalar/GlobalMerge.cpp b/lib/Transforms/Scalar/GlobalMerge.cpp
index 954e545..8ffd64b 100644
--- a/lib/Transforms/Scalar/GlobalMerge.cpp
+++ b/lib/Transforms/Scalar/GlobalMerge.cpp
@@ -70,6 +70,11 @@
 #include "llvm/Target/TargetLoweringObjectFile.h"
 using namespace llvm;
 
+cl::opt<bool>
+EnableGlobalMerge("global-merge", cl::Hidden,
+                  cl::desc("Enable global merge pass"),
+                  cl::init(true));
+
 static cl::opt<bool>
 EnableGlobalMergeOnConst("global-merge-on-const", cl::Hidden,
                          cl::desc("Enable global merge pass on constants"),
@@ -107,31 +112,18 @@ namespace {
       initializeGlobalMergePass(*PassRegistry::getPassRegistry());
     }
 
-    virtual bool doInitialization(Module &M);
-    virtual bool runOnFunction(Function &F);
-    virtual bool doFinalization(Module &M);
+    bool doInitialization(Module &M) override;
+    bool runOnFunction(Function &F) override;
+    bool doFinalization(Module &M) override;
 
-    const char *getPassName() const {
+    const char *getPassName() const override {
       return "Merge internal globals";
     }
 
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesCFG();
       FunctionPass::getAnalysisUsage(AU);
     }
-
-    struct GlobalCmp {
-      const DataLayout *TD;
-
-      GlobalCmp(const DataLayout *td) : TD(td) { }
-
-      bool operator()(const GlobalVariable *GV1, const GlobalVariable *GV2) {
-        Type *Ty1 = cast<PointerType>(GV1->getType())->getElementType();
-        Type *Ty2 = cast<PointerType>(GV2->getType())->getElementType();
-
-        return (TD->getTypeAllocSize(Ty1) < TD->getTypeAllocSize(Ty2));
-      }
-    };
   };
 } // end anonymous namespace
 
@@ -143,7 +135,7 @@ INITIALIZE_PASS(GlobalMerge, "global-merge",
 bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
                           Module &M, bool isConst, unsigned AddrSpace) const {
   const TargetLowering *TLI = TM->getTargetLowering();
-  const DataLayout *TD = TLI->getDataLayout();
+  const DataLayout *DL = TLI->getDataLayout();
 
   // FIXME: Infer the maximum possible offset depending on the actual users
   // (these max offsets are different for the users inside Thumb or ARM
@@ -151,7 +143,13 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
   unsigned MaxOffset = TLI->getMaximalGlobalOffset();
 
   // FIXME: Find better heuristics
-  std::stable_sort(Globals.begin(), Globals.end(), GlobalCmp(TD));
+  std::stable_sort(Globals.begin(), Globals.end(),
+                   [DL](const GlobalVariable *GV1, const GlobalVariable *GV2) {
+    Type *Ty1 = cast<PointerType>(GV1->getType())->getElementType();
+    Type *Ty2 = cast<PointerType>(GV2->getType())->getElementType();
+
+    return (DL->getTypeAllocSize(Ty1) < DL->getTypeAllocSize(Ty2));
+  });
 
   Type *Int32Ty = Type::getInt32Ty(M.getContext());
 
@@ -162,7 +160,7 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
     std::vector<Constant*> Inits;
     for (j = i; j != e; ++j) {
       Type *Ty = Globals[j]->getType()->getElementType();
-      MergedSize += TD->getTypeAllocSize(Ty);
+      MergedSize += DL->getTypeAllocSize(Ty);
       if (MergedSize > MaxOffset) {
         break;
       }
@@ -214,7 +212,7 @@ void GlobalMerge::setMustKeepGlobalVariables(Module &M) {
        ++IFn) {
     for (Function::iterator IBB = IFn->begin(), IEndBB = IFn->end();
          IBB != IEndBB; ++IBB) {
-      // Follow the inwoke link to find the landing pad instruction
+      // Follow the invoke link to find the landing pad instruction
       const InvokeInst *II = dyn_cast<InvokeInst>(IBB->getTerminator());
       if (!II) continue;
 
@@ -231,10 +229,13 @@ void GlobalMerge::setMustKeepGlobalVariables(Module &M) {
 }
 
 bool GlobalMerge::doInitialization(Module &M) {
+  if (!EnableGlobalMerge)
+    return false;
+
   DenseMap<unsigned, SmallVector<GlobalVariable*, 16> > Globals, ConstGlobals,
                                                         BSSGlobals;
   const TargetLowering *TLI = TM->getTargetLowering();
-  const DataLayout *TD = TLI->getDataLayout();
+  const DataLayout *DL = TLI->getDataLayout();
   unsigned MaxOffset = TLI->getMaximalGlobalOffset();
   bool Changed = false;
   setMustKeepGlobalVariables(M);
@@ -252,9 +253,9 @@ bool GlobalMerge::doInitialization(Module &M) {
     unsigned AddressSpace = PT->getAddressSpace();
 
     // Ignore fancy-aligned globals for now.
-    unsigned Alignment = TD->getPreferredAlignment(I);
+    unsigned Alignment = DL->getPreferredAlignment(I);
     Type *Ty = I->getType()->getElementType();
-    if (Alignment > TD->getABITypeAlignment(Ty))
+    if (Alignment > DL->getABITypeAlignment(Ty))
       continue;
 
     // Ignore all 'special' globals.
@@ -266,7 +267,7 @@ bool GlobalMerge::doInitialization(Module &M) {
     if (isMustKeepGlobalVariable(I))
       continue;
 
-    if (TD->getTypeAllocSize(Ty) < MaxOffset) {
+    if (DL->getTypeAllocSize(Ty) < MaxOffset) {
       if (TargetLoweringObjectFile::getKindForGlobal(I, TLI->getTargetMachine())
           .isBSSLocal())
         BSSGlobals[AddressSpace].push_back(I);
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index 235aaaa..7537632 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -29,18 +29,18 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Type.h"
-#include "llvm/Support/CFG.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
@@ -63,12 +63,15 @@ static cl::opt<bool> VerifyIndvars(
   "verify-indvars", cl::Hidden,
   cl::desc("Verify the ScalarEvolution result after running indvars"));
 
+static cl::opt<bool> ReduceLiveIVs("liv-reduce", cl::Hidden,
+  cl::desc("Reduce live induction variables."));
+
 namespace {
   class IndVarSimplify : public LoopPass {
     LoopInfo        *LI;
     ScalarEvolution *SE;
     DominatorTree   *DT;
-    DataLayout      *TD;
+    const DataLayout *DL;
     TargetLibraryInfo *TLI;
 
     SmallVector<WeakVH, 16> DeadInsts;
@@ -76,15 +79,15 @@ namespace {
   public:
 
     static char ID; // Pass identification, replacement for typeid
-    IndVarSimplify() : LoopPass(ID), LI(0), SE(0), DT(0), TD(0),
+    IndVarSimplify() : LoopPass(ID), LI(0), SE(0), DT(0), DL(0),
                        Changed(false) {
       initializeIndVarSimplifyPass(*PassRegistry::getPassRegistry());
     }
 
-    virtual bool runOnLoop(Loop *L, LPPassManager &LPM);
+    bool runOnLoop(Loop *L, LPPassManager &LPM) override;
 
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.addRequired<DominatorTree>();
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<DominatorTreeWrapperPass>();
       AU.addRequired<LoopInfo>();
       AU.addRequired<ScalarEvolution>();
       AU.addRequiredID(LoopSimplifyID);
@@ -96,7 +99,7 @@ namespace {
     }
 
   private:
-    virtual void releaseMemory() {
+    void releaseMemory() override {
       DeadInsts.clear();
     }
 
@@ -119,7 +122,7 @@ namespace {
 char IndVarSimplify::ID = 0;
 INITIALIZE_PASS_BEGIN(IndVarSimplify, "indvars",
                 "Induction Variable Simplification", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopInfo)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
@@ -266,11 +269,11 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) {
 
   // Check Incr uses. One user is PN and the other user is an exit condition
   // used by the conditional terminator.
-  Value::use_iterator IncrUse = Incr->use_begin();
+  Value::user_iterator IncrUse = Incr->user_begin();
   Instruction *U1 = cast<Instruction>(*IncrUse++);
-  if (IncrUse == Incr->use_end()) return;
+  if (IncrUse == Incr->user_end()) return;
   Instruction *U2 = cast<Instruction>(*IncrUse++);
-  if (IncrUse != Incr->use_end()) return;
+  if (IncrUse != Incr->user_end()) return;
 
   // Find exit condition, which is an fcmp.  If it doesn't exist, or if it isn't
   // only used by a branch, we can't transform it.
@@ -278,10 +281,10 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) {
   if (!Compare)
     Compare = dyn_cast<FCmpInst>(U2);
   if (Compare == 0 || !Compare->hasOneUse() ||
-      !isa<BranchInst>(Compare->use_back()))
+      !isa<BranchInst>(Compare->user_back()))
     return;
 
-  BranchInst *TheBr = cast<BranchInst>(Compare->use_back());
+  BranchInst *TheBr = cast<BranchInst>(Compare->user_back());
 
   // We need to verify that the branch actually controls the iteration count
   // of the loop.  If not, the new IV can overflow and no one will notice.
@@ -494,6 +497,21 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
 
     unsigned NumPreds = PN->getNumIncomingValues();
 
+    // We would like to be able to RAUW single-incoming value PHI nodes. We
+    // have to be certain this is safe even when this is an LCSSA PHI node.
+    // While the computed exit value is no longer varying in *this* loop, the
+    // exit block may be an exit block for an outer containing loop as well,
+    // the exit value may be varying in the outer loop, and thus it may still
+    // require an LCSSA PHI node. The safe case is when this is
+    // single-predecessor PHI node (LCSSA) and the exit block containing it is
+    // part of the enclosing loop, or this is the outer most loop of the nest.
+    // In either case the exit value could (at most) be varying in the same
+    // loop body as the phi node itself. Thus if it is in turn used outside of
+    // an enclosing loop it will only be via a separate LCSSA node.
+    bool LCSSASafePhiForRAUW =
+        NumPreds == 1 &&
+        (!L->getParentLoop() || L->getParentLoop() == LI->getLoopFor(ExitBB));
+
     // Iterate over all of the PHI nodes.
     BasicBlock::iterator BBI = ExitBB->begin();
     while ((PN = dyn_cast<PHINode>(BBI++))) {
@@ -545,8 +563,8 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
           unsigned NumHardInternalUses = 0;
           unsigned NumSoftExternalUses = 0;
           unsigned NumUses = 0;
-          for (Value::use_iterator IB=Inst->use_begin(), IE=Inst->use_end();
-               IB!=IE && NumUses<=6 ; ++IB) {
+          for (auto IB = Inst->user_begin(), IE = Inst->user_end();
+               IB != IE && NumUses <= 6; ++IB) {
             Instruction *UseInstr = cast<Instruction>(*IB);
             unsigned Opc = UseInstr->getOpcode();
             NumUses++;
@@ -558,9 +576,9 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
                 // Do not count the Phi as a use. LCSSA may have inserted
                 // plenty of trivial ones.
                 NumUses--;
-                for (Value::use_iterator PB=UseInstr->use_begin(),
-                                         PE=UseInstr->use_end();
-                     PB!=PE && NumUses<=6 ; ++PB, ++NumUses) {
+                for (auto PB = UseInstr->user_begin(),
+                          PE = UseInstr->user_end();
+                     PB != PE && NumUses <= 6; ++PB, ++NumUses) {
                   unsigned PhiOpc = cast<Instruction>(*PB)->getOpcode();
                   if (PhiOpc != Instruction::Call && PhiOpc != Instruction::Ret)
                     NumSoftExternalUses++;
@@ -594,17 +612,18 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
         if (isInstructionTriviallyDead(Inst, TLI))
           DeadInsts.push_back(Inst);
 
-        if (NumPreds == 1) {
-          // Completely replace a single-pred PHI. This is safe, because the
-          // NewVal won't be variant in the loop, so we don't need an LCSSA phi
-          // node anymore.
+        // If we determined that this PHI is safe to replace even if an LCSSA
+        // PHI, do so.
+        if (LCSSASafePhiForRAUW) {
           PN->replaceAllUsesWith(ExitVal);
           PN->eraseFromParent();
         }
       }
-      if (NumPreds != 1) {
-        // Clone the PHI and delete the original one. This lets IVUsers and
-        // any other maps purge the original user from their records.
+
+      // If we were unable to completely replace the PHI node, clone the PHI
+      // and delete the original one. This lets IVUsers and any other maps
+      // purge the original user from their records.
+      if (!LCSSASafePhiForRAUW) {
         PHINode *NewPN = cast<PHINode>(PN->clone());
         NewPN->takeName(PN);
         NewPN->insertBefore(PN);
@@ -634,34 +653,20 @@ namespace {
 
     WideIVInfo() : NarrowIV(0), WidestNativeType(0), IsSigned(false) {}
   };
-
-  class WideIVVisitor : public IVVisitor {
-    ScalarEvolution *SE;
-    const DataLayout *TD;
-
-  public:
-    WideIVInfo WI;
-
-    WideIVVisitor(PHINode *NarrowIV, ScalarEvolution *SCEV,
-                  const DataLayout *TData) :
-      SE(SCEV), TD(TData) { WI.NarrowIV = NarrowIV; }
-
-    // Implement the interface used by simplifyUsersOfIV.
-    virtual void visitCast(CastInst *Cast);
-  };
 }
 
 /// visitCast - Update information about the induction variable that is
 /// extended by this sign or zero extend operation. This is used to determine
 /// the final width of the IV before actually widening it.
-void WideIVVisitor::visitCast(CastInst *Cast) {
+static void visitIVCast(CastInst *Cast, WideIVInfo &WI, ScalarEvolution *SE,
+                        const DataLayout *DL) {
   bool IsSigned = Cast->getOpcode() == Instruction::SExt;
   if (!IsSigned && Cast->getOpcode() != Instruction::ZExt)
     return;
 
   Type *Ty = Cast->getType();
   uint64_t Width = SE->getTypeSizeInBits(Ty);
-  if (TD && !TD->isLegalInteger(Width))
+  if (DL && !DL->isLegalInteger(Width))
     return;
 
   if (!WI.WidestNativeType) {
@@ -891,15 +896,43 @@ const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) {
   return AddRec;
 }
 
+/// This IV user cannot be widen. Replace this use of the original narrow IV
+/// with a truncation of the new wide IV to isolate and eliminate the narrow IV.
+static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT) {
+  DEBUG(dbgs() << "INDVARS: Truncate IV " << *DU.WideDef
+        << " for user " << *DU.NarrowUse << "\n");
+  IRBuilder<> Builder(getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT));
+  Value *Trunc = Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType());
+  DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, Trunc);
+}
+
 /// WidenIVUse - Determine whether an individual user of the narrow IV can be
 /// widened. If so, return the wide clone of the user.
 Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
 
   // Stop traversing the def-use chain at inner-loop phis or post-loop phis.
-  if (isa<PHINode>(DU.NarrowUse) &&
-      LI->getLoopFor(DU.NarrowUse->getParent()) != L)
-    return 0;
-
+  if (PHINode *UsePhi = dyn_cast<PHINode>(DU.NarrowUse)) {
+    if (LI->getLoopFor(UsePhi->getParent()) != L) {
+      // For LCSSA phis, sink the truncate outside the loop.
+      // After SimplifyCFG most loop exit targets have a single predecessor.
+      // Otherwise fall back to a truncate within the loop.
+      if (UsePhi->getNumOperands() != 1)
+        truncateIVUse(DU, DT);
+      else {
+        PHINode *WidePhi =
+          PHINode::Create(DU.WideDef->getType(), 1, UsePhi->getName() + ".wide",
+                          UsePhi);
+        WidePhi->addIncoming(DU.WideDef, UsePhi->getIncomingBlock(0));
+        IRBuilder<> Builder(WidePhi->getParent()->getFirstInsertionPt());
+        Value *Trunc = Builder.CreateTrunc(WidePhi, DU.NarrowDef->getType());
+        UsePhi->replaceAllUsesWith(Trunc);
+        DeadInsts.push_back(UsePhi);
+        DEBUG(dbgs() << "INDVARS: Widen lcssa phi " << *UsePhi
+              << " to " << *WidePhi << "\n");
+      }
+      return 0;
+    }
+  }
   // Our raison d'etre! Eliminate sign and zero extension.
   if (IsSigned ? isa<SExtInst>(DU.NarrowUse) : isa<ZExtInst>(DU.NarrowUse)) {
     Value *NewDef = DU.WideDef;
@@ -947,9 +980,7 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
     // This user does not evaluate to a recurence after widening, so don't
     // follow it. Instead insert a Trunc to kill off the original use,
     // eventually isolating the original narrow IV so it can be removed.
-    IRBuilder<> Builder(getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT));
-    Value *Trunc = Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType());
-    DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, Trunc);
+    truncateIVUse(DU, DT);
     return 0;
   }
   // Assume block terminators cannot evaluate to a recurrence. We can't to
@@ -987,15 +1018,14 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
 /// pushNarrowIVUsers - Add eligible users of NarrowDef to NarrowIVUsers.
 ///
 void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) {
-  for (Value::use_iterator UI = NarrowDef->use_begin(),
-         UE = NarrowDef->use_end(); UI != UE; ++UI) {
-    Instruction *NarrowUse = cast<Instruction>(*UI);
+  for (User *U : NarrowDef->users()) {
+    Instruction *NarrowUser = cast<Instruction>(U);
 
     // Handle data flow merges and bizarre phi cycles.
-    if (!Widened.insert(NarrowUse))
+    if (!Widened.insert(NarrowUser))
       continue;
 
-    NarrowIVUsers.push_back(NarrowIVDefUse(NarrowDef, NarrowUse, WideDef));
+    NarrowIVUsers.push_back(NarrowIVDefUse(NarrowDef, NarrowUser, WideDef));
   }
 }
 
@@ -1080,9 +1110,36 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) {
 }
 
 //===----------------------------------------------------------------------===//
+//  Live IV Reduction - Minimize IVs live across the loop.
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
 //  Simplification of IV users based on SCEV evaluation.
 //===----------------------------------------------------------------------===//
 
+namespace {
+  class IndVarSimplifyVisitor : public IVVisitor {
+    ScalarEvolution *SE;
+    const DataLayout *DL;
+    PHINode *IVPhi;
+
+  public:
+    WideIVInfo WI;
+
+    IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV,
+                          const DataLayout *DL, const DominatorTree *DTree):
+      SE(SCEV), DL(DL), IVPhi(IV) {
+      DT = DTree;
+      WI.NarrowIV = IVPhi;
+      if (ReduceLiveIVs)
+        setSplitOverflowIntrinsics();
+    }
+
+    // Implement the interface used by simplifyUsersOfIV.
+    void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, DL); }
+  };
+}
 
 /// SimplifyAndExtend - Iteratively perform simplification on a worklist of IV
 /// users. Each successive simplification may push more users which may
@@ -1114,12 +1171,12 @@ void IndVarSimplify::SimplifyAndExtend(Loop *L,
       PHINode *CurrIV = LoopPhis.pop_back_val();
 
       // Information about sign/zero extensions of CurrIV.
-      WideIVVisitor WIV(CurrIV, SE, TD);
+      IndVarSimplifyVisitor Visitor(CurrIV, SE, DL, DT);
 
-      Changed |= simplifyUsersOfIV(CurrIV, SE, &LPM, DeadInsts, &WIV);
+      Changed |= simplifyUsersOfIV(CurrIV, SE, &LPM, DeadInsts, &Visitor);
 
-      if (WIV.WI.WidestNativeType) {
-        WideIVs.push_back(WIV.WI);
+      if (Visitor.WI.WidestNativeType) {
+        WideIVs.push_back(Visitor.WI);
       }
     } while(!LoopPhis.empty());
 
@@ -1359,15 +1416,11 @@ static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) {
   int LatchIdx = Phi->getBasicBlockIndex(LatchBlock);
   Value *IncV = Phi->getIncomingValue(LatchIdx);
 
-  for (Value::use_iterator UI = Phi->use_begin(), UE = Phi->use_end();
-       UI != UE; ++UI) {
-    if (*UI != Cond && *UI != IncV) return false;
-  }
+  for (User *U : Phi->users())
+    if (U != Cond && U != IncV) return false;
 
-  for (Value::use_iterator UI = IncV->use_begin(), UE = IncV->use_end();
-       UI != UE; ++UI) {
-    if (*UI != Cond && *UI != Phi) return false;
-  }
+  for (User *U : IncV->users())
+    if (U != Cond && U != Phi) return false;
   return true;
 }
 
@@ -1386,7 +1439,7 @@ static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) {
 /// could at least handle constant BECounts.
 static PHINode *
 FindLoopCounter(Loop *L, const SCEV *BECount,
-                ScalarEvolution *SE, DominatorTree *DT, const DataLayout *TD) {
+                ScalarEvolution *SE, DominatorTree *DT, const DataLayout *DL) {
   uint64_t BCWidth = SE->getTypeSizeInBits(BECount->getType());
 
   Value *Cond =
@@ -1415,7 +1468,7 @@ FindLoopCounter(Loop *L, const SCEV *BECount,
     // AR may be wider than BECount. With eq/ne tests overflow is immaterial.
     // AR may not be a narrower type, or we may never exit.
     uint64_t PhiWidth = SE->getTypeSizeInBits(AR->getType());
-    if (PhiWidth < BCWidth || (TD && !TD->isLegalInteger(PhiWidth)))
+    if (PhiWidth < BCWidth || (DL && !DL->isLegalInteger(PhiWidth)))
       continue;
 
     const SCEV *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
@@ -1697,13 +1750,12 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) {
     // Determine if there is a use in or before the loop (direct or
     // otherwise).
     bool UsedInLoop = false;
-    for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
-         UI != UE; ++UI) {
-      User *U = *UI;
-      BasicBlock *UseBB = cast<Instruction>(U)->getParent();
-      if (PHINode *P = dyn_cast<PHINode>(U)) {
+    for (Use &U : I->uses()) {
+      Instruction *User = cast<Instruction>(U.getUser());
+      BasicBlock *UseBB = User->getParent();
+      if (PHINode *P = dyn_cast<PHINode>(User)) {
         unsigned i =
-          PHINode::getIncomingValueNumForOperand(UI.getOperandNo());
+          PHINode::getIncomingValueNumForOperand(U.getOperandNo());
         UseBB = P->getIncomingBlock(i);
       }
       if (UseBB == Preheader || L->contains(UseBB)) {
@@ -1743,6 +1795,9 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) {
 //===----------------------------------------------------------------------===//
 
 bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
+  if (skipOptnoneFunction(L))
+    return false;
+
   // If LoopSimplify form is not available, stay out of trouble. Some notes:
   //  - LSR currently only supports LoopSimplify-form loops. Indvars'
   //    canonicalization can be a pessimization without LSR to "clean up"
@@ -1756,8 +1811,9 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
 
   LI = &getAnalysis<LoopInfo>();
   SE = &getAnalysis<ScalarEvolution>();
-  DT = &getAnalysis<DominatorTree>();
-  TD = getAnalysisIfAvailable<DataLayout>();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+  DL = DLP ? &DLP->getDataLayout() : 0;
   TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
 
   DeadInsts.clear();
@@ -1799,13 +1855,13 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
   // If we have a trip count expression, rewrite the loop's exit condition
   // using it.  We can currently only handle loops with a single exit.
   if (canExpandBackedgeTakenCount(L, SE) && needsLFTR(L, DT)) {
-    PHINode *IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT, TD);
+    PHINode *IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT, DL);
     if (IndVar) {
       // Check preconditions for proper SCEVExpander operation. SCEV does not
       // express SCEVExpander's dependencies, such as LoopSimplify. Instead any
       // pass that uses the SCEVExpander must do it. This does not work well for
-      // loop passes because SCEVExpander makes assumptions about all loops, while
-      // LoopPassManager only forces the current loop to be simplified.
+      // loop passes because SCEVExpander makes assumptions about all loops,
+      // while LoopPassManager only forces the current loop to be simplified.
       //
       // FIXME: SCEV expansion has no way to bail out, so the caller must
       // explicitly check any assumptions made by SCEV. Brittle.
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index b3ec2fc..067deb7 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -27,10 +27,10 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/ValueHandle.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/ValueHandle.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -76,7 +76,7 @@ namespace {
   /// revectored to the false side of the second if.
   ///
   class JumpThreading : public FunctionPass {
-    DataLayout *TD;
+    const DataLayout *DL;
     TargetLibraryInfo *TLI;
     LazyValueInfo *LVI;
 #ifdef NDEBUG
@@ -105,9 +105,9 @@ namespace {
       initializeJumpThreadingPass(*PassRegistry::getPassRegistry());
     }
 
-    bool runOnFunction(Function &F);
+    bool runOnFunction(Function &F) override;
 
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<LazyValueInfo>();
       AU.addPreserved<LazyValueInfo>();
       AU.addRequired<TargetLibraryInfo>();
@@ -148,8 +148,12 @@ FunctionPass *llvm::createJumpThreadingPass() { return new JumpThreading(); }
 /// runOnFunction - Top level algorithm.
 ///
 bool JumpThreading::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
   DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
-  TD = getAnalysisIfAvailable<DataLayout>();
+  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+  DL = DLP ? &DLP->getDataLayout() : 0;
   TLI = &getAnalysis<TargetLibraryInfo>();
   LVI = &getAnalysis<LazyValueInfo>();
 
@@ -251,7 +255,7 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
     // as having cost of 2 total, and if they are a vector intrinsic, we model
     // them as having cost 1.
     if (const CallInst *CI = dyn_cast<CallInst>(I)) {
-      if (CI->hasFnAttr(Attribute::NoDuplicate))
+      if (CI->cannotDuplicate())
         // Blocks with NoDuplicate are modelled as having infinite cost, so they
         // are never duplicated.
         return ~0U;
@@ -490,7 +494,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result,
         Value *LHS = PN->getIncomingValue(i);
         Value *RHS = Cmp->getOperand(1)->DoPHITranslation(BB, PredBB);
 
-        Value *Res = SimplifyCmpInst(Cmp->getPredicate(), LHS, RHS, TD);
+        Value *Res = SimplifyCmpInst(Cmp->getPredicate(), LHS, RHS, DL);
         if (Res == 0) {
           if (!isa<Constant>(RHS))
             continue;
@@ -692,7 +696,7 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
   // Run constant folding to see if we can reduce the condition to a simple
   // constant.
   if (Instruction *I = dyn_cast<Instruction>(Condition)) {
-    Value *SimpleVal = ConstantFoldInstruction(I, TD, TLI);
+    Value *SimpleVal = ConstantFoldInstruction(I, DL, TLI);
     if (SimpleVal) {
       I->replaceAllUsesWith(SimpleVal);
       I->eraseFromParent();
@@ -1431,16 +1435,15 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB,
   for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) {
     // Scan all uses of this instruction to see if it is used outside of its
     // block, and if so, record them in UsesToRename.
-    for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI != E;
-         ++UI) {
-      Instruction *User = cast<Instruction>(*UI);
+    for (Use &U : I->uses()) {
+      Instruction *User = cast<Instruction>(U.getUser());
       if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
-        if (UserPN->getIncomingBlock(UI) == BB)
+        if (UserPN->getIncomingBlock(U) == BB)
           continue;
       } else if (User->getParent() == BB)
         continue;
 
-      UsesToRename.push_back(&UI.getUse());
+      UsesToRename.push_back(&U);
     }
 
     // If there are no uses outside the block, we're done with this instruction.
@@ -1475,7 +1478,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB,
   // At this point, the IR is fully up to date and consistent.  Do a quick scan
   // over the new instructions and zap any that are constants or dead.  This
   // frequently happens because of phi translation.
-  SimplifyInstructionsInBlock(NewBB, TD, TLI);
+  SimplifyInstructionsInBlock(NewBB, DL, TLI);
 
   // Threaded an edge!
   ++NumThreads;
@@ -1557,7 +1560,7 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
     // If this instruction can be simplified after the operands are updated,
     // just use the simplified value instead.  This frequently happens due to
     // phi translation.
-    if (Value *IV = SimplifyInstruction(New, TD)) {
+    if (Value *IV = SimplifyInstruction(New, DL)) {
       delete New;
       ValueMapping[BI] = IV;
     } else {
@@ -1585,16 +1588,15 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
   for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) {
     // Scan all uses of this instruction to see if it is used outside of its
     // block, and if so, record them in UsesToRename.
-    for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI != E;
-         ++UI) {
-      Instruction *User = cast<Instruction>(*UI);
+    for (Use &U : I->uses()) {
+      Instruction *User = cast<Instruction>(U.getUser());
       if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
-        if (UserPN->getIncomingBlock(UI) == BB)
+        if (UserPN->getIncomingBlock(U) == BB)
           continue;
       } else if (User->getParent() == BB)
         continue;
 
-      UsesToRename.push_back(&UI.getUse());
+      UsesToRename.push_back(&U);
     }
 
     // If there are no uses outside the block, we're done with this instruction.
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index f94cd2a..b69f2dc 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -36,23 +36,26 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AliasSetTracker.h"
 #include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
-#include "llvm/Support/CFG.h"
+#include "llvm/IR/PredIteratorCache.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
 #include <algorithm>
 using namespace llvm;
@@ -74,26 +77,28 @@ namespace {
       initializeLICMPass(*PassRegistry::getPassRegistry());
     }
 
-    virtual bool runOnLoop(Loop *L, LPPassManager &LPM);
+    bool runOnLoop(Loop *L, LPPassManager &LPM) override;
 
     /// This transformation requires natural loop information & requires that
     /// loop preheaders be inserted into the CFG...
     ///
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesCFG();
-      AU.addRequired<DominatorTree>();
+      AU.addRequired<DominatorTreeWrapperPass>();
       AU.addRequired<LoopInfo>();
       AU.addRequiredID(LoopSimplifyID);
+      AU.addPreservedID(LoopSimplifyID);
+      AU.addRequiredID(LCSSAID);
+      AU.addPreservedID(LCSSAID);
       AU.addRequired<AliasAnalysis>();
       AU.addPreserved<AliasAnalysis>();
-      AU.addPreserved("scalar-evolution");
-      AU.addPreservedID(LoopSimplifyID);
+      AU.addPreserved<ScalarEvolution>();
       AU.addRequired<TargetLibraryInfo>();
     }
 
     using llvm::Pass::doFinalization;
 
-    bool doFinalization() {
+    bool doFinalization() override {
       assert(LoopToAliasSetMap.empty() && "Didn't free loop alias sets");
       return false;
     }
@@ -103,7 +108,7 @@ namespace {
     LoopInfo      *LI;       // Current LoopInfo
     DominatorTree *DT;       // Dominator Tree for the current Loop.
 
-    DataLayout *TD;          // DataLayout for constant folding.
+    const DataLayout *DL;    // DataLayout for constant folding.
     TargetLibraryInfo *TLI;  // TargetLibraryInfo for constant folding.
 
     // State that is updated as we process loops.
@@ -117,11 +122,12 @@ namespace {
     DenseMap<Loop*, AliasSetTracker*> LoopToAliasSetMap;
 
     /// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info.
-    void cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L);
+    void cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To,
+                                 Loop *L) override;
 
     /// deleteAnalysisValue - Simple Analysis hook. Delete value V from alias
     /// set.
-    void deleteAnalysisValue(Value *V, Loop *L);
+    void deleteAnalysisValue(Value *V, Loop *L) override;
 
     /// SinkRegion - Walk the specified region of the CFG (defined by all blocks
     /// dominated by the specified block, and that are in the current loop) in
@@ -183,15 +189,18 @@ namespace {
 
     void PromoteAliasSet(AliasSet &AS,
                          SmallVectorImpl<BasicBlock*> &ExitBlocks,
-                         SmallVectorImpl<Instruction*> &InsertPts);
+                         SmallVectorImpl<Instruction*> &InsertPts,
+                         PredIteratorCache &PIC);
   };
 }
 
 char LICM::ID = 0;
 INITIALIZE_PASS_BEGIN(LICM, "licm", "Loop Invariant Code Motion", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopInfo)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_PASS_END(LICM, "licm", "Loop Invariant Code Motion", false, false)
@@ -203,16 +212,22 @@ Pass *llvm::createLICMPass() { return new LICM(); }
 /// times on one loop.
 ///
 bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
+  if (skipOptnoneFunction(L))
+    return false;
+
   Changed = false;
 
   // Get our Loop and Alias Analysis information...
   LI = &getAnalysis<LoopInfo>();
   AA = &getAnalysis<AliasAnalysis>();
-  DT = &getAnalysis<DominatorTree>();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
 
-  TD = getAnalysisIfAvailable<DataLayout>();
+  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+  DL = DLP ? &DLP->getDataLayout() : 0;
   TLI = &getAnalysis<TargetLibraryInfo>();
 
+  assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form.");
+
   CurAST = new AliasSetTracker(*AA);
   // Collect Alias info from subloops.
   for (Loop::iterator LoopItr = L->begin(), LoopItrE = L->end();
@@ -272,16 +287,33 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
 
   // Now that all loop invariants have been removed from the loop, promote any
   // memory references to scalars that we can.
-  if (!DisablePromotion && Preheader && L->hasDedicatedExits()) {
+  if (!DisablePromotion && (Preheader || L->hasDedicatedExits())) {
     SmallVector<BasicBlock *, 8> ExitBlocks;
     SmallVector<Instruction *, 8> InsertPts;
+    PredIteratorCache PIC;
 
     // Loop over all of the alias sets in the tracker object.
     for (AliasSetTracker::iterator I = CurAST->begin(), E = CurAST->end();
          I != E; ++I)
-      PromoteAliasSet(*I, ExitBlocks, InsertPts);
+      PromoteAliasSet(*I, ExitBlocks, InsertPts, PIC);
+
+    // Once we have promoted values across the loop body we have to recursively
+    // reform LCSSA as any nested loop may now have values defined within the
+    // loop used in the outer loop.
+    // FIXME: This is really heavy handed. It would be a bit better to use an
+    // SSAUpdater strategy during promotion that was LCSSA aware and reformed
+    // it as it went.
+    if (Changed)
+      formLCSSARecursively(*L, *DT, getAnalysisIfAvailable<ScalarEvolution>());
   }
 
+  // Check that neither this loop nor its parent have had LCSSA broken. LICM is
+  // specifically moving instructions across the loop boundary and so it is
+  // especially in need of sanity checking here.
+  assert(L->isLCSSAForm(*DT) && "Loop not left in LCSSA form after LICM!");
+  assert((!L->getParentLoop() || L->getParentLoop()->isLCSSAForm(*DT)) &&
+         "Parent loop not left in LCSSA form after LICM!");
+
   // Clear out loops state information for the next iteration
   CurLoop = 0;
   Preheader = 0;
@@ -364,7 +396,7 @@ void LICM::HoistRegion(DomTreeNode *N) {
       // Try constant folding this instruction.  If all the operands are
       // constants, it is technically hoistable, but it would be better to just
       // fold it.
-      if (Constant *C = ConstantFoldInstruction(&I, TD, TLI)) {
+      if (Constant *C = ConstantFoldInstruction(&I, DL, TLI)) {
         DEBUG(dbgs() << "LICM folding inst: " << I << "  --> " << *C << '\n');
         CurAST->copyValue(&I, C);
         CurAST->deleteValue(&I);
@@ -450,27 +482,54 @@ bool LICM::canSinkOrHoistInst(Instruction &I) {
   return isSafeToExecuteUnconditionally(I);
 }
 
+/// \brief Returns true if a PHINode is a trivially replaceable with an
+/// Instruction.
+///
+/// This is true when all incoming values are that instruction. This pattern
+/// occurs most often with LCSSA PHI nodes.
+static bool isTriviallyReplacablePHI(PHINode &PN, Instruction &I) {
+  for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
+    if (PN.getIncomingValue(i) != &I)
+      return false;
+
+  return true;
+}
+
 /// isNotUsedInLoop - Return true if the only users of this instruction are
 /// outside of the loop.  If this is true, we can sink the instruction to the
 /// exit blocks of the loop.
 ///
 bool LICM::isNotUsedInLoop(Instruction &I) {
-  for (Value::use_iterator UI = I.use_begin(), E = I.use_end(); UI != E; ++UI) {
-    Instruction *User = cast<Instruction>(*UI);
-    if (PHINode *PN = dyn_cast<PHINode>(User)) {
-      // PHI node uses occur in predecessor blocks!
+  for (User *U : I.users()) {
+    Instruction *UI = cast<Instruction>(U);
+    if (PHINode *PN = dyn_cast<PHINode>(UI)) {
+      // A PHI node where all of the incoming values are this instruction are
+      // special -- they can just be RAUW'ed with the instruction and thus
+      // don't require a use in the predecessor. This is a particular important
+      // special case because it is the pattern found in LCSSA form.
+      if (isTriviallyReplacablePHI(*PN, I)) {
+        if (CurLoop->contains(PN))
+          return false;
+        else
+          continue;
+      }
+
+      // Otherwise, PHI node uses occur in predecessor blocks if the incoming
+      // values. Check for such a use being inside the loop.
       for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
         if (PN->getIncomingValue(i) == &I)
           if (CurLoop->contains(PN->getIncomingBlock(i)))
             return false;
-    } else if (CurLoop->contains(User)) {
-      return false;
+
+      continue;
     }
+
+    if (CurLoop->contains(UI))
+      return false;
   }
   return true;
 }
 
-
 /// sink - When an instruction is found to only be used outside of the loop,
 /// this function moves it to the exit blocks and patches up SSA form as needed.
 /// This method is guaranteed to remove the original instruction from its
@@ -479,119 +538,59 @@ bool LICM::isNotUsedInLoop(Instruction &I) {
 void LICM::sink(Instruction &I) {
   DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
 
-  SmallVector<BasicBlock*, 8> ExitBlocks;
-  CurLoop->getUniqueExitBlocks(ExitBlocks);
-
   if (isa<LoadInst>(I)) ++NumMovedLoads;
   else if (isa<CallInst>(I)) ++NumMovedCalls;
   ++NumSunk;
   Changed = true;
 
-  // The case where there is only a single exit node of this loop is common
-  // enough that we handle it as a special (more efficient) case.  It is more
-  // efficient to handle because there are no PHI nodes that need to be placed.
-  if (ExitBlocks.size() == 1) {
-    if (!DT->dominates(I.getParent(), ExitBlocks[0])) {
-      // Instruction is not used, just delete it.
-      CurAST->deleteValue(&I);
-      // If I has users in unreachable blocks, eliminate.
-      // If I is not void type then replaceAllUsesWith undef.
-      // This allows ValueHandlers and custom metadata to adjust itself.
-      if (!I.use_empty())
-        I.replaceAllUsesWith(UndefValue::get(I.getType()));
-      I.eraseFromParent();
-    } else {
-      // Move the instruction to the start of the exit block, after any PHI
-      // nodes in it.
-      I.moveBefore(ExitBlocks[0]->getFirstInsertionPt());
-
-      // This instruction is no longer in the AST for the current loop, because
-      // we just sunk it out of the loop.  If we just sunk it into an outer
-      // loop, we will rediscover the operation when we process it.
-      CurAST->deleteValue(&I);
-    }
-    return;
-  }
-
-  if (ExitBlocks.empty()) {
-    // The instruction is actually dead if there ARE NO exit blocks.
-    CurAST->deleteValue(&I);
-    // If I has users in unreachable blocks, eliminate.
-    // If I is not void type then replaceAllUsesWith undef.
-    // This allows ValueHandlers and custom metadata to adjust itself.
-    if (!I.use_empty())
-      I.replaceAllUsesWith(UndefValue::get(I.getType()));
-    I.eraseFromParent();
-    return;
-  }
-
-  // Otherwise, if we have multiple exits, use the SSAUpdater to do all of the
-  // hard work of inserting PHI nodes as necessary.
-  SmallVector<PHINode*, 8> NewPHIs;
-  SSAUpdater SSA(&NewPHIs);
-
-  if (!I.use_empty())
-    SSA.Initialize(I.getType(), I.getName());
-
-  // Insert a copy of the instruction in each exit block of the loop that is
-  // dominated by the instruction.  Each exit block is known to only be in the
-  // ExitBlocks list once.
-  BasicBlock *InstOrigBB = I.getParent();
-  unsigned NumInserted = 0;
-
-  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
-    BasicBlock *ExitBlock = ExitBlocks[i];
-
-    if (!DT->dominates(InstOrigBB, ExitBlock))
-      continue;
-
-    // Insert the code after the last PHI node.
-    BasicBlock::iterator InsertPt = ExitBlock->getFirstInsertionPt();
-
-    // If this is the first exit block processed, just move the original
-    // instruction, otherwise clone the original instruction and insert
-    // the copy.
-    Instruction *New;
-    if (NumInserted++ == 0) {
-      I.moveBefore(InsertPt);
-      New = &I;
-    } else {
-      New = I.clone();
-      if (!I.getName().empty())
-        New->setName(I.getName()+".le");
-      ExitBlock->getInstList().insert(InsertPt, New);
-    }
-
-    // Now that we have inserted the instruction, inform SSAUpdater.
-    if (!I.use_empty())
-      SSA.AddAvailableValue(ExitBlock, New);
-  }
-
-  // If the instruction doesn't dominate any exit blocks, it must be dead.
-  if (NumInserted == 0) {
-    CurAST->deleteValue(&I);
-    if (!I.use_empty())
-      I.replaceAllUsesWith(UndefValue::get(I.getType()));
-    I.eraseFromParent();
-    return;
-  }
+#ifndef NDEBUG
+  SmallVector<BasicBlock *, 32> ExitBlocks;
+  CurLoop->getUniqueExitBlocks(ExitBlocks);
+  SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(), ExitBlocks.end());
+#endif
+
+  // If this instruction is only used outside of the loop, then all users are
+  // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of
+  // the instruction.
+  while (!I.use_empty()) {
+    // The user must be a PHI node.
+    PHINode *PN = cast<PHINode>(I.user_back());
+
+    BasicBlock *ExitBlock = PN->getParent();
+    assert(ExitBlockSet.count(ExitBlock) &&
+           "The LCSSA PHI is not in an exit block!");
+
+    Instruction *New = I.clone();
+    ExitBlock->getInstList().insert(ExitBlock->getFirstInsertionPt(), New);
+    if (!I.getName().empty())
+      New->setName(I.getName() + ".le");
+
+    // Build LCSSA PHI nodes for any in-loop operands. Note that this is
+    // particularly cheap because we can rip off the PHI node that we're
+    // replacing for the number and blocks of the predecessors.
+    // OPT: If this shows up in a profile, we can instead finish sinking all
+    // invariant instructions, and then walk their operands to re-establish
+    // LCSSA. That will eliminate creating PHI nodes just to nuke them when
+    // sinking bottom-up.
+    for (User::op_iterator OI = New->op_begin(), OE = New->op_end(); OI != OE;
+         ++OI)
+      if (Instruction *OInst = dyn_cast<Instruction>(*OI))
+        if (Loop *OLoop = LI->getLoopFor(OInst->getParent()))
+          if (!OLoop->contains(PN)) {
+            PHINode *OpPN = PHINode::Create(
+                OInst->getType(), PN->getNumIncomingValues(),
+                OInst->getName() + ".lcssa", ExitBlock->begin());
+            for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+              OpPN->addIncoming(OInst, PN->getIncomingBlock(i));
+            *OI = OpPN;
+          }
 
-  // Next, rewrite uses of the instruction, inserting PHI nodes as needed.
-  for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE; ) {
-    // Grab the use before incrementing the iterator.
-    Use &U = UI.getUse();
-    // Increment the iterator before removing the use from the list.
-    ++UI;
-    SSA.RewriteUseAfterInsertions(U);
+    PN->replaceAllUsesWith(New);
+    PN->eraseFromParent();
   }
 
-  // Update CurAST for NewPHIs if I had pointer type.
-  if (I.getType()->isPointerTy())
-    for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i)
-      CurAST->copyValue(&I, NewPHIs[i]);
-
-  // Finally, remove the instruction from CurAST.  It is no longer in the loop.
   CurAST->deleteValue(&I);
+  I.eraseFromParent();
 }
 
 /// hoist - When an instruction is found to only use loop invariant operands
@@ -662,24 +661,42 @@ namespace {
     SmallPtrSet<Value*, 4> &PointerMustAliases;
     SmallVectorImpl<BasicBlock*> &LoopExitBlocks;
     SmallVectorImpl<Instruction*> &LoopInsertPts;
+    PredIteratorCache &PredCache;
     AliasSetTracker &AST;
+    LoopInfo &LI;
     DebugLoc DL;
     int Alignment;
     MDNode *TBAATag;
+
+    Value *maybeInsertLCSSAPHI(Value *V, BasicBlock *BB) const {
+      if (Instruction *I = dyn_cast<Instruction>(V))
+        if (Loop *L = LI.getLoopFor(I->getParent()))
+          if (!L->contains(BB)) {
+            // We need to create an LCSSA PHI node for the incoming value and
+            // store that.
+            PHINode *PN = PHINode::Create(
+                I->getType(), PredCache.GetNumPreds(BB),
+                I->getName() + ".lcssa", BB->begin());
+            for (BasicBlock **PI = PredCache.GetPreds(BB); *PI; ++PI)
+              PN->addIncoming(I, *PI);
+            return PN;
+          }
+      return V;
+    }
+
   public:
-    LoopPromoter(Value *SP,
-                 const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S,
-                 SmallPtrSet<Value*, 4> &PMA,
-                 SmallVectorImpl<BasicBlock*> &LEB,
-                 SmallVectorImpl<Instruction*> &LIP,
-                 AliasSetTracker &ast, DebugLoc dl, int alignment,
+    LoopPromoter(Value *SP, const SmallVectorImpl<Instruction *> &Insts,
+                 SSAUpdater &S, SmallPtrSet<Value *, 4> &PMA,
+                 SmallVectorImpl<BasicBlock *> &LEB,
+                 SmallVectorImpl<Instruction *> &LIP, PredIteratorCache &PIC,
+                 AliasSetTracker &ast, LoopInfo &li, DebugLoc dl, int alignment,
                  MDNode *TBAATag)
-      : LoadAndStorePromoter(Insts, S), SomePtr(SP),
-        PointerMustAliases(PMA), LoopExitBlocks(LEB), LoopInsertPts(LIP),
-        AST(ast), DL(dl), Alignment(alignment), TBAATag(TBAATag) {}
+        : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA),
+          LoopExitBlocks(LEB), LoopInsertPts(LIP), PredCache(PIC), AST(ast),
+          LI(li), DL(dl), Alignment(alignment), TBAATag(TBAATag) {}
 
-    virtual bool isInstInList(Instruction *I,
-                              const SmallVectorImpl<Instruction*> &) const {
+    bool isInstInList(Instruction *I,
+                      const SmallVectorImpl<Instruction*> &) const override {
       Value *Ptr;
       if (LoadInst *LI = dyn_cast<LoadInst>(I))
         Ptr = LI->getOperand(0);
@@ -688,7 +705,7 @@ namespace {
       return PointerMustAliases.count(Ptr);
     }
 
-    virtual void doExtraRewritesBeforeFinalDeletion() const {
+    void doExtraRewritesBeforeFinalDeletion() const override {
       // Insert stores after in the loop exit blocks.  Each exit block gets a
       // store of the live-out values that feed them.  Since we've already told
       // the SSA updater about the defs in the loop and the preheader
@@ -696,19 +713,21 @@ namespace {
       for (unsigned i = 0, e = LoopExitBlocks.size(); i != e; ++i) {
         BasicBlock *ExitBlock = LoopExitBlocks[i];
         Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock);
+        LiveInValue = maybeInsertLCSSAPHI(LiveInValue, ExitBlock);
+        Value *Ptr = maybeInsertLCSSAPHI(SomePtr, ExitBlock);
         Instruction *InsertPos = LoopInsertPts[i];
-        StoreInst *NewSI = new StoreInst(LiveInValue, SomePtr, InsertPos);
+        StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos);
         NewSI->setAlignment(Alignment);
         NewSI->setDebugLoc(DL);
         if (TBAATag) NewSI->setMetadata(LLVMContext::MD_tbaa, TBAATag);
       }
     }
 
-    virtual void replaceLoadWithValue(LoadInst *LI, Value *V) const {
+    void replaceLoadWithValue(LoadInst *LI, Value *V) const override {
       // Update alias analysis.
       AST.copyValue(LI, V);
     }
-    virtual void instructionDeleted(Instruction *I) const {
+    void instructionDeleted(Instruction *I) const override {
       AST.deleteValue(I);
     }
   };
@@ -721,7 +740,8 @@ namespace {
 ///
 void LICM::PromoteAliasSet(AliasSet &AS,
                            SmallVectorImpl<BasicBlock*> &ExitBlocks,
-                           SmallVectorImpl<Instruction*> &InsertPts) {
+                           SmallVectorImpl<Instruction*> &InsertPts,
+                           PredIteratorCache &PIC) {
   // We can promote this alias set if it has a store, if it is a "Must" alias
   // set, if the pointer is loop invariant, and if we are not eliminating any
   // volatile loads or stores.
@@ -769,23 +789,22 @@ void LICM::PromoteAliasSet(AliasSet &AS,
     if (SomePtr->getType() != ASIV->getType())
       return;
 
-    for (Value::use_iterator UI = ASIV->use_begin(), UE = ASIV->use_end();
-         UI != UE; ++UI) {
+    for (User *U : ASIV->users()) {
       // Ignore instructions that are outside the loop.
-      Instruction *Use = dyn_cast<Instruction>(*UI);
-      if (!Use || !CurLoop->contains(Use))
+      Instruction *UI = dyn_cast<Instruction>(U);
+      if (!UI || !CurLoop->contains(UI))
         continue;
 
       // If there is an non-load/store instruction in the loop, we can't promote
       // it.
-      if (LoadInst *load = dyn_cast<LoadInst>(Use)) {
+      if (LoadInst *load = dyn_cast<LoadInst>(UI)) {
         assert(!load->isVolatile() && "AST broken");
         if (!load->isSimple())
           return;
-      } else if (StoreInst *store = dyn_cast<StoreInst>(Use)) {
+      } else if (StoreInst *store = dyn_cast<StoreInst>(UI)) {
         // Stores *of* the pointer are not interesting, only stores *to* the
         // pointer.
-        if (Use->getOperand(1) != ASIV)
+        if (UI->getOperand(1) != ASIV)
           continue;
         assert(!store->isVolatile() && "AST broken");
         if (!store->isSimple())
@@ -801,13 +820,13 @@ void LICM::PromoteAliasSet(AliasSet &AS,
         // Larger is better, with the exception of 0 being the best alignment.
         unsigned InstAlignment = store->getAlignment();
         if ((InstAlignment > Alignment || InstAlignment == 0) && Alignment != 0)
-          if (isGuaranteedToExecute(*Use)) {
+          if (isGuaranteedToExecute(*UI)) {
             GuaranteedToExecute = true;
             Alignment = InstAlignment;
           }
 
         if (!GuaranteedToExecute)
-          GuaranteedToExecute = isGuaranteedToExecute(*Use);
+          GuaranteedToExecute = isGuaranteedToExecute(*UI);
 
       } else
         return; // Not a load or store.
@@ -815,13 +834,13 @@ void LICM::PromoteAliasSet(AliasSet &AS,
       // Merge the TBAA tags.
       if (LoopUses.empty()) {
         // On the first load/store, just take its TBAA tag.
-        TBAATag = Use->getMetadata(LLVMContext::MD_tbaa);
+        TBAATag = UI->getMetadata(LLVMContext::MD_tbaa);
       } else if (TBAATag) {
         TBAATag = MDNode::getMostGenericTBAA(TBAATag,
-                                       Use->getMetadata(LLVMContext::MD_tbaa));
+                                       UI->getMetadata(LLVMContext::MD_tbaa));
       }
-      
-      LoopUses.push_back(Use);
+
+      LoopUses.push_back(UI);
     }
   }
 
@@ -853,7 +872,7 @@ void LICM::PromoteAliasSet(AliasSet &AS,
   SmallVector<PHINode*, 16> NewPHIs;
   SSAUpdater SSA(&NewPHIs);
   LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks,
-                        InsertPts, *CurAST, DL, Alignment, TBAATag);
+                        InsertPts, PIC, *CurAST, *LI, DL, Alignment, TBAATag);
 
   // Set up the preheader to have a definition of the value.  It is the live-out
   // value from the preheader that uses in the loop will use.
diff --git a/lib/Transforms/Scalar/LLVMBuild.txt b/lib/Transforms/Scalar/LLVMBuild.txt
index cee9119..1f6df7d 100644
--- a/lib/Transforms/Scalar/LLVMBuild.txt
+++ b/lib/Transforms/Scalar/LLVMBuild.txt
@@ -20,4 +20,4 @@ type = Library
 name = Scalar
 parent = Transforms
 library_name = ScalarOpts
-required_libraries = Analysis Core InstCombine Support Target TransformUtils
+required_libraries = Analysis Core IPA InstCombine Support Target TransformUtils
diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp
index 9e39d2e..9a520c8 100644
--- a/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -18,9 +18,9 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/IR/Dominators.h"
 using namespace llvm;
 
 STATISTIC(NumDeleted, "Number of loops deleted");
@@ -34,17 +34,17 @@ namespace {
     }
 
     // Possibly eliminate loop L if it is dead.
-    bool runOnLoop(Loop *L, LPPassManager &LPM);
+    bool runOnLoop(Loop *L, LPPassManager &LPM) override;
 
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.addRequired<DominatorTree>();
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<DominatorTreeWrapperPass>();
       AU.addRequired<LoopInfo>();
       AU.addRequired<ScalarEvolution>();
       AU.addRequiredID(LoopSimplifyID);
       AU.addRequiredID(LCSSAID);
 
       AU.addPreserved<ScalarEvolution>();
-      AU.addPreserved<DominatorTree>();
+      AU.addPreserved<DominatorTreeWrapperPass>();
       AU.addPreserved<LoopInfo>();
       AU.addPreservedID(LoopSimplifyID);
       AU.addPreservedID(LCSSAID);
@@ -61,7 +61,7 @@ namespace {
 char LoopDeletion::ID = 0;
 INITIALIZE_PASS_BEGIN(LoopDeletion, "loop-deletion",
                 "Delete dead loops", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopInfo)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
@@ -130,6 +130,9 @@ bool LoopDeletion::isLoopDead(Loop *L,
 /// NOTE: This entire process relies pretty heavily on LoopSimplify and LCSSA
 /// in order to make various safety checks work.
 bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) {
+  if (skipOptnoneFunction(L))
+    return false;
+
   // We can only remove the loop if there is a preheader that we can
   // branch from after removing it.
   BasicBlock *preheader = L->getLoopPreheader();
@@ -202,7 +205,7 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) {
 
   // Update the dominator tree and remove the instructions and blocks that will
   // be deleted from the reference counting scheme.
-  DominatorTree &DT = getAnalysis<DominatorTree>();
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   SmallVector<DomTreeNode*, 8> ChildNodes;
   for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
        LI != LE; ++LI) {
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 952b76b..e5e8b84 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -51,6 +51,7 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
@@ -78,9 +79,6 @@ namespace {
       return dyn_cast<BranchInst>(BB->getTerminator());
     }
 
-    /// Return the condition of the branch terminating the given basic block.
-    static Value *getBrCondtion(BasicBlock *);
-
     /// Derive the precondition block (i.e the block that guards the loop
     /// preheader) from the given preheader.
     static BasicBlock *getPrecondBb(BasicBlock *PreHead);
@@ -108,8 +106,8 @@ namespace {
     bool preliminaryScreen();
 
     /// Check if the given conditional branch is based on the comparison
-    /// beween a variable and zero, and if the variable is non-zero, the
-    /// control yeilds to the loop entry. If the branch matches the behavior,
+    /// between a variable and zero, and if the variable is non-zero, the
+    /// control yields to the loop entry. If the branch matches the behavior,
     /// the variable involved in the comparion is returned. This function will
     /// be called to see if the precondition and postcondition of the loop
     /// are in desirable form.
@@ -131,7 +129,7 @@ namespace {
 
   class LoopIdiomRecognize : public LoopPass {
     Loop *CurLoop;
-    const DataLayout *TD;
+    const DataLayout *DL;
     DominatorTree *DT;
     ScalarEvolution *SE;
     TargetLibraryInfo *TLI;
@@ -140,10 +138,10 @@ namespace {
     static char ID;
     explicit LoopIdiomRecognize() : LoopPass(ID) {
       initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry());
-      TD = 0; DT = 0; SE = 0; TLI = 0; TTI = 0;
+      DL = 0; DT = 0; SE = 0; TLI = 0; TTI = 0;
     }
 
-    bool runOnLoop(Loop *L, LPPassManager &LPM);
+    bool runOnLoop(Loop *L, LPPassManager &LPM) override;
     bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
                         SmallVectorImpl<BasicBlock*> &ExitBlocks);
 
@@ -163,7 +161,7 @@ namespace {
     /// This transformation requires natural loop information & requires that
     /// loop preheaders be inserted into the CFG.
     ///
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<LoopInfo>();
       AU.addPreserved<LoopInfo>();
       AU.addRequiredID(LoopSimplifyID);
@@ -174,18 +172,23 @@ namespace {
       AU.addPreserved<AliasAnalysis>();
       AU.addRequired<ScalarEvolution>();
       AU.addPreserved<ScalarEvolution>();
-      AU.addPreserved<DominatorTree>();
-      AU.addRequired<DominatorTree>();
+      AU.addPreserved<DominatorTreeWrapperPass>();
+      AU.addRequired<DominatorTreeWrapperPass>();
       AU.addRequired<TargetLibraryInfo>();
       AU.addRequired<TargetTransformInfo>();
     }
 
     const DataLayout *getDataLayout() {
-      return TD ? TD : TD=getAnalysisIfAvailable<DataLayout>();
+      if (DL)
+        return DL;
+      DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+      DL = DLP ? &DLP->getDataLayout() : 0;
+      return DL;
     }
 
     DominatorTree *getDominatorTree() {
-      return DT ? DT : (DT=&getAnalysis<DominatorTree>());
+      return DT ? DT
+                : (DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree());
     }
 
     ScalarEvolution *getScalarEvolution() {
@@ -212,7 +215,7 @@ char LoopIdiomRecognize::ID = 0;
 INITIALIZE_PASS_BEGIN(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms",
                       false, false)
 INITIALIZE_PASS_DEPENDENCY(LoopInfo)
-INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_DEPENDENCY(LCSSA)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
@@ -286,11 +289,6 @@ bool LIRUtil::isAlmostEmpty(BasicBlock *BB) {
   return false;
 }
 
-Value *LIRUtil::getBrCondtion(BasicBlock *BB) {
-  BranchInst *Br = getBranch(BB);
-  return Br ? Br->getCondition() : 0;
-}
-
 BasicBlock *LIRUtil::getPrecondBb(BasicBlock *PreHead) {
   if (BasicBlock *BB = PreHead->getSinglePredecessor()) {
     BranchInst *Br = getBranch(BB);
@@ -458,9 +456,8 @@ bool NclPopcountRecognize::detectIdiom(Instruction *&CntInst,
 
       // Check if the result of the instruction is live of the loop.
       bool LiveOutLoop = false;
-      for (Value::use_iterator I = Inst->use_begin(), E = Inst->use_end();
-             I != E;  I++) {
-        if ((cast<Instruction>(*I))->getParent() != LoopEntry) {
+      for (User *U : Inst->users()) {
+        if ((cast<Instruction>(U))->getParent() != LoopEntry) {
           LiveOutLoop = true; break;
         }
       }
@@ -519,7 +516,7 @@ void NclPopcountRecognize::transform(Instruction *CntInst,
     // TripCnt is exactly the number of iterations the loop has
     TripCnt = NewCount;
 
-    // If the popoulation counter's initial value is not zero, insert Add Inst.
+    // If the population counter's initial value is not zero, insert Add Inst.
     Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
     ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
     if (!InitConst || !InitConst->isZero()) {
@@ -596,11 +593,9 @@ void NclPopcountRecognize::transform(Instruction *CntInst,
   //  __builtin_ctpop().
   {
     SmallVector<Value *, 4> CntUses;
-    for (Value::use_iterator I = CntInst->use_begin(), E = CntInst->use_end();
-         I != E; I++) {
-      if (cast<Instruction>(*I)->getParent() != Body)
-        CntUses.push_back(*I);
-    }
+    for (User *U : CntInst->users())
+      if (cast<Instruction>(U)->getParent() != Body)
+        CntUses.push_back(U);
     for (unsigned Idx = 0; Idx < CntUses.size(); Idx++) {
       (cast<Instruction>(CntUses[Idx]))->replaceUsesOfWith(CntInst, NewCount);
     }
@@ -705,6 +700,9 @@ bool LoopIdiomRecognize::runOnNoncountableLoop() {
 }
 
 bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
+  if (skipOptnoneFunction(L))
+    return false;
+
   CurLoop = L;
 
   // If the loop could not be converted to canonical form, it must have an
@@ -777,7 +775,7 @@ bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) {
   Value *StorePtr = SI->getPointerOperand();
 
   // Reject stores that are so large that they overflow an unsigned.
-  uint64_t SizeInBits = TD->getTypeSizeInBits(StoredVal->getType());
+  uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
   if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
     return false;
 
@@ -905,7 +903,7 @@ static bool mayLoopAccessLocation(Value *Ptr,AliasAnalysis::ModRefResult Access,
 ///
 /// Note that we don't ever attempt to use memset_pattern8 or 4, because these
 /// just replicate their input array and then pass on to memset_pattern16.
-static Constant *getMemSetPatternValue(Value *V, const DataLayout &TD) {
+static Constant *getMemSetPatternValue(Value *V, const DataLayout &DL) {
   // If the value isn't a constant, we can't promote it to being in a constant
   // array.  We could theoretically do a store to an alloca or something, but
   // that doesn't seem worthwhile.
@@ -913,12 +911,12 @@ static Constant *getMemSetPatternValue(Value *V, const DataLayout &TD) {
   if (C == 0) return 0;
 
   // Only handle simple values that are a power of two bytes in size.
-  uint64_t Size = TD.getTypeSizeInBits(V->getType());
+  uint64_t Size = DL.getTypeSizeInBits(V->getType());
   if (Size == 0 || (Size & 7) || (Size & (Size-1)))
     return 0;
 
   // Don't care enough about darwin/ppc to implement this.
-  if (TD.isBigEndian())
+  if (DL.isBigEndian())
     return 0;
 
   // Convert to size in bytes.
@@ -965,7 +963,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
     PatternValue = 0;
   } else if (DestAS == 0 &&
              TLI->has(LibFunc::memset_pattern16) &&
-             (PatternValue = getMemSetPatternValue(StoredVal, *TD))) {
+             (PatternValue = getMemSetPatternValue(StoredVal, *DL))) {
     // Don't create memset_pattern16s with address spaces.
     // It looks like we can use PatternValue!
     SplatValue = 0;
@@ -1006,7 +1004,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
 
   // The # stored bytes is (BECount+1)*Size.  Expand the trip count out to
   // pointer size if it isn't already.
-  Type *IntPtr = Builder.getIntPtrTy(TD, DestAS);
+  Type *IntPtr = Builder.getIntPtrTy(DL, DestAS);
   BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr);
 
   const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1),
@@ -1120,7 +1118,7 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
 
   // The # stored bytes is (BECount+1)*Size.  Expand the trip count out to
   // pointer size if it isn't already.
-  Type *IntPtrTy = Builder.getIntPtrTy(TD, SI->getPointerAddressSpace());
+  Type *IntPtrTy = Builder.getIntPtrTy(DL, SI->getPointerAddressSpace());
   BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy);
 
   const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtrTy, 1),
diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp
index a23860a..263ba93 100644
--- a/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -13,13 +13,13 @@
 
 #define DEBUG_TYPE "loop-instsimplify"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/Analysis/Dominators.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/TargetLibraryInfo.h"
@@ -36,9 +36,9 @@ namespace {
       initializeLoopInstSimplifyPass(*PassRegistry::getPassRegistry());
     }
 
-    bool runOnLoop(Loop*, LPPassManager&);
+    bool runOnLoop(Loop*, LPPassManager&) override;
 
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesCFG();
       AU.addRequired<LoopInfo>();
       AU.addRequiredID(LoopSimplifyID);
@@ -54,7 +54,7 @@ char LoopInstSimplify::ID = 0;
 INITIALIZE_PASS_BEGIN(LoopInstSimplify, "loop-instsimplify",
                 "Simplify instructions in loops", false, false)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
-INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopInfo)
 INITIALIZE_PASS_DEPENDENCY(LCSSA)
 INITIALIZE_PASS_END(LoopInstSimplify, "loop-instsimplify",
@@ -65,9 +65,15 @@ Pass *llvm::createLoopInstSimplifyPass() {
 }
 
 bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
-  DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>();
+  if (skipOptnoneFunction(L))
+    return false;
+
+  DominatorTreeWrapperPass *DTWP =
+      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+  DominatorTree *DT = DTWP ? &DTWP->getDomTree() : 0;
   LoopInfo *LI = &getAnalysis<LoopInfo>();
-  const DataLayout *TD = getAnalysisIfAvailable<DataLayout>();
+  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+  const DataLayout *DL = DLP ? &DLP->getDataLayout() : 0;
   const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
 
   SmallVector<BasicBlock*, 8> ExitBlocks;
@@ -109,12 +115,11 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
 
         // Don't bother simplifying unused instructions.
         if (!I->use_empty()) {
-          Value *V = SimplifyInstruction(I, TD, TLI, DT);
+          Value *V = SimplifyInstruction(I, DL, TLI, DT);
           if (V && LI->replacementPreservesLCSSAForm(I, V)) {
             // Mark all uses for resimplification next time round the loop.
-            for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
-                 UI != UE; ++UI)
-              Next->insert(cast<Instruction>(*UI));
+            for (User *U : I->users())
+              Next->insert(cast<Instruction>(U));
 
             I->replaceAllUsesWith(V);
             LocalChanged = true;
diff --git a/lib/Transforms/Scalar/LoopRerollPass.cpp b/lib/Transforms/Scalar/LoopRerollPass.cpp
index 335af81..81c1e42 100644
--- a/lib/Transforms/Scalar/LoopRerollPass.cpp
+++ b/lib/Transforms/Scalar/LoopRerollPass.cpp
@@ -13,9 +13,9 @@
 
 #define DEBUG_TYPE "loop-reroll"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AliasSetTracker.h"
 #include "llvm/Analysis/LoopPass.h"
@@ -24,6 +24,7 @@
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -124,14 +125,14 @@ namespace {
       initializeLoopRerollPass(*PassRegistry::getPassRegistry());
     }
 
-    bool runOnLoop(Loop *L, LPPassManager &LPM);
+    bool runOnLoop(Loop *L, LPPassManager &LPM) override;
 
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<AliasAnalysis>();
       AU.addRequired<LoopInfo>();
       AU.addPreserved<LoopInfo>();
-      AU.addRequired<DominatorTree>();
-      AU.addPreserved<DominatorTree>();
+      AU.addRequired<DominatorTreeWrapperPass>();
+      AU.addPreserved<DominatorTreeWrapperPass>();
       AU.addRequired<ScalarEvolution>();
       AU.addRequired<TargetLibraryInfo>();
     }
@@ -140,7 +141,7 @@ protected:
     AliasAnalysis *AA;
     LoopInfo *LI;
     ScalarEvolution *SE;
-    DataLayout *DL;
+    const DataLayout *DL;
     TargetLibraryInfo *TLI;
     DominatorTree *DT;
 
@@ -189,12 +190,12 @@ protected:
 
       iterator begin() {
         assert(Valid && "Using invalid reduction");
-        return llvm::next(Instructions.begin());
+        return std::next(Instructions.begin());
       }
 
       const_iterator begin() const {
         assert(Valid && "Using invalid reduction");
-        return llvm::next(Instructions.begin());
+        return std::next(Instructions.begin());
       }
 
       iterator end() { return Instructions.end(); }
@@ -340,7 +341,7 @@ char LoopReroll::ID = 0;
 INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_PASS_DEPENDENCY(LoopInfo)
-INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
 INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false)
@@ -353,12 +354,9 @@ Pass *llvm::createLoopRerollPass() {
 // This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in
 // non-loop blocks to be outside the loop.
 static bool hasUsesOutsideLoop(Instruction *I, Loop *L) {
-  for (Value::use_iterator UI = I->use_begin(),
-       UIE = I->use_end(); UI != UIE; ++UI) {
-    Instruction *User = cast<Instruction>(*UI);
-    if (!L->contains(User))
+  for (User *U : I->users())
+    if (!L->contains(cast<Instruction>(U)))
       return true;
-  }
 
   return false;
 }
@@ -408,7 +406,7 @@ void LoopReroll::SimpleLoopReduction::add(Loop *L) {
   Instruction *C = Instructions.front();
 
   do {
-    C = cast<Instruction>(*C->use_begin());
+    C = cast<Instruction>(*C->user_begin());
     if (C->hasOneUse()) {
       if (!C->isBinaryOp())
         return;
@@ -423,17 +421,15 @@ void LoopReroll::SimpleLoopReduction::add(Loop *L) {
 
   if (Instructions.size() < 2 ||
       !C->isSameOperationAs(Instructions.back()) ||
-      C->use_begin() == C->use_end())
+      C->use_empty())
     return;
 
   // C is now the (potential) last instruction in the reduction chain.
-  for (Value::use_iterator UI = C->use_begin(), UIE = C->use_end();
-       UI != UIE; ++UI) {
+  for (User *U : C->users())
     // The only in-loop user can be the initial PHI.
-    if (L->contains(cast<Instruction>(*UI)))
-      if (cast<Instruction>(*UI ) != Instructions.front())
+    if (L->contains(cast<Instruction>(U)))
+      if (cast<Instruction>(U) != Instructions.front())
         return;
-  }
 
   Instructions.push_back(C);
   Valid = true;
@@ -483,12 +479,11 @@ void LoopReroll::collectInLoopUserSet(Loop *L,
       continue;
 
     if (!Final.count(I))
-      for (Value::use_iterator UI = I->use_begin(),
-           UIE = I->use_end(); UI != UIE; ++UI) {
-        Instruction *User = cast<Instruction>(*UI);
+      for (Use &U : I->uses()) {
+        Instruction *User = cast<Instruction>(U.getUser());
         if (PHINode *PN = dyn_cast<PHINode>(User)) {
           // Ignore "wrap-around" uses to PHIs of this loop's header.
-          if (PN->getIncomingBlock(UI) == L->getHeader())
+          if (PN->getIncomingBlock(U) == L->getHeader())
             continue;
         }
   
@@ -559,8 +554,8 @@ bool LoopReroll::findScaleFromMul(Instruction *RealIV, uint64_t &Scale,
   if (RealIV->getNumUses() != 2)
     return false;
   const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(RealIV));
-  Instruction *User1 = cast<Instruction>(*RealIV->use_begin()),
-              *User2 = cast<Instruction>(*llvm::next(RealIV->use_begin()));
+  Instruction *User1 = cast<Instruction>(*RealIV->user_begin()),
+              *User2 = cast<Instruction>(*std::next(RealIV->user_begin()));
   if (!SE->isSCEVable(User1->getType()) || !SE->isSCEVable(User2->getType()))
     return false;
   const SCEVAddRecExpr *User1SCEV =
@@ -616,26 +611,25 @@ bool LoopReroll::collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale,
                                  SmallVector<SmallInstructionVector, 32> &Roots,
                                  SmallInstructionSet &AllRoots,
                                  SmallInstructionVector &LoopIncs) {
-  for (Value::use_iterator UI = IV->use_begin(),
-       UIE = IV->use_end(); UI != UIE; ++UI) {
-    Instruction *User = cast<Instruction>(*UI);
-    if (!SE->isSCEVable(User->getType()))
+  for (User *U : IV->users()) {
+    Instruction *UI = cast<Instruction>(U);
+    if (!SE->isSCEVable(UI->getType()))
       continue;
-    if (User->getType() != IV->getType())
+    if (UI->getType() != IV->getType())
       continue;
-    if (!L->contains(User))
+    if (!L->contains(UI))
       continue;
-    if (hasUsesOutsideLoop(User, L))
+    if (hasUsesOutsideLoop(UI, L))
       continue;
 
     if (const SCEVConstant *Diff = dyn_cast<SCEVConstant>(SE->getMinusSCEV(
-          SE->getSCEV(User), SE->getSCEV(IV)))) {
+          SE->getSCEV(UI), SE->getSCEV(IV)))) {
       uint64_t Idx = Diff->getValue()->getValue().getZExtValue();
       if (Idx > 0 && Idx < Scale) {
-        Roots[Idx-1].push_back(User);
-        AllRoots.insert(User);
+        Roots[Idx-1].push_back(UI);
+        AllRoots.insert(UI);
       } else if (Idx == Scale && Inc > 1) {
-        LoopIncs.push_back(User);
+        LoopIncs.push_back(UI);
       }
     }
   }
@@ -719,10 +713,8 @@ void LoopReroll::ReductionTracker::replaceSelected() {
 
     // Replace users with the new end-of-chain value.
     SmallInstructionVector Users;
-    for (Value::use_iterator UI =
-           PossibleReds[i].getReducedValue()->use_begin(),
-         UIE = PossibleReds[i].getReducedValue()->use_end(); UI != UIE; ++UI)
-      Users.push_back(cast<Instruction>(*UI));
+    for (User *U : PossibleReds[i].getReducedValue()->users())
+      Users.push_back(cast<Instruction>(U));
 
     for (SmallInstructionVector::iterator J = Users.begin(),
          JE = Users.end(); J != JE; ++J)
@@ -1088,9 +1080,8 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
                            L, SCEV::FlagAnyWrap));
   { // Limit the lifetime of SCEVExpander.
     SCEVExpander Expander(*SE, "reroll");
-    PHINode *NewIV =
-      cast<PHINode>(Expander.expandCodeFor(H, IV->getType(),
-                                           Header->begin()));
+    Value *NewIV = Expander.expandCodeFor(H, IV->getType(), Header->begin());
+
     for (DenseSet<Instruction *>::iterator J = BaseUseSet.begin(),
          JE = BaseUseSet.end(); J != JE; ++J)
       (*J)->replaceUsesOfWith(IV, NewIV);
@@ -1101,20 +1092,23 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
         if (Inc == 1)
           ICSCEV =
             SE->getMulExpr(ICSCEV, SE->getConstant(ICSCEV->getType(), Scale));
-        Value *IC;
-        if (isa<SCEVConstant>(ICSCEV)) {
-          IC = Expander.expandCodeFor(ICSCEV, NewIV->getType(), BI);
+        // Iteration count SCEV minus 1
+        const SCEV *ICMinus1SCEV =
+          SE->getMinusSCEV(ICSCEV, SE->getConstant(ICSCEV->getType(), 1));
+
+        Value *ICMinus1; // Iteration count minus 1
+        if (isa<SCEVConstant>(ICMinus1SCEV)) {
+          ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), BI);
         } else {
           BasicBlock *Preheader = L->getLoopPreheader();
           if (!Preheader)
             Preheader = InsertPreheaderForLoop(L, this);
 
-          IC = Expander.expandCodeFor(ICSCEV, NewIV->getType(),
-                                      Preheader->getTerminator());
+          ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(),
+                                            Preheader->getTerminator());
         }
  
-        Value *NewIVNext = NewIV->getIncomingValueForBlock(Header); 
-        Value *Cond = new ICmpInst(BI, CmpInst::ICMP_EQ, NewIVNext, IC,
+        Value *Cond = new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinus1,
                                    "exitcond");
         BI->setCondition(Cond);
 
@@ -1131,12 +1125,16 @@ bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
 }
 
 bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
+  if (skipOptnoneFunction(L))
+    return false;
+
   AA = &getAnalysis<AliasAnalysis>();
   LI = &getAnalysis<LoopInfo>();
   SE = &getAnalysis<ScalarEvolution>();
   TLI = &getAnalysis<TargetLibraryInfo>();
-  DL = getAnalysisIfAvailable<DataLayout>();
-  DT = &getAnalysis<DominatorTree>();
+  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+  DL = DLP ? &DLP->getDataLayout() : 0;
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
 
   BasicBlock *Header = L->getHeader();
   DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() <<
diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp
index 14c5655..fde6bac 100644
--- a/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/lib/Transforms/Scalar/LoopRotation.cpp
@@ -20,9 +20,10 @@
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IntrinsicInst.h"
-#include "llvm/Support/CFG.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -43,8 +44,8 @@ namespace {
     }
 
     // LCSSA form makes instruction renaming easier.
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.addPreserved<DominatorTree>();
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addPreserved<DominatorTreeWrapperPass>();
       AU.addRequired<LoopInfo>();
       AU.addPreserved<LoopInfo>();
       AU.addRequiredID(LoopSimplifyID);
@@ -55,7 +56,7 @@ namespace {
       AU.addRequired<TargetTransformInfo>();
     }
 
-    bool runOnLoop(Loop *L, LPPassManager &LPM);
+    bool runOnLoop(Loop *L, LPPassManager &LPM) override;
     bool simplifyLoopLatch(Loop *L);
     bool rotateLoop(Loop *L, bool SimplifiedLatch);
 
@@ -78,6 +79,9 @@ Pass *llvm::createLoopRotatePass() { return new LoopRotate(); }
 /// Rotate Loop L as many times as possible. Return true if
 /// the loop is rotated at least once.
 bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) {
+  if (skipOptnoneFunction(L))
+    return false;
+
   LI = &getAnalysis<LoopInfo>();
   TTI = &getAnalysis<TargetTransformInfo>();
 
@@ -130,7 +134,7 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
     for (Value::use_iterator UI = OrigHeaderVal->use_begin(),
          UE = OrigHeaderVal->use_end(); UI != UE; ) {
       // Grab the use before incrementing the iterator.
-      Use &U = UI.getUse();
+      Use &U = *UI;
 
       // Increment the iterator before removing the use from the list.
       ++UI;
@@ -251,8 +255,9 @@ bool LoopRotate::simplifyLoopLatch(Loop *L) {
   // Nuke the Latch block.
   assert(Latch->empty() && "unable to evacuate Latch");
   LI->removeBlock(Latch);
-  if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>())
-    DT->eraseNode(Latch);
+  if (DominatorTreeWrapperPass *DTWP =
+          getAnalysisIfAvailable<DominatorTreeWrapperPass>())
+    DTWP->getDomTree().eraseNode(Latch);
   Latch->eraseFromParent();
   return true;
 }
@@ -301,7 +306,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
     CodeMetrics Metrics;
     Metrics.analyzeBasicBlock(OrigHeader, *TTI);
     if (Metrics.notDuplicatable) {
-      DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non duplicatable"
+      DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
             << " instructions: "; L->dump());
       return false;
     }
@@ -433,23 +438,25 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
     // The conditional branch can't be folded, handle the general case.
     // Update DominatorTree to reflect the CFG change we just made.  Then split
     // edges as necessary to preserve LoopSimplify form.
-    if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) {
+    if (DominatorTreeWrapperPass *DTWP =
+            getAnalysisIfAvailable<DominatorTreeWrapperPass>()) {
+      DominatorTree &DT = DTWP->getDomTree();
       // Everything that was dominated by the old loop header is now dominated
       // by the original loop preheader. Conceptually the header was merged
       // into the preheader, even though we reuse the actual block as a new
       // loop latch.
-      DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
+      DomTreeNode *OrigHeaderNode = DT.getNode(OrigHeader);
       SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
                                                    OrigHeaderNode->end());
-      DomTreeNode *OrigPreheaderNode = DT->getNode(OrigPreheader);
+      DomTreeNode *OrigPreheaderNode = DT.getNode(OrigPreheader);
       for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I)
-        DT->changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode);
+        DT.changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode);
 
-      assert(DT->getNode(Exit)->getIDom() == OrigPreheaderNode);
-      assert(DT->getNode(NewHeader)->getIDom() == OrigPreheaderNode);
+      assert(DT.getNode(Exit)->getIDom() == OrigPreheaderNode);
+      assert(DT.getNode(NewHeader)->getIDom() == OrigPreheaderNode);
 
       // Update OrigHeader to be dominated by the new header block.
-      DT->changeImmediateDominator(OrigHeader, OrigLatch);
+      DT.changeImmediateDominator(OrigHeader, OrigLatch);
     }
 
     // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
@@ -459,9 +466,24 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
     NewPH->setName(NewHeader->getName() + ".lr.ph");
 
     // Preserve canonical loop form, which means that 'Exit' should have only
-    // one predecessor.
-    BasicBlock *ExitSplit = SplitCriticalEdge(L->getLoopLatch(), Exit, this);
-    ExitSplit->moveBefore(Exit);
+    // one predecessor. Note that Exit could be an exit block for multiple
+    // nested loops, causing both of the edges to now be critical and need to
+    // be split.
+    SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit));
+    bool SplitLatchEdge = false;
+    for (SmallVectorImpl<BasicBlock *>::iterator PI = ExitPreds.begin(),
+                                                 PE = ExitPreds.end();
+         PI != PE; ++PI) {
+      // We only need to split loop exit edges.
+      Loop *PredLoop = LI->getLoopFor(*PI);
+      if (!PredLoop || PredLoop->contains(Exit))
+        continue;
+      SplitLatchEdge |= L->getLoopLatch() == *PI;
+      BasicBlock *ExitSplit = SplitCriticalEdge(*PI, Exit, this);
+      ExitSplit->moveBefore(Exit);
+    }
+    assert(SplitLatchEdge &&
+           "Despite splitting all preds, failed to split latch exit?");
   } else {
     // We can fold the conditional branch in the preheader, this makes things
     // simpler. The first step is to remove the extra edge to the Exit block.
@@ -471,15 +493,17 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
     PHBI->eraseFromParent();
 
     // With our CFG finalized, update DomTree if it is available.
-    if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) {
+    if (DominatorTreeWrapperPass *DTWP =
+            getAnalysisIfAvailable<DominatorTreeWrapperPass>()) {
+      DominatorTree &DT = DTWP->getDomTree();
       // Update OrigHeader to be dominated by the new header block.
-      DT->changeImmediateDominator(NewHeader, OrigPreheader);
-      DT->changeImmediateDominator(OrigHeader, OrigLatch);
+      DT.changeImmediateDominator(NewHeader, OrigPreheader);
+      DT.changeImmediateDominator(OrigHeader, OrigLatch);
 
       // Brute force incremental dominator tree update. Call
       // findNearestCommonDominator on all CFG predecessors of each child of the
       // original header.
-      DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
+      DomTreeNode *OrigHeaderNode = DT.getNode(OrigHeader);
       SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
                                                    OrigHeaderNode->end());
       bool Changed;
@@ -492,11 +516,11 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
           pred_iterator PI = pred_begin(BB);
           BasicBlock *NearestDom = *PI;
           for (pred_iterator PE = pred_end(BB); PI != PE; ++PI)
-            NearestDom = DT->findNearestCommonDominator(NearestDom, *PI);
+            NearestDom = DT.findNearestCommonDominator(NearestDom, *PI);
 
           // Remember if this changes the DomTree.
           if (Node->getIDom()->getBlock() != NearestDom) {
-            DT->changeImmediateDominator(BB, NearestDom);
+            DT.changeImmediateDominator(BB, NearestDom);
             Changed = true;
           }
         }
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index eff5268..272a16d 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -56,22 +56,22 @@
 #define DEBUG_TYPE "loop-reduce"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallBitVector.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/IVUsers.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Assembly/Writer.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/ValueHandle.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/ValueHandle.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -394,7 +394,7 @@ void Formula::print(raw_ostream &OS) const {
   bool First = true;
   if (BaseGV) {
     if (!First) OS << " + "; else First = false;
-    WriteAsOperand(OS, BaseGV, /*PrintType=*/false);
+    BaseGV->printAsOperand(OS, /*PrintType=*/false);
   }
   if (BaseOffset != 0) {
     if (!First) OS << " + "; else First = false;
@@ -422,7 +422,7 @@ void Formula::print(raw_ostream &OS) const {
     OS << ')';
   }
   if (UnfoldedOffset != 0) {
-    if (!First) OS << " + "; else First = false;
+    if (!First) OS << " + ";
     OS << "imm(" << UnfoldedOffset << ')';
   }
 }
@@ -723,13 +723,12 @@ static bool isHighCostExpansion(const SCEV *S,
       // multiplication already generates this expression.
       if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) {
         Value *UVal = U->getValue();
-        for (Value::use_iterator UI = UVal->use_begin(), UE = UVal->use_end();
-             UI != UE; ++UI) {
+        for (User *UR : UVal->users()) {
           // If U is a constant, it may be used by a ConstantExpr.
-          Instruction *User = dyn_cast<Instruction>(*UI);
-          if (User && User->getOpcode() == Instruction::Mul
-              && SE.isSCEVable(User->getType())) {
-            return SE.getSCEV(User) == Mul;
+          Instruction *UI = dyn_cast<Instruction>(UR);
+          if (UI && UI->getOpcode() == Instruction::Mul &&
+              SE.isSCEVable(UI->getType())) {
+            return SE.getSCEV(UI) == Mul;
           }
         }
       }
@@ -804,7 +803,7 @@ public:
 
   bool operator<(const Cost &Other) const;
 
-  void Loose();
+  void Lose();
 
 #ifndef NDEBUG
   // Once any of the metrics loses, they must all remain losers.
@@ -864,7 +863,7 @@ void Cost::RateRegister(const SCEV *Reg,
         return;
 
       // Otherwise, do not consider this formula at all.
-      Loose();
+      Lose();
       return;
     }
     AddRecCost += 1; /// TODO: This should be a function of the stride.
@@ -903,7 +902,7 @@ void Cost::RatePrimaryRegister(const SCEV *Reg,
                                ScalarEvolution &SE, DominatorTree &DT,
                                SmallPtrSet<const SCEV *, 16> *LoserRegs) {
   if (LoserRegs && LoserRegs->count(Reg)) {
-    Loose();
+    Lose();
     return;
   }
   if (Regs.insert(Reg)) {
@@ -925,7 +924,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
   // Tally up the registers.
   if (const SCEV *ScaledReg = F.ScaledReg) {
     if (VisitedRegs.count(ScaledReg)) {
-      Loose();
+      Lose();
       return;
     }
     RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs);
@@ -936,7 +935,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
        E = F.BaseRegs.end(); I != E; ++I) {
     const SCEV *BaseReg = *I;
     if (VisitedRegs.count(BaseReg)) {
-      Loose();
+      Lose();
       return;
     }
     RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs);
@@ -967,8 +966,8 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
   assert(isValid() && "invalid cost");
 }
 
-/// Loose - Set this cost to a losing value.
-void Cost::Loose() {
+/// Lose - Set this cost to a losing value.
+void Cost::Lose() {
   NumRegs = ~0u;
   AddRecCost = ~0u;
   NumIVMuls = ~0u;
@@ -980,21 +979,11 @@ void Cost::Loose() {
 
 /// operator< - Choose the lower cost.
 bool Cost::operator<(const Cost &Other) const {
-  if (NumRegs != Other.NumRegs)
-    return NumRegs < Other.NumRegs;
-  if (AddRecCost != Other.AddRecCost)
-    return AddRecCost < Other.AddRecCost;
-  if (NumIVMuls != Other.NumIVMuls)
-    return NumIVMuls < Other.NumIVMuls;
-  if (NumBaseAdds != Other.NumBaseAdds)
-    return NumBaseAdds < Other.NumBaseAdds;
-  if (ScaleCost != Other.ScaleCost)
-    return ScaleCost < Other.ScaleCost;
-  if (ImmCost != Other.ImmCost)
-    return ImmCost < Other.ImmCost;
-  if (SetupCost != Other.SetupCost)
-    return SetupCost < Other.SetupCost;
-  return false;
+  return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
+                  ImmCost, SetupCost) <
+         std::tie(Other.NumRegs, Other.AddRecCost, Other.NumIVMuls,
+                  Other.NumBaseAdds, Other.ScaleCost, Other.ImmCost,
+                  Other.SetupCost);
 }
 
 void Cost::print(raw_ostream &OS) const {
@@ -1080,19 +1069,19 @@ void LSRFixup::print(raw_ostream &OS) const {
   // Store is common and interesting enough to be worth special-casing.
   if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
     OS << "store ";
-    WriteAsOperand(OS, Store->getOperand(0), /*PrintType=*/false);
+    Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
   } else if (UserInst->getType()->isVoidTy())
     OS << UserInst->getOpcodeName();
   else
-    WriteAsOperand(OS, UserInst, /*PrintType=*/false);
+    UserInst->printAsOperand(OS, /*PrintType=*/false);
 
   OS << ", OperandValToReplace=";
-  WriteAsOperand(OS, OperandValToReplace, /*PrintType=*/false);
+  OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
 
   for (PostIncLoopSet::const_iterator I = PostIncLoops.begin(),
        E = PostIncLoops.end(); I != E; ++I) {
     OS << ", PostIncLoop=";
-    WriteAsOperand(OS, (*I)->getHeader(), /*PrintType=*/false);
+    (*I)->getHeader()->printAsOperand(OS, /*PrintType=*/false);
   }
 
   if (LUIdx != ~size_t(0))
@@ -1126,11 +1115,7 @@ struct UniquifierDenseMapInfo {
   }
 
   static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
-    unsigned Result = 0;
-    for (SmallVectorImpl<const SCEV *>::const_iterator I = V.begin(),
-         E = V.end(); I != E; ++I)
-      Result ^= DenseMapInfo<const SCEV *>::getHashValue(*I);
-    return Result;
+    return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
   }
 
   static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
@@ -1158,6 +1143,8 @@ public:
     // TODO: Add a generic icmp too?
   };
 
+  typedef PointerIntPair<const SCEV *, 2, KindType> SCEVUseKindPair;
+
   KindType Kind;
   Type *AccessTy;
 
@@ -1295,7 +1282,7 @@ void LSRUse::print(raw_ostream &OS) const {
   for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
        E = Offsets.end(); I != E; ++I) {
     OS << *I;
-    if (llvm::next(I) != E)
+    if (std::next(I) != E)
       OS << ',';
   }
   OS << '}';
@@ -1504,30 +1491,6 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
 
 namespace {
 
-/// UseMapDenseMapInfo - A DenseMapInfo implementation for holding
-/// DenseMaps and DenseSets of pairs of const SCEV* and LSRUse::Kind.
-struct UseMapDenseMapInfo {
-  static std::pair<const SCEV *, LSRUse::KindType> getEmptyKey() {
-    return std::make_pair(reinterpret_cast<const SCEV *>(-1), LSRUse::Basic);
-  }
-
-  static std::pair<const SCEV *, LSRUse::KindType> getTombstoneKey() {
-    return std::make_pair(reinterpret_cast<const SCEV *>(-2), LSRUse::Basic);
-  }
-
-  static unsigned
-  getHashValue(const std::pair<const SCEV *, LSRUse::KindType> &V) {
-    unsigned Result = DenseMapInfo<const SCEV *>::getHashValue(V.first);
-    Result ^= DenseMapInfo<unsigned>::getHashValue(unsigned(V.second));
-    return Result;
-  }
-
-  static bool isEqual(const std::pair<const SCEV *, LSRUse::KindType> &LHS,
-                      const std::pair<const SCEV *, LSRUse::KindType> &RHS) {
-    return LHS == RHS;
-  }
-};
-
 /// IVInc - An individual increment in a Chain of IV increments.
 /// Relate an IV user to an expression that computes the IV it uses from the IV
 /// used by the previous link in the Chain.
@@ -1562,7 +1525,7 @@ struct IVChain {
   // begin - return the first increment in the chain.
   const_iterator begin() const {
     assert(!Incs.empty());
-    return llvm::next(Incs.begin());
+    return std::next(Incs.begin());
   }
   const_iterator end() const {
     return Incs.end();
@@ -1656,9 +1619,7 @@ class LSRInstance {
   }
 
   // Support for sharing of LSRUses between LSRFixups.
-  typedef DenseMap<std::pair<const SCEV *, LSRUse::KindType>,
-                   size_t,
-                   UseMapDenseMapInfo> UseMapTy;
+  typedef DenseMap<LSRUse::SCEVUseKindPair, size_t> UseMapTy;
   UseMapTy UseMap;
 
   bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
@@ -2229,7 +2190,7 @@ LSRInstance::getUse(const SCEV *&Expr,
   }
 
   std::pair<UseMapTy::iterator, bool> P =
-    UseMap.insert(std::make_pair(std::make_pair(Expr, Kind), 0));
+    UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0));
   if (!P.second) {
     // A use already existed with this base.
     size_t LUIdx = P.first->second;
@@ -2338,7 +2299,7 @@ void LSRInstance::CollectInterestingTypesAndFactors() {
   for (SmallSetVector<const SCEV *, 4>::const_iterator
        I = Strides.begin(), E = Strides.end(); I != E; ++I)
     for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
-         llvm::next(I); NewStrideIter != E; ++NewStrideIter) {
+         std::next(I); NewStrideIter != E; ++NewStrideIter) {
       const SCEV *OldStride = *I;
       const SCEV *NewStride = *NewStrideIter;
 
@@ -2646,9 +2607,8 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
   // they will eventually be used be the current chain, or can be computed
   // from one of the chain increments. To be more precise we could
   // transitively follow its user and only add leaf IV users to the set.
-  for (Value::use_iterator UseIter = IVOper->use_begin(),
-         UseEnd = IVOper->use_end(); UseIter != UseEnd; ++UseIter) {
-    Instruction *OtherUse = dyn_cast<Instruction>(*UseIter);
+  for (User *U : IVOper->users()) {
+    Instruction *OtherUse = dyn_cast<Instruction>(U);
     if (!OtherUse)
       continue;
     // Uses in the chain will no longer be uses if the chain is formed.
@@ -2738,7 +2698,7 @@ void LSRInstance::CollectChains() {
         Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
         if (UniqueOperands.insert(IVOpInst))
           ChainInstruction(I, IVOpInst, ChainUsersVec);
-        IVOpIter = findIVOperand(llvm::next(IVOpIter), IVOpEnd, L, SE);
+        IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
       }
     } // Continue walking down the instructions.
   } // Continue walking down the domtree.
@@ -2829,7 +2789,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
         || SE.getSCEV(IVSrc) == Head.IncExpr) {
       break;
     }
-    IVOpIter = findIVOperand(llvm::next(IVOpIter), IVOpEnd, L, SE);
+    IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
   }
   if (IVOpIter == IVOpEnd) {
     // Gracefully give up on this chain.
@@ -3059,18 +3019,17 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
     else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
       Worklist.push_back(D->getLHS());
       Worklist.push_back(D->getRHS());
-    } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
-      if (!Inserted.insert(U)) continue;
-      const Value *V = U->getValue();
+    } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
+      if (!Inserted.insert(US)) continue;
+      const Value *V = US->getValue();
       if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
         // Look for instructions defined outside the loop.
         if (L->contains(Inst)) continue;
       } else if (isa<UndefValue>(V))
         // Undef doesn't have a live range, so it doesn't matter.
         continue;
-      for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end();
-           UI != UE; ++UI) {
-        const Instruction *UserInst = dyn_cast<Instruction>(*UI);
+      for (const Use &U : V->uses()) {
+        const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
         // Ignore non-instructions.
         if (!UserInst)
           continue;
@@ -3082,7 +3041,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
         const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
           UserInst->getParent() :
           cast<PHINode>(UserInst)->getIncomingBlock(
-            PHINode::getIncomingValueNumForOperand(UI.getOperandNo()));
+            PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
         if (!DT.dominates(L->getHeader(), UseBB))
           continue;
         // Ignore uses which are part of other SCEV expressions, to avoid
@@ -3092,7 +3051,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
           // If the user is a no-op, look through to its uses.
           if (!isa<SCEVUnknown>(UserS))
             continue;
-          if (UserS == U) {
+          if (UserS == US) {
             Worklist.push_back(
               SE.getUnknown(const_cast<Instruction *>(UserInst)));
             continue;
@@ -3100,7 +3059,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
         }
         // Ignore icmp instructions which are already being analyzed.
         if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
-          unsigned OtherIdx = !UI.getOperandNo();
+          unsigned OtherIdx = !U.getOperandNo();
           Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
           if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
             continue;
@@ -3108,7 +3067,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
 
         LSRFixup &LF = getNewFixup();
         LF.UserInst = const_cast<Instruction *>(UserInst);
-        LF.OperandValToReplace = UI.getUse();
+        LF.OperandValToReplace = U;
         std::pair<size_t, int64_t> P = getUse(S, LSRUse::Basic, 0);
         LF.LUIdx = P.first;
         LF.Offset = P.second;
@@ -3118,7 +3077,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
             SE.getTypeSizeInBits(LU.WidestFixupType) <
             SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
           LU.WidestFixupType = LF.OperandValToReplace->getType();
-        InsertSupplementalFormula(U, LU, LF.LUIdx);
+        InsertSupplementalFormula(US, LU, LF.LUIdx);
         CountRegisters(LU.Formulae.back(), Uses.size() - 1);
         break;
       }
@@ -3221,10 +3180,10 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
         continue;
 
       // Collect all operands except *J.
-      SmallVector<const SCEV *, 8> InnerAddOps
-        (((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
-      InnerAddOps.append
-        (llvm::next(J), ((const SmallVector<const SCEV *, 8> &)AddOps).end());
+      SmallVector<const SCEV *, 8> InnerAddOps(
+          ((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
+      InnerAddOps.append(std::next(J),
+                         ((const SmallVector<const SCEV *, 8> &)AddOps).end());
 
       // Don't leave just a constant behind in a register if the constant could
       // be folded into an immediate field.
@@ -3390,6 +3349,10 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
     int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor;
     if (NewBaseOffset / Factor != Base.BaseOffset)
       continue;
+    // If the offset will be truncated at this use, check that it is in bounds.
+    if (!IntTy->isPointerTy() &&
+        !ConstantInt::isValueValidForType(IntTy, NewBaseOffset))
+      continue;
 
     // Check that multiplying with the use offset doesn't overflow.
     int64_t Offset = LU.MinOffset;
@@ -3398,6 +3361,10 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
     Offset = (uint64_t)Offset * Factor;
     if (Offset / Factor != LU.MinOffset)
       continue;
+    // If the offset will be truncated at this use, check that it is in bounds.
+    if (!IntTy->isPointerTy() &&
+        !ConstantInt::isValueValidForType(IntTy, Offset))
+      continue;
 
     Formula F = Base;
     F.BaseOffset = NewBaseOffset;
@@ -3432,6 +3399,10 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
       F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
       if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
         continue;
+      // If the offset will be truncated, check that it is in bounds.
+      if (!IntTy->isPointerTy() &&
+          !ConstantInt::isValueValidForType(IntTy, F.UnfoldedOffset))
+        continue;
     }
 
     // If we make it here and it's legal, add it.
@@ -3614,8 +3585,9 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
       // Conservatively examine offsets between this orig reg a few selected
       // other orig regs.
       ImmMapTy::const_iterator OtherImms[] = {
-        Imms.begin(), prior(Imms.end()),
-        Imms.lower_bound((Imms.begin()->first + prior(Imms.end())->first) / 2)
+        Imms.begin(), std::prev(Imms.end()),
+        Imms.lower_bound((Imms.begin()->first + std::prev(Imms.end())->first) /
+                         2)
       };
       for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
         ImmMapTy::const_iterator M = OtherImms[i];
@@ -4210,7 +4182,7 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
 void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
   SmallVector<const Formula *, 8> Workspace;
   Cost SolutionCost;
-  SolutionCost.Loose();
+  SolutionCost.Lose();
   Cost CurCost;
   SmallPtrSet<const SCEV *, 16> CurRegs;
   DenseSet<const SCEV *> VisitedRegs;
@@ -4281,7 +4253,7 @@ LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
       // instead of at the end, so that it can be used for other expansions.
       if (IDom == Inst->getParent() &&
           (!BetterPos || !DT.dominates(Inst, BetterPos)))
-        BetterPos = llvm::next(BasicBlock::iterator(Inst));
+        BetterPos = std::next(BasicBlock::iterator(Inst));
     }
     if (!AllDominate)
       break;
@@ -4695,7 +4667,8 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
 
 LSRInstance::LSRInstance(Loop *L, Pass *P)
     : IU(P->getAnalysis<IVUsers>()), SE(P->getAnalysis<ScalarEvolution>()),
-      DT(P->getAnalysis<DominatorTree>()), LI(P->getAnalysis<LoopInfo>()),
+      DT(P->getAnalysis<DominatorTreeWrapperPass>().getDomTree()),
+      LI(P->getAnalysis<LoopInfo>()),
       TTI(P->getAnalysis<TargetTransformInfo>()), L(L), Changed(false),
       IVIncInsertPos(0) {
   // If LoopSimplify form is not available, stay out of trouble.
@@ -4734,7 +4707,7 @@ LSRInstance::LSRInstance(Loop *L, Pass *P)
 #endif // DEBUG
 
   DEBUG(dbgs() << "\nLSR on loop ";
-        WriteAsOperand(dbgs(), L->getHeader(), /*PrintType=*/false);
+        L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
         dbgs() << ":\n");
 
   // First, perform some low-level loop optimizations.
@@ -4864,8 +4837,8 @@ public:
   LoopStrengthReduce();
 
 private:
-  bool runOnLoop(Loop *L, LPPassManager &LPM);
-  void getAnalysisUsage(AnalysisUsage &AU) const;
+  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
 };
 
 }
@@ -4874,7 +4847,7 @@ char LoopStrengthReduce::ID = 0;
 INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
                 "Loop Strength Reduction", false, false)
 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
-INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_PASS_DEPENDENCY(IVUsers)
 INITIALIZE_PASS_DEPENDENCY(LoopInfo)
@@ -4899,8 +4872,8 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<LoopInfo>();
   AU.addPreserved<LoopInfo>();
   AU.addRequiredID(LoopSimplifyID);
-  AU.addRequired<DominatorTree>();
-  AU.addPreserved<DominatorTree>();
+  AU.addRequired<DominatorTreeWrapperPass>();
+  AU.addPreserved<DominatorTreeWrapperPass>();
   AU.addRequired<ScalarEvolution>();
   AU.addPreserved<ScalarEvolution>();
   // Requiring LoopSimplify a second time here prevents IVUsers from running
@@ -4912,6 +4885,9 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
 }
 
 bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
+  if (skipOptnoneFunction(L))
+    return false;
+
   bool Changed = false;
 
   // Run the main LSR transformation.
@@ -4925,10 +4901,9 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
 #ifndef NDEBUG
     Rewriter.setDebugType(DEBUG_TYPE);
 #endif
-    unsigned numFolded =
-        Rewriter.replaceCongruentIVs(L, &getAnalysis<DominatorTree>(),
-                                     DeadInsts,
-                                     &getAnalysis<TargetTransformInfo>());
+    unsigned numFolded = Rewriter.replaceCongruentIVs(
+        L, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(), DeadInsts,
+        &getAnalysis<TargetTransformInfo>());
     if (numFolded) {
       Changed = true;
       DeleteTriviallyDeadInstructions(DeadInsts);
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 08ac38d..ecd350b 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -86,12 +87,12 @@ namespace {
     bool     UserAllowPartial;     // CurrentAllowPartial is user-specified.
     bool     UserRuntime;          // CurrentRuntime is user-specified.
 
-    bool runOnLoop(Loop *L, LPPassManager &LPM);
+    bool runOnLoop(Loop *L, LPPassManager &LPM) override;
 
     /// This transformation requires natural loop information & requires that
     /// loop preheaders be inserted into the CFG...
     ///
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<LoopInfo>();
       AU.addPreserved<LoopInfo>();
       AU.addRequiredID(LoopSimplifyID);
@@ -105,7 +106,7 @@ namespace {
       // If loop unroll does not preserve dom info then LCSSA pass on next
       // loop will receive invalid dom info.
       // For now, recreate dom info, if loop is unrolled.
-      AU.addPreserved<DominatorTree>();
+      AU.addPreserved<DominatorTreeWrapperPass>();
     }
   };
 }
@@ -124,6 +125,10 @@ Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial,
   return new LoopUnroll(Threshold, Count, AllowPartial, Runtime);
 }
 
+Pass *llvm::createSimpleLoopUnrollPass() {
+  return llvm::createLoopUnrollPass(-1, -1, 0, 0);
+}
+
 /// ApproximateLoopSize - Approximate the size of the loop.
 static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
                                     bool &NotDuplicatable,
@@ -146,6 +151,9 @@ static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
 }
 
 bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
+  if (skipOptnoneFunction(L))
+    return false;
+
   LoopInfo *LI = &getAnalysis<LoopInfo>();
   ScalarEvolution *SE = &getAnalysis<ScalarEvolution>();
   const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
@@ -158,7 +166,10 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
   TargetTransformInfo::UnrollingPreferences UP;
   UP.Threshold = CurrentThreshold;
   UP.OptSizeThreshold = OptSizeUnrollThreshold;
+  UP.PartialThreshold = CurrentThreshold;
+  UP.PartialOptSizeThreshold = OptSizeUnrollThreshold;
   UP.Count = CurrentCount;
+  UP.MaxCount = UINT_MAX;
   UP.Partial = CurrentAllowPartial;
   UP.Runtime = CurrentRuntime;
   TTI.getUnrollingPreferences(L, UP);
@@ -168,11 +179,15 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
   // function is marked as optimize-for-size, and the unroll threshold was
   // not user specified.
   unsigned Threshold = UserThreshold ? CurrentThreshold : UP.Threshold;
+  unsigned PartialThreshold =
+    UserThreshold ? CurrentThreshold : UP.PartialThreshold;
   if (!UserThreshold &&
       Header->getParent()->getAttributes().
         hasAttribute(AttributeSet::FunctionIndex,
-                     Attribute::OptimizeForSize))
+                     Attribute::OptimizeForSize)) {
     Threshold = UP.OptSizeThreshold;
+    PartialThreshold = UP.PartialOptSizeThreshold;
+  }
 
   // Find trip count and trip multiple if count is not available
   unsigned TripCount = 0;
@@ -206,14 +221,14 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
   }
 
   // Enforce the threshold.
-  if (Threshold != NoThreshold) {
+  if (Threshold != NoThreshold && PartialThreshold != NoThreshold) {
     unsigned NumInlineCandidates;
     bool notDuplicatable;
     unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates,
                                             notDuplicatable, TTI);
     DEBUG(dbgs() << "  Loop Size = " << LoopSize << "\n");
     if (notDuplicatable) {
-      DEBUG(dbgs() << "  Not unrolling loop which contains non duplicatable"
+      DEBUG(dbgs() << "  Not unrolling loop which contains non-duplicatable"
             << " instructions.\n");
       return false;
     }
@@ -233,17 +248,19 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
       }
       if (TripCount) {
         // Reduce unroll count to be modulo of TripCount for partial unrolling
-        Count = Threshold / LoopSize;
+        Count = PartialThreshold / LoopSize;
         while (Count != 0 && TripCount%Count != 0)
           Count--;
       }
       else if (Runtime) {
         // Reduce unroll count to be a lower power-of-two value
-        while (Count != 0 && Size > Threshold) {
+        while (Count != 0 && Size > PartialThreshold) {
           Count >>= 1;
           Size = LoopSize*Count;
         }
       }
+      if (Count > UP.MaxCount)
+        Count = UP.MaxCount;
       if (Count < 2) {
         DEBUG(dbgs() << "  could not unroll partially\n");
         return false;
@@ -253,7 +270,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
   }
 
   // Unroll the loop.
-  if (!UnrollLoop(L, Count, TripCount, Runtime, TripMultiple, LI, &LPM))
+  if (!UnrollLoop(L, Count, TripCount, Runtime, TripMultiple, LI, this, &LPM))
     return false;
 
   return true;
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index c4ebfd5..5954f4a 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -32,7 +32,6 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
@@ -40,6 +39,7 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/Support/CommandLine.h"
@@ -156,27 +156,27 @@ namespace {
         initializeLoopUnswitchPass(*PassRegistry::getPassRegistry());
       }
 
-    bool runOnLoop(Loop *L, LPPassManager &LPM);
+    bool runOnLoop(Loop *L, LPPassManager &LPM) override;
     bool processCurrentLoop();
 
     /// This transformation requires natural loop information & requires that
     /// loop preheaders be inserted into the CFG.
     ///
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequiredID(LoopSimplifyID);
       AU.addPreservedID(LoopSimplifyID);
       AU.addRequired<LoopInfo>();
       AU.addPreserved<LoopInfo>();
       AU.addRequiredID(LCSSAID);
       AU.addPreservedID(LCSSAID);
-      AU.addPreserved<DominatorTree>();
+      AU.addPreserved<DominatorTreeWrapperPass>();
       AU.addPreserved<ScalarEvolution>();
       AU.addRequired<TargetTransformInfo>();
     }
 
   private:
 
-    virtual void releaseMemory() {
+    void releaseMemory() override {
       BranchesInfo.forgetLoop(currentLoop);
     }
 
@@ -225,7 +225,7 @@ bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI) {
 
   LoopPropsMapIt PropsIt;
   bool Inserted;
-  llvm::tie(PropsIt, Inserted) =
+  std::tie(PropsIt, Inserted) =
       LoopsProperties.insert(std::make_pair(L, LoopProperties()));
 
   LoopProperties &Props = PropsIt->second;
@@ -382,9 +382,14 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) {
 }
 
 bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
+  if (skipOptnoneFunction(L))
+    return false;
+
   LI = &getAnalysis<LoopInfo>();
   LPM = &LPM_Ref;
-  DT = getAnalysisIfAvailable<DominatorTree>();
+  DominatorTreeWrapperPass *DTWP =
+      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+  DT = DTWP ? &DTWP->getDomTree() : 0;
   currentLoop = L;
   Function *F = currentLoop->getHeader()->getParent();
   bool Changed = false;
@@ -397,7 +402,7 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
   if (Changed) {
     // FIXME: Reconstruct dom info, because it is not preserved properly.
     if (DT)
-      DT->runOnFunction(*F);
+      DT->recalculate(*F);
   }
   return Changed;
 }
@@ -934,9 +939,8 @@ static void ReplaceUsesOfWith(Instruction *I, Value *V,
       Worklist.push_back(Use);
 
   // Add users to the worklist which may be simplified now.
-  for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
-       UI != E; ++UI)
-    Worklist.push_back(cast<Instruction>(*UI));
+  for (User *U : I->users())
+    Worklist.push_back(cast<Instruction>(U));
   LPM->deleteSimpleAnalysisValue(I, L);
   RemoveFromWorklist(I, Worklist);
   I->replaceAllUsesWith(V);
@@ -986,12 +990,11 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
       Replacement = ConstantInt::get(Type::getInt1Ty(Val->getContext()),
                                      !cast<ConstantInt>(Val)->getZExtValue());
 
-    for (Value::use_iterator UI = LIC->use_begin(), E = LIC->use_end();
-         UI != E; ++UI) {
-      Instruction *U = dyn_cast<Instruction>(*UI);
-      if (!U || !L->contains(U))
+    for (User *U : LIC->users()) {
+      Instruction *UI = dyn_cast<Instruction>(U);
+      if (!UI || !L->contains(UI))
         continue;
-      Worklist.push_back(U);
+      Worklist.push_back(UI);
     }
 
     for (std::vector<Instruction*>::iterator UI = Worklist.begin(),
@@ -1005,19 +1008,18 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
   // Otherwise, we don't know the precise value of LIC, but we do know that it
   // is certainly NOT "Val".  As such, simplify any uses in the loop that we
   // can.  This case occurs when we unswitch switch statements.
-  for (Value::use_iterator UI = LIC->use_begin(), E = LIC->use_end();
-       UI != E; ++UI) {
-    Instruction *U = dyn_cast<Instruction>(*UI);
-    if (!U || !L->contains(U))
+  for (User *U : LIC->users()) {
+    Instruction *UI = dyn_cast<Instruction>(U);
+    if (!UI || !L->contains(UI))
       continue;
 
-    Worklist.push_back(U);
+    Worklist.push_back(UI);
 
     // TODO: We could do other simplifications, for example, turning
     // 'icmp eq LIC, Val' -> false.
 
     // If we know that LIC is not Val, use this info to simplify code.
-    SwitchInst *SI = dyn_cast<SwitchInst>(U);
+    SwitchInst *SI = dyn_cast<SwitchInst>(UI);
     if (SI == 0 || !isa<ConstantInt>(Val)) continue;
 
     SwitchInst::CaseIt DeadCase = SI->findCaseValue(cast<ConstantInt>(Val));
diff --git a/lib/Transforms/Scalar/LowerAtomic.cpp b/lib/Transforms/Scalar/LowerAtomic.cpp
index 8ced494..7c0a623 100644
--- a/lib/Transforms/Scalar/LowerAtomic.cpp
+++ b/lib/Transforms/Scalar/LowerAtomic.cpp
@@ -111,7 +111,9 @@ namespace {
     LowerAtomic() : BasicBlockPass(ID) {
       initializeLowerAtomicPass(*PassRegistry::getPassRegistry());
     }
-    bool runOnBasicBlock(BasicBlock &BB) {
+    bool runOnBasicBlock(BasicBlock &BB) override {
+      if (skipOptnoneFunction(BB))
+        return false;
       bool Changed = false;
       for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE; ) {
         Instruction *Inst = DI++;
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 9912d3d..2603c96 100644
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -17,16 +17,16 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/MemoryDependenceAnalysis.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/GetElementPtrTypeIterator.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -75,6 +75,13 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
                             const DataLayout &TD) {
   Ptr1 = Ptr1->stripPointerCasts();
   Ptr2 = Ptr2->stripPointerCasts();
+
+  // Handle the trivial case first.
+  if (Ptr1 == Ptr2) {
+    Offset = 0;
+    return true;
+  }
+
   GEPOperator *GEP1 = dyn_cast<GEPOperator>(Ptr1);
   GEPOperator *GEP2 = dyn_cast<GEPOperator>(Ptr2);
 
@@ -195,9 +202,9 @@ class MemsetRanges {
   /// because each element is relatively large and expensive to copy.
   std::list<MemsetRange> Ranges;
   typedef std::list<MemsetRange>::iterator range_iterator;
-  const DataLayout &TD;
+  const DataLayout &DL;
 public:
-  MemsetRanges(const DataLayout &td) : TD(td) {}
+  MemsetRanges(const DataLayout &DL) : DL(DL) {}
 
   typedef std::list<MemsetRange>::const_iterator const_iterator;
   const_iterator begin() const { return Ranges.begin(); }
@@ -212,7 +219,7 @@ public:
   }
 
   void addStore(int64_t OffsetFromFirst, StoreInst *SI) {
-    int64_t StoreSize = TD.getTypeStoreSize(SI->getOperand(0)->getType());
+    int64_t StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType());
 
     addRange(OffsetFromFirst, StoreSize,
              SI->getPointerOperand(), SI->getAlignment(), SI);
@@ -305,23 +312,23 @@ namespace {
   class MemCpyOpt : public FunctionPass {
     MemoryDependenceAnalysis *MD;
     TargetLibraryInfo *TLI;
-    const DataLayout *TD;
+    const DataLayout *DL;
   public:
     static char ID; // Pass identification, replacement for typeid
     MemCpyOpt() : FunctionPass(ID) {
       initializeMemCpyOptPass(*PassRegistry::getPassRegistry());
       MD = 0;
       TLI = 0;
-      TD = 0;
+      DL = 0;
     }
 
-    bool runOnFunction(Function &F);
+    bool runOnFunction(Function &F) override;
 
   private:
     // This transformation requires dominator postdominator info
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesCFG();
-      AU.addRequired<DominatorTree>();
+      AU.addRequired<DominatorTreeWrapperPass>();
       AU.addRequired<MemoryDependenceAnalysis>();
       AU.addRequired<AliasAnalysis>();
       AU.addRequired<TargetLibraryInfo>();
@@ -353,7 +360,7 @@ FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); }
 
 INITIALIZE_PASS_BEGIN(MemCpyOpt, "memcpyopt", "MemCpy Optimization",
                       false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
@@ -366,13 +373,13 @@ INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization",
 /// attempts to merge them together into a memcpy/memset.
 Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
                                              Value *StartPtr, Value *ByteVal) {
-  if (TD == 0) return 0;
+  if (DL == 0) return 0;
 
   // Okay, so we now have a single store that can be splatable.  Scan to find
   // all subsequent stores of the same value to offset from the same pointer.
   // Join these together into ranges, so we can decide whether contiguous blocks
   // are stored.
-  MemsetRanges Ranges(*TD);
+  MemsetRanges Ranges(*DL);
 
   BasicBlock::iterator BI = StartInst;
   for (++BI; !isa<TerminatorInst>(BI); ++BI) {
@@ -396,7 +403,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
       // Check to see if this store is to a constant offset from the start ptr.
       int64_t Offset;
       if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(),
-                           Offset, *TD))
+                           Offset, *DL))
         break;
 
       Ranges.addStore(Offset, NextStore);
@@ -409,7 +416,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
 
       // Check to see if this store is to a constant offset from the start ptr.
       int64_t Offset;
-      if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, *TD))
+      if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, *DL))
         break;
 
       Ranges.addMemSet(Offset, MSI);
@@ -441,7 +448,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
     if (Range.TheStores.size() == 1) continue;
 
     // If it is profitable to lower this range to memset, do so now.
-    if (!Range.isProfitableToUseMemset(*TD))
+    if (!Range.isProfitableToUseMemset(*DL))
       continue;
 
     // Otherwise, we do want to transform this!  Create a new memset.
@@ -453,7 +460,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
     if (Alignment == 0) {
       Type *EltType =
         cast<PointerType>(StartPtr->getType())->getElementType();
-      Alignment = TD->getABITypeAlignment(EltType);
+      Alignment = DL->getABITypeAlignment(EltType);
     }
 
     AMemSet =
@@ -484,7 +491,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
 bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
   if (!SI->isSimple()) return false;
 
-  if (TD == 0) return false;
+  if (DL == 0) return false;
 
   // Detect cases where we're performing call slot forwarding, but
   // happen to be using a load-store pair to implement it, rather than
@@ -514,15 +521,15 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
       if (C) {
         unsigned storeAlign = SI->getAlignment();
         if (!storeAlign)
-          storeAlign = TD->getABITypeAlignment(SI->getOperand(0)->getType());
+          storeAlign = DL->getABITypeAlignment(SI->getOperand(0)->getType());
         unsigned loadAlign = LI->getAlignment();
         if (!loadAlign)
-          loadAlign = TD->getABITypeAlignment(LI->getType());
+          loadAlign = DL->getABITypeAlignment(LI->getType());
 
         bool changed = performCallSlotOptzn(LI,
                         SI->getPointerOperand()->stripPointerCasts(),
                         LI->getPointerOperand()->stripPointerCasts(),
-                        TD->getTypeStoreSize(SI->getOperand(0)->getType()),
+                        DL->getTypeStoreSize(SI->getOperand(0)->getType()),
                         std::min(storeAlign, loadAlign), C);
         if (changed) {
           MD->removeInstruction(SI);
@@ -596,13 +603,13 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
     return false;
 
   // Check that all of src is copied to dest.
-  if (TD == 0) return false;
+  if (DL == 0) return false;
 
   ConstantInt *srcArraySize = dyn_cast<ConstantInt>(srcAlloca->getArraySize());
   if (!srcArraySize)
     return false;
 
-  uint64_t srcSize = TD->getTypeAllocSize(srcAlloca->getAllocatedType()) *
+  uint64_t srcSize = DL->getTypeAllocSize(srcAlloca->getAllocatedType()) *
     srcArraySize->getZExtValue();
 
   if (cpyLen < srcSize)
@@ -617,7 +624,7 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
     if (!destArraySize)
       return false;
 
-    uint64_t destSize = TD->getTypeAllocSize(A->getAllocatedType()) *
+    uint64_t destSize = DL->getTypeAllocSize(A->getAllocatedType()) *
       destArraySize->getZExtValue();
 
     if (destSize < srcSize)
@@ -636,7 +643,7 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
       return false;
     }
 
-    uint64_t destSize = TD->getTypeAllocSize(StructTy);
+    uint64_t destSize = DL->getTypeAllocSize(StructTy);
     if (destSize < srcSize)
       return false;
   } else {
@@ -646,7 +653,7 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
   // Check that dest points to memory that is at least as aligned as src.
   unsigned srcAlign = srcAlloca->getAlignment();
   if (!srcAlign)
-    srcAlign = TD->getABITypeAlignment(srcAlloca->getAllocatedType());
+    srcAlign = DL->getABITypeAlignment(srcAlloca->getAllocatedType());
   bool isDestSufficientlyAligned = srcAlign <= cpyAlign;
   // If dest is not aligned enough and we can't increase its alignment then
   // bail out.
@@ -657,30 +664,28 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
   // guarantees that it holds only undefined values when passed in (so the final
   // memcpy can be dropped), that it is not read or written between the call and
   // the memcpy, and that writing beyond the end of it is undefined.
-  SmallVector<User*, 8> srcUseList(srcAlloca->use_begin(),
-                                   srcAlloca->use_end());
+  SmallVector<User*, 8> srcUseList(srcAlloca->user_begin(),
+                                   srcAlloca->user_end());
   while (!srcUseList.empty()) {
-    User *UI = srcUseList.pop_back_val();
+    User *U = srcUseList.pop_back_val();
 
-    if (isa<BitCastInst>(UI)) {
-      for (User::use_iterator I = UI->use_begin(), E = UI->use_end();
-           I != E; ++I)
-        srcUseList.push_back(*I);
-    } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(UI)) {
+    if (isa<BitCastInst>(U) || isa<AddrSpaceCastInst>(U)) {
+      for (User *UU : U->users())
+        srcUseList.push_back(UU);
+    } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(U)) {
       if (G->hasAllZeroIndices())
-        for (User::use_iterator I = UI->use_begin(), E = UI->use_end();
-             I != E; ++I)
-          srcUseList.push_back(*I);
+        for (User *UU : U->users())
+          srcUseList.push_back(UU);
       else
         return false;
-    } else if (UI != C && UI != cpy) {
+    } else if (U != C && U != cpy) {
       return false;
     }
   }
 
   // Since we're changing the parameter to the callsite, we need to make sure
   // that what would be the new parameter dominates the callsite.
-  DominatorTree &DT = getAnalysis<DominatorTree>();
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   if (Instruction *cpyDestInst = dyn_cast<Instruction>(cpyDest))
     if (!DT.dominates(cpyDestInst, C))
       return false;
@@ -816,9 +821,8 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep,
 /// circumstances). This allows later passes to remove the first memcpy
 /// altogether.
 bool MemCpyOpt::processMemCpy(MemCpyInst *M) {
-  // We can only optimize statically-sized memcpy's that are non-volatile.
-  ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
-  if (CopySize == 0 || M->isVolatile()) return false;
+  // We can only optimize non-volatile memcpy's.
+  if (M->isVolatile()) return false;
 
   // If the source and destination of the memcpy are the same, then zap it.
   if (M->getSource() == M->getDest()) {
@@ -832,7 +836,7 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) {
     if (GV->isConstant() && GV->hasDefinitiveInitializer())
       if (Value *ByteVal = isBytewiseValue(GV->getInitializer())) {
         IRBuilder<> Builder(M);
-        Builder.CreateMemSet(M->getRawDest(), ByteVal, CopySize,
+        Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(),
                              M->getAlignment(), false);
         MD->removeInstruction(M);
         M->eraseFromParent();
@@ -840,9 +844,16 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) {
         return true;
       }
 
-  // The are two possible optimizations we can do for memcpy:
+  // The optimizations after this point require the memcpy size.
+  ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
+  if (CopySize == 0) return false;
+
+  // The are three possible optimizations we can do for memcpy:
   //   a) memcpy-memcpy xform which exposes redundance for DSE.
   //   b) call-memcpy xform for return slot optimization.
+  //   c) memcpy from freshly alloca'd space or space that has just started its
+  //      lifetime copies undefined data, and we can therefore eliminate the
+  //      memcpy in favor of the data that was already at the destination.
   MemDepResult DepInfo = MD->getDependency(M);
   if (DepInfo.isClobber()) {
     if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
@@ -862,6 +873,25 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) {
   if (SrcDepInfo.isClobber()) {
     if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
       return processMemCpyMemCpyDependence(M, MDep, CopySize->getZExtValue());
+  } else if (SrcDepInfo.isDef()) {
+    Instruction *I = SrcDepInfo.getInst();
+    bool hasUndefContents = false;
+
+    if (isa<AllocaInst>(I)) {
+      hasUndefContents = true;
+    } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+      if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+        if (ConstantInt *LTSize = dyn_cast<ConstantInt>(II->getArgOperand(0)))
+          if (LTSize->getZExtValue() >= CopySize->getZExtValue())
+            hasUndefContents = true;
+    }
+
+    if (hasUndefContents) {
+      MD->removeInstruction(M);
+      M->eraseFromParent();
+      ++NumMemCpyInstr;
+      return true;
+    }
   }
 
   return false;
@@ -899,12 +929,12 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) {
 
 /// processByValArgument - This is called on every byval argument in call sites.
 bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) {
-  if (TD == 0) return false;
+  if (DL == 0) return false;
 
   // Find out what feeds this byval argument.
   Value *ByValArg = CS.getArgument(ArgNo);
   Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType();
-  uint64_t ByValSize = TD->getTypeAllocSize(ByValTy);
+  uint64_t ByValSize = DL->getTypeAllocSize(ByValTy);
   MemDepResult DepInfo =
     MD->getPointerDependencyFrom(AliasAnalysis::Location(ByValArg, ByValSize),
                                  true, CS.getInstruction(),
@@ -933,7 +963,7 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) {
   // If it is greater than the memcpy, then we check to see if we can force the
   // source of the memcpy to the alignment we need.  If we fail, we bail out.
   if (MDep->getAlignment() < ByValAlign &&
-      getOrEnforceKnownAlignment(MDep->getSource(),ByValAlign, TD) < ByValAlign)
+      getOrEnforceKnownAlignment(MDep->getSource(),ByValAlign, DL) < ByValAlign)
     return false;
 
   // Verify that the copied-from memory doesn't change in between the memcpy and
@@ -1007,9 +1037,13 @@ bool MemCpyOpt::iterateOnFunction(Function &F) {
 // function.
 //
 bool MemCpyOpt::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
   bool MadeChange = false;
   MD = &getAnalysis<MemoryDependenceAnalysis>();
-  TD = getAnalysisIfAvailable<DataLayout>();
+  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+  DL = DLP ? &DLP->getDataLayout() : 0;
   TLI = &getAnalysis<TargetLibraryInfo>();
 
   // If we don't have at least memset and memcpy, there is little point of doing
diff --git a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
index 15cee44..2f19935 100644
--- a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
+++ b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
@@ -35,8 +35,8 @@ namespace {
       initializePartiallyInlineLibCallsPass(*PassRegistry::getPassRegistry());
     }
 
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const;
-    virtual bool runOnFunction(Function &F);
+    void getAnalysisUsage(AnalysisUsage &AU) const override;
+    bool runOnFunction(Function &F) override;
 
   private:
     /// Optimize calls to sqrt.
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index 328a9c5..b6b4d97 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -27,17 +27,16 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/Assembly/Writer.h"
+#include "llvm/IR/CFG.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/ValueHandle.h"
 #include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/ValueHandle.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <algorithm>
@@ -67,7 +66,7 @@ static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) {
        << *Ops[0].Op->getType() << '\t';
   for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
     dbgs() << "[ ";
-    WriteAsOperand(dbgs(), Ops[i].Op, false, M);
+    Ops[i].Op->printAsOperand(dbgs(), false, M);
     dbgs() << ", #" << Ops[i].Rank << "] ";
   }
 }
@@ -168,9 +167,9 @@ namespace {
       initializeReassociatePass(*PassRegistry::getPassRegistry());
     }
 
-    bool runOnFunction(Function &F);
+    bool runOnFunction(Function &F) override;
 
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesCFG();
     }
   private:
@@ -821,7 +820,7 @@ void Reassociate::RewriteExprTree(BinaryOperator *I,
       if (ExpressionChanged == I)
         break;
       ExpressionChanged->moveBefore(I);
-      ExpressionChanged = cast<BinaryOperator>(*ExpressionChanged->use_begin());
+      ExpressionChanged = cast<BinaryOperator>(*ExpressionChanged->user_begin());
     } while (1);
 
   // Throw away any left over nodes from the original expression.
@@ -863,8 +862,7 @@ static Value *NegateValue(Value *V, Instruction *BI) {
 
   // Okay, we need to materialize a negated version of V with an instruction.
   // Scan the use lists of V to see if we have one already.
-  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;++UI){
-    User *U = *UI;
+  for (User *U : V->users()) {
     if (!BinaryOperator::isNeg(U)) continue;
 
     // We found one!  Now we have to make sure that the definition dominates
@@ -914,8 +912,8 @@ static bool ShouldBreakUpSubtract(Instruction *Sub) {
       isReassociableOp(Sub->getOperand(1), Instruction::Sub))
     return true;
   if (Sub->hasOneUse() &&
-      (isReassociableOp(Sub->use_back(), Instruction::Add) ||
-       isReassociableOp(Sub->use_back(), Instruction::Sub)))
+      (isReassociableOp(Sub->user_back(), Instruction::Add) ||
+       isReassociableOp(Sub->user_back(), Instruction::Sub)))
     return true;
 
   return false;
@@ -1293,7 +1291,7 @@ Value *Reassociate::OptimizeXor(Instruction *I,
   //  the same symbolic value cluster together. For instance, the input operand
   //  sequence ("x | 123", "y & 456", "x & 789") will be sorted into:
   //  ("x | 123", "x & 789", "y & 456").
-  std::sort(OpndPtrs.begin(), OpndPtrs.end(), XorOpnd::PtrSortFunctor());
+  std::stable_sort(OpndPtrs.begin(), OpndPtrs.end(), XorOpnd::PtrSortFunctor());
 
   // Step 3: Combine adjacent operands
   XorOpnd *PrevOpnd = 0;
@@ -1548,19 +1546,6 @@ Value *Reassociate::OptimizeAdd(Instruction *I,
   return 0;
 }
 
-namespace {
-  /// \brief Predicate tests whether a ValueEntry's op is in a map.
-  struct IsValueInMap {
-    const DenseMap<Value *, unsigned> &Map;
-
-    IsValueInMap(const DenseMap<Value *, unsigned> &Map) : Map(Map) {}
-
-    bool operator()(const ValueEntry &Entry) {
-      return Map.find(Entry.Op) != Map.end();
-    }
-  };
-}
-
 /// \brief Build up a vector of value/power pairs factoring a product.
 ///
 /// Given a series of multiplication operands, build a vector of factors and
@@ -1619,7 +1604,7 @@ bool Reassociate::collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
   // below our mininum of '4'.
   assert(FactorPowerSum >= 4);
 
-  std::sort(Factors.begin(), Factors.end(), Factor::PowerDescendingSorter());
+  std::stable_sort(Factors.begin(), Factors.end(), Factor::PowerDescendingSorter());
   return true;
 }
 
@@ -1795,9 +1780,9 @@ void Reassociate::EraseInst(Instruction *I) {
       // If this is a node in an expression tree, climb to the expression root
       // and add that since that's where optimization actually happens.
       unsigned Opcode = Op->getOpcode();
-      while (Op->hasOneUse() && Op->use_back()->getOpcode() == Opcode &&
+      while (Op->hasOneUse() && Op->user_back()->getOpcode() == Opcode &&
              Visited.insert(Op))
-        Op = Op->use_back();
+        Op = Op->user_back();
       RedoInsts.insert(Op);
     }
 }
@@ -1815,8 +1800,8 @@ void Reassociate::OptimizeInst(Instruction *I) {
     // is used by a reassociable multiply or add, turn into a multiply.
     if (isReassociableOp(I->getOperand(0), Instruction::Mul) ||
         (I->hasOneUse() &&
-         (isReassociableOp(I->use_back(), Instruction::Mul) ||
-          isReassociableOp(I->use_back(), Instruction::Add)))) {
+         (isReassociableOp(I->user_back(), Instruction::Mul) ||
+          isReassociableOp(I->user_back(), Instruction::Add)))) {
       Instruction *NI = ConvertShiftToMul(I);
       RedoInsts.insert(I);
       MadeChange = true;
@@ -1869,7 +1854,7 @@ void Reassociate::OptimizeInst(Instruction *I) {
       // and if this is not an inner node of a multiply tree.
       if (isReassociableOp(I->getOperand(1), Instruction::Mul) &&
           (!I->hasOneUse() ||
-           !isReassociableOp(I->use_back(), Instruction::Mul))) {
+           !isReassociableOp(I->user_back(), Instruction::Mul))) {
         Instruction *NI = LowerNegateToMultiply(I);
         RedoInsts.insert(I);
         MadeChange = true;
@@ -1885,13 +1870,13 @@ void Reassociate::OptimizeInst(Instruction *I) {
   // If this is an interior node of a reassociable tree, ignore it until we
   // get to the root of the tree, to avoid N^2 analysis.
   unsigned Opcode = BO->getOpcode();
-  if (BO->hasOneUse() && BO->use_back()->getOpcode() == Opcode)
+  if (BO->hasOneUse() && BO->user_back()->getOpcode() == Opcode)
     return;
 
   // If this is an add tree that is used by a sub instruction, ignore it
   // until we process the subtract.
   if (BO->hasOneUse() && BO->getOpcode() == Instruction::Add &&
-      cast<Instruction>(BO->use_back())->getOpcode() == Instruction::Sub)
+      cast<Instruction>(BO->user_back())->getOpcode() == Instruction::Sub)
     return;
 
   ReassociateExpression(BO);
@@ -1943,7 +1928,7 @@ void Reassociate::ReassociateExpression(BinaryOperator *I) {
   // In this case we reassociate to put the negation on the outside so that we
   // can fold the negation into the add: (-X)*Y + Z -> Z-X*Y
   if (I->getOpcode() == Instruction::Mul && I->hasOneUse() &&
-      cast<Instruction>(I->use_back())->getOpcode() == Instruction::Add &&
+      cast<Instruction>(I->user_back())->getOpcode() == Instruction::Add &&
       isa<ConstantInt>(Ops.back().Op) &&
       cast<ConstantInt>(Ops.back().Op)->isAllOnesValue()) {
     ValueEntry Tmp = Ops.pop_back_val();
@@ -1972,6 +1957,9 @@ void Reassociate::ReassociateExpression(BinaryOperator *I) {
 }
 
 bool Reassociate::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
   // Calculate the rank map for F
   BuildRankMap(F);
 
diff --git a/lib/Transforms/Scalar/Reg2Mem.cpp b/lib/Transforms/Scalar/Reg2Mem.cpp
index 07f540a..d9809ce 100644
--- a/lib/Transforms/Scalar/Reg2Mem.cpp
+++ b/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -20,12 +20,12 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <list>
 using namespace llvm;
@@ -40,23 +40,22 @@ namespace {
       initializeRegToMemPass(*PassRegistry::getPassRegistry());
     }
 
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequiredID(BreakCriticalEdgesID);
       AU.addPreservedID(BreakCriticalEdgesID);
     }
 
-   bool valueEscapes(const Instruction *Inst) const {
-     const BasicBlock *BB = Inst->getParent();
-      for (Value::const_use_iterator UI = Inst->use_begin(),E = Inst->use_end();
-           UI != E; ++UI) {
-        const Instruction *I = cast<Instruction>(*UI);
-        if (I->getParent() != BB || isa<PHINode>(I))
+    bool valueEscapes(const Instruction *Inst) const {
+      const BasicBlock *BB = Inst->getParent();
+      for (const User *U : Inst->users()) {
+        const Instruction *UI = cast<Instruction>(U);
+        if (UI->getParent() != BB || isa<PHINode>(UI))
           return true;
       }
       return false;
     }
 
-    virtual bool runOnFunction(Function &F);
+    bool runOnFunction(Function &F) override;
   };
 }
 
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index 4364720..b8f10e9 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -26,13 +26,13 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InstVisitor.h"
 #include "llvm/IR/Instructions.h"
-#include "llvm/InstVisitor.h"
 #include "llvm/Pass.h"
-#include "llvm/Support/CallSite.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
@@ -153,7 +153,7 @@ namespace {
 /// Constant Propagation.
 ///
 class SCCPSolver : public InstVisitor<SCCPSolver> {
-  const DataLayout *TD;
+  const DataLayout *DL;
   const TargetLibraryInfo *TLI;
   SmallPtrSet<BasicBlock*, 8> BBExecutable; // The BBs that are executable.
   DenseMap<Value*, LatticeVal> ValueState;  // The state each value is in.
@@ -205,8 +205,8 @@ class SCCPSolver : public InstVisitor<SCCPSolver> {
   typedef std::pair<BasicBlock*, BasicBlock*> Edge;
   DenseSet<Edge> KnownFeasibleEdges;
 public:
-  SCCPSolver(const DataLayout *td, const TargetLibraryInfo *tli)
-    : TD(td), TLI(tli) {}
+  SCCPSolver(const DataLayout *DL, const TargetLibraryInfo *tli)
+    : DL(DL), TLI(tli) {}
 
   /// MarkBlockExecutable - This method can be used by clients to mark all of
   /// the blocks that are known to be intrinsically live in the processed unit.
@@ -491,7 +491,6 @@ private:
   }
   void visitCallSite      (CallSite CS);
   void visitResumeInst    (TerminatorInst &I) { /*returns void*/ }
-  void visitUnwindInst    (TerminatorInst &I) { /*returns void*/ }
   void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ }
   void visitFenceInst     (FenceInst &I) { /*returns void*/ }
   void visitAtomicCmpXchgInst (AtomicCmpXchgInst &I) { markOverdefined(&I); }
@@ -1067,7 +1066,7 @@ void SCCPSolver::visitLoadInst(LoadInst &I) {
   }
 
   // Transform load from a constant into a constant if possible.
-  if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, TD))
+  if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, DL))
     return markConstant(IV, &I, C);
 
   // Otherwise we cannot say for certain what value this load will produce.
@@ -1181,10 +1180,9 @@ void SCCPSolver::Solve() {
       // since all of its users will have already been marked as overdefined
       // Update all of the users of this instruction's value.
       //
-      for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
-           UI != E; ++UI)
-        if (Instruction *I = dyn_cast<Instruction>(*UI))
-          OperandChangedState(I);
+      for (User *U : I->users())
+        if (Instruction *UI = dyn_cast<Instruction>(U))
+          OperandChangedState(UI);
     }
 
     // Process the instruction work list.
@@ -1201,10 +1199,9 @@ void SCCPSolver::Solve() {
       // Update all of the users of this instruction's value.
       //
       if (I->getType()->isStructTy() || !getValueState(I).isOverdefined())
-        for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
-             UI != E; ++UI)
-          if (Instruction *I = dyn_cast<Instruction>(*UI))
-            OperandChangedState(I);
+        for (User *U : I->users())
+          if (Instruction *UI = dyn_cast<Instruction>(U))
+            OperandChangedState(UI);
     }
 
     // Process the basic block work list.
@@ -1499,7 +1496,7 @@ namespace {
   /// Sparse Conditional Constant Propagator.
   ///
   struct SCCP : public FunctionPass {
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<TargetLibraryInfo>();
     }
     static char ID; // Pass identification, replacement for typeid
@@ -1510,7 +1507,7 @@ namespace {
     // runOnFunction - Run the Sparse Conditional Constant Propagation
     // algorithm, and return true if the function was modified.
     //
-    bool runOnFunction(Function &F);
+    bool runOnFunction(Function &F) override;
   };
 } // end anonymous namespace
 
@@ -1553,10 +1550,14 @@ static void DeleteInstructionInBlock(BasicBlock *BB) {
 // and return true if the function was modified.
 //
 bool SCCP::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
   DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
-  const DataLayout *TD = getAnalysisIfAvailable<DataLayout>();
+  const DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+  const DataLayout *DL = DLP ? &DLP->getDataLayout() : 0;
   const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
-  SCCPSolver Solver(TD, TLI);
+  SCCPSolver Solver(DL, TLI);
 
   // Mark the first block of the function as being executable.
   Solver.MarkBlockExecutable(F.begin());
@@ -1628,14 +1629,14 @@ namespace {
   /// Constant Propagation.
   ///
   struct IPSCCP : public ModulePass {
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<TargetLibraryInfo>();
     }
     static char ID;
     IPSCCP() : ModulePass(ID) {
       initializeIPSCCPPass(*PassRegistry::getPassRegistry());
     }
-    bool runOnModule(Module &M);
+    bool runOnModule(Module &M) override;
   };
 } // end anonymous namespace
 
@@ -1658,21 +1659,20 @@ static bool AddressIsTaken(const GlobalValue *GV) {
   // Delete any dead constantexpr klingons.
   GV->removeDeadConstantUsers();
 
-  for (Value::const_use_iterator UI = GV->use_begin(), E = GV->use_end();
-       UI != E; ++UI) {
-    const User *U = *UI;
-    if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
+  for (const Use &U : GV->uses()) {
+    const User *UR = U.getUser();
+    if (const StoreInst *SI = dyn_cast<StoreInst>(UR)) {
       if (SI->getOperand(0) == GV || SI->isVolatile())
         return true;  // Storing addr of GV.
-    } else if (isa<InvokeInst>(U) || isa<CallInst>(U)) {
+    } else if (isa<InvokeInst>(UR) || isa<CallInst>(UR)) {
       // Make sure we are calling the function, not passing the address.
-      ImmutableCallSite CS(cast<Instruction>(U));
-      if (!CS.isCallee(UI))
+      ImmutableCallSite CS(cast<Instruction>(UR));
+      if (!CS.isCallee(&U))
         return true;
-    } else if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
+    } else if (const LoadInst *LI = dyn_cast<LoadInst>(UR)) {
       if (LI->isVolatile())
         return true;
-    } else if (isa<BlockAddress>(U)) {
+    } else if (isa<BlockAddress>(UR)) {
       // blockaddress doesn't take the address of the function, it takes addr
       // of label.
     } else {
@@ -1683,9 +1683,10 @@ static bool AddressIsTaken(const GlobalValue *GV) {
 }
 
 bool IPSCCP::runOnModule(Module &M) {
-  const DataLayout *TD = getAnalysisIfAvailable<DataLayout>();
+  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+  const DataLayout *DL = DLP ? &DLP->getDataLayout() : 0;
   const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
-  SCCPSolver Solver(TD, TLI);
+  SCCPSolver Solver(DL, TLI);
 
   // AddressTakenFunctions - This set keeps track of the address-taken functions
   // that are in the input.  As IPSCCP runs through and simplifies code,
@@ -1834,8 +1835,9 @@ bool IPSCCP::runOnModule(Module &M) {
     for (unsigned i = 0, e = BlocksToErase.size(); i != e; ++i) {
       // If there are any PHI nodes in this successor, drop entries for BB now.
       BasicBlock *DeadBB = BlocksToErase[i];
-      for (Value::use_iterator UI = DeadBB->use_begin(), UE = DeadBB->use_end();
-           UI != UE; ) {
+      for (Value::user_iterator UI = DeadBB->user_begin(),
+                                UE = DeadBB->user_end();
+           UI != UE;) {
         // Grab the user and then increment the iterator early, as the user
         // will be deleted. Step past all adjacent uses from the same user.
         Instruction *I = dyn_cast<Instruction>(*UI);
@@ -1925,7 +1927,7 @@ bool IPSCCP::runOnModule(Module &M) {
            "Overdefined values should have been taken out of the map!");
     DEBUG(dbgs() << "Found that GV '" << GV->getName() << "' is constant!\n");
     while (!GV->use_empty()) {
-      StoreInst *SI = cast<StoreInst>(GV->use_back());
+      StoreInst *SI = cast<StoreInst>(GV->user_back());
       SI->eraseFromParent();
     }
     M.getGlobalList().erase(GV);
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index 9f3fc83..ed5e618 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -29,32 +29,39 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/PtrUseVisitor.h"
 #include "llvm/Analysis/ValueTracking.h"
-#include "llvm/DIBuilder.h"
-#include "llvm/DebugInfo.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Operator.h"
-#include "llvm/InstVisitor.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TimeValue.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
+
+#if __cplusplus >= 201103L && !defined(NDEBUG)
+// We only use this for a debug check in C++11
+#include <random>
+#endif
+
 using namespace llvm;
 
 STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
@@ -73,6 +80,16 @@ STATISTIC(NumVectorized, "Number of vectorized aggregates");
 static cl::opt<bool>
 ForceSSAUpdater("force-ssa-updater", cl::init(false), cl::Hidden);
 
+/// Hidden option to enable randomly shuffling the slices to help uncover
+/// instability in their order.
+static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices",
+                                             cl::init(false), cl::Hidden);
+
+/// Hidden option to experiment with completely strict handling of inbounds
+/// GEPs.
+static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds",
+                                        cl::init(false), cl::Hidden);
+
 namespace {
 /// \brief A custom IRBuilder inserter which prefixes all names if they are
 /// preserved.
@@ -244,8 +261,8 @@ public:
   void printUse(raw_ostream &OS, const_iterator I,
                 StringRef Indent = "  ") const;
   void print(raw_ostream &OS) const;
-  void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump(const_iterator I) const;
-  void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump() const;
+  void dump(const_iterator I) const;
+  void dump() const;
 #endif
 
 private:
@@ -339,7 +356,7 @@ private:
                  bool IsSplittable = false) {
     // Completely skip uses which have a zero size or start either before or
     // past the end of the allocation.
-    if (Size == 0 || Offset.isNegative() || Offset.uge(AllocSize)) {
+    if (Size == 0 || Offset.uge(AllocSize)) {
       DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset
                    << " which has zero size or starts outside of the "
                    << AllocSize << " byte alloca:\n"
@@ -380,6 +397,43 @@ private:
     if (GEPI.use_empty())
       return markAsDead(GEPI);
 
+    if (SROAStrictInbounds && GEPI.isInBounds()) {
+      // FIXME: This is a manually un-factored variant of the basic code inside
+      // of GEPs with checking of the inbounds invariant specified in the
+      // langref in a very strict sense. If we ever want to enable
+      // SROAStrictInbounds, this code should be factored cleanly into
+      // PtrUseVisitor, but it is easier to experiment with SROAStrictInbounds
+      // by writing out the code here where we have tho underlying allocation
+      // size readily available.
+      APInt GEPOffset = Offset;
+      for (gep_type_iterator GTI = gep_type_begin(GEPI),
+                             GTE = gep_type_end(GEPI);
+           GTI != GTE; ++GTI) {
+        ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
+        if (!OpC)
+          break;
+
+        // Handle a struct index, which adds its field offset to the pointer.
+        if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+          unsigned ElementIdx = OpC->getZExtValue();
+          const StructLayout *SL = DL.getStructLayout(STy);
+          GEPOffset +=
+              APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx));
+        } else {
+          // For array or vector indices, scale the index by the size of the type.
+          APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth());
+          GEPOffset += Index * APInt(Offset.getBitWidth(),
+                                     DL.getTypeAllocSize(GTI.getIndexedType()));
+        }
+
+        // If this index has computed an intermediate pointer which is not
+        // inbounds, then the result of the GEP is a poison value and we can
+        // delete it and all uses.
+        if (GEPOffset.ugt(AllocSize))
+          return markAsDead(GEPI);
+      }
+    }
+
     return Base::visitGetElementPtrInst(GEPI);
   }
 
@@ -426,8 +480,7 @@ private:
     // risk of overflow.
     // FIXME: We should instead consider the pointer to have escaped if this
     // function is being instrumented for addressing bugs or race conditions.
-    if (Offset.isNegative() || Size > AllocSize ||
-        Offset.ugt(AllocSize - Size)) {
+    if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
       DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" << Offset
                    << " which extends past the end of the " << AllocSize
                    << " byte alloca:\n"
@@ -446,7 +499,7 @@ private:
     assert(II.getRawDest() == *U && "Pointer use is not the destination?");
     ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
     if ((Length && Length->getValue() == 0) ||
-        (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize)))
+        (IsOffsetKnown && Offset.uge(AllocSize)))
       // Zero-length mem transfer intrinsics can be ignored entirely.
       return markAsDead(II);
 
@@ -461,14 +514,30 @@ private:
 
   void visitMemTransferInst(MemTransferInst &II) {
     ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
-    if ((Length && Length->getValue() == 0) ||
-        (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize)))
+    if (Length && Length->getValue() == 0)
       // Zero-length mem transfer intrinsics can be ignored entirely.
       return markAsDead(II);
 
+    // Because we can visit these intrinsics twice, also check to see if the
+    // first time marked this instruction as dead. If so, skip it.
+    if (VisitedDeadInsts.count(&II))
+      return;
+
     if (!IsOffsetKnown)
       return PI.setAborted(&II);
 
+    // This side of the transfer is completely out-of-bounds, and so we can
+    // nuke the entire transfer. However, we also need to nuke the other side
+    // if already added to our partitions.
+    // FIXME: Yet another place we really should bypass this when
+    // instrumenting for ASan.
+    if (Offset.uge(AllocSize)) {
+      SmallDenseMap<Instruction *, unsigned>::iterator MTPI = MemTransferSliceMap.find(&II);
+      if (MTPI != MemTransferSliceMap.end())
+        S.Slices[MTPI->second].kill();
+      return markAsDead(II);
+    }
+
     uint64_t RawOffset = Offset.getLimitedValue();
     uint64_t Size = Length ? Length->getLimitedValue()
                            : AllocSize - RawOffset;
@@ -487,7 +556,7 @@ private:
     // they both point to the same alloca.
     bool Inserted;
     SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
-    llvm::tie(MTPI, Inserted) =
+    std::tie(MTPI, Inserted) =
         MemTransferSliceMap.insert(std::make_pair(&II, S.Slices.size()));
     unsigned PrevIdx = MTPI->second;
     if (!Inserted) {
@@ -546,7 +615,7 @@ private:
     Size = 0;
     do {
       Instruction *I, *UsedI;
-      llvm::tie(UsedI, I) = Uses.pop_back_val();
+      std::tie(UsedI, I) = Uses.pop_back_val();
 
       if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
         Size = std::max(Size, DL.getTypeStoreSize(LI->getType()));
@@ -568,10 +637,9 @@ private:
         return I;
       }
 
-      for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE;
-           ++UI)
-        if (Visited.insert(cast<Instruction>(*UI)))
-          Uses.push_back(std::make_pair(I, cast<Instruction>(*UI)));
+      for (User *U : I->users())
+        if (Visited.insert(cast<Instruction>(U)))
+          Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
     } while (!Uses.empty());
 
     return 0;
@@ -597,8 +665,7 @@ private:
     // themselves which should be replaced with undef.
     // FIXME: This should instead be escaped in the event we're instrumenting
     // for address sanitization.
-    if ((Offset.isNegative() && (-Offset).uge(PHISize)) ||
-        (!Offset.isNegative() && Offset.uge(AllocSize))) {
+    if (Offset.uge(AllocSize)) {
       S.DeadOperands.push_back(U);
       return;
     }
@@ -638,8 +705,7 @@ private:
     // themselves which should be replaced with undef.
     // FIXME: This should instead be escaped in the event we're instrumenting
     // for address sanitization.
-    if ((Offset.isNegative() && Offset.uge(SelectSize)) ||
-        (!Offset.isNegative() && Offset.uge(AllocSize))) {
+    if (Offset.uge(AllocSize)) {
       S.DeadOperands.push_back(U);
       return;
     }
@@ -674,6 +740,13 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
                               std::mem_fun_ref(&Slice::isDead)),
                Slices.end());
 
+#if __cplusplus >= 201103L && !defined(NDEBUG)
+  if (SROARandomShuffleSlices) {
+    std::mt19937 MT(static_cast<unsigned>(sys::TimeValue::now().msec()));
+    std::shuffle(Slices.begin(), Slices.end(), MT);
+  }
+#endif
+
   // Sort the uses. This arranges for the offsets to be in ascending order,
   // and the sizes to be in descending order.
   std::sort(Slices.begin(), Slices.end());
@@ -712,8 +785,10 @@ void AllocaSlices::print(raw_ostream &OS) const {
     print(OS, I);
 }
 
-void AllocaSlices::dump(const_iterator I) const { print(dbgs(), I); }
-void AllocaSlices::dump() const { print(dbgs()); }
+LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
+  print(dbgs(), I);
+}
+LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
 
 #endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 
@@ -741,12 +816,10 @@ public:
     // Retain the debug information attached to the alloca for use when
     // rewriting loads and stores.
     if (MDNode *DebugNode = MDNode::getIfExists(AI.getContext(), &AI)) {
-      for (Value::use_iterator UI = DebugNode->use_begin(),
-                               UE = DebugNode->use_end();
-           UI != UE; ++UI)
-        if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI))
+      for (User *U : DebugNode->users())
+        if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U))
           DDIs.push_back(DDI);
-        else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(*UI))
+        else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U))
           DVIs.push_back(DVI);
     }
 
@@ -760,8 +833,8 @@ public:
       DVIs.pop_back_val()->eraseFromParent();
   }
 
-  virtual bool isInstInList(Instruction *I,
-                            const SmallVectorImpl<Instruction*> &Insts) const {
+  bool isInstInList(Instruction *I,
+                    const SmallVectorImpl<Instruction*> &Insts) const override {
     Value *Ptr;
     if (LoadInst *LI = dyn_cast<LoadInst>(I))
       Ptr = LI->getOperand(0);
@@ -788,7 +861,7 @@ public:
     return false;
   }
 
-  virtual void updateDebugInfo(Instruction *Inst) const {
+  void updateDebugInfo(Instruction *Inst) const override {
     for (SmallVectorImpl<DbgDeclareInst *>::const_iterator I = DDIs.begin(),
            E = DDIs.end(); I != E; ++I) {
       DbgDeclareInst *DDI = *I;
@@ -899,10 +972,10 @@ public:
         C(0), DL(0), DT(0) {
     initializeSROAPass(*PassRegistry::getPassRegistry());
   }
-  bool runOnFunction(Function &F);
-  void getAnalysisUsage(AnalysisUsage &AU) const;
+  bool runOnFunction(Function &F) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
 
-  const char *getPassName() const { return "SROA"; }
+  const char *getPassName() const override { return "SROA"; }
   static char ID;
 
 private:
@@ -915,6 +988,7 @@ private:
                         ArrayRef<AllocaSlices::iterator> SplitUses);
   bool splitAlloca(AllocaInst &AI, AllocaSlices &S);
   bool runOnAlloca(AllocaInst &AI);
+  void clobberUse(Use &U);
   void deleteDeadInstructions(SmallPtrSet<AllocaInst *, 4> &DeletedAllocas);
   bool promoteAllocas(Function &F);
 };
@@ -928,7 +1002,7 @@ FunctionPass *llvm::createSROAPass(bool RequiresDomTree) {
 
 INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates",
                       false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates",
                     false, false)
 
@@ -938,7 +1012,11 @@ static Type *findCommonType(AllocaSlices::const_iterator B,
                             AllocaSlices::const_iterator E,
                             uint64_t EndOffset) {
   Type *Ty = 0;
-  bool IgnoreNonIntegralTypes = false;
+  bool TyIsCommon = true;
+  IntegerType *ITy = 0;
+
+  // Note that we need to look at *every* alloca slice's Use to ensure we
+  // always get consistent results regardless of the order of slices.
   for (AllocaSlices::const_iterator I = B; I != E; ++I) {
     Use *U = I->getUse();
     if (isa<IntrinsicInst>(*U->getUser()))
@@ -951,37 +1029,30 @@ static Type *findCommonType(AllocaSlices::const_iterator B,
       UserTy = LI->getType();
     } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
       UserTy = SI->getValueOperand()->getType();
-    } else {
-      IgnoreNonIntegralTypes = true; // Give up on anything but an iN type.
-      continue;
     }
 
-    if (IntegerType *ITy = dyn_cast<IntegerType>(UserTy)) {
+    if (!UserTy || (Ty && Ty != UserTy))
+      TyIsCommon = false; // Give up on anything but an iN type.
+    else
+      Ty = UserTy;
+
+    if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) {
       // If the type is larger than the partition, skip it. We only encounter
       // this for split integer operations where we want to use the type of the
       // entity causing the split. Also skip if the type is not a byte width
       // multiple.
-      if (ITy->getBitWidth() % 8 != 0 ||
-          ITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
+      if (UserITy->getBitWidth() % 8 != 0 ||
+          UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
         continue;
 
-      // If we have found an integer type use covering the alloca, use that
-      // regardless of the other types, as integers are often used for
-      // a "bucket of bits" type.
-      //
-      // NB: This *must* be the only return from inside the loop so that the
-      // order of slices doesn't impact the computed type.
-      return ITy;
-    } else if (IgnoreNonIntegralTypes) {
-      continue;
+      // Track the largest bitwidth integer type used in this way in case there
+      // is no common type.
+      if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
+        ITy = UserITy;
     }
-
-    if (Ty && Ty != UserTy)
-      IgnoreNonIntegralTypes = true; // Give up on anything but an iN type.
-
-    Ty = UserTy;
   }
-  return Ty;
+
+  return TyIsCommon ? Ty : ITy;
 }
 
 /// PHI instructions that use an alloca and are subsequently loaded can be
@@ -1011,9 +1082,8 @@ static bool isSafePHIToSpeculate(PHINode &PN,
   BasicBlock *BB = PN.getParent();
   unsigned MaxAlign = 0;
   bool HaveLoad = false;
-  for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end(); UI != UE;
-       ++UI) {
-    LoadInst *LI = dyn_cast<LoadInst>(*UI);
+  for (User *U : PN.users()) {
+    LoadInst *LI = dyn_cast<LoadInst>(U);
     if (LI == 0 || !LI->isSimple())
       return false;
 
@@ -1077,13 +1147,13 @@ static void speculatePHINodeLoads(PHINode &PN) {
 
   // Get the TBAA tag and alignment to use from one of the loads.  It doesn't
   // matter which one we get and if any differ.
-  LoadInst *SomeLoad = cast<LoadInst>(*PN.use_begin());
+  LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
   MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa);
   unsigned Align = SomeLoad->getAlignment();
 
   // Rewrite all loads of the PN to use the new PHI.
   while (!PN.use_empty()) {
-    LoadInst *LI = cast<LoadInst>(*PN.use_begin());
+    LoadInst *LI = cast<LoadInst>(PN.user_back());
     LI->replaceAllUsesWith(NewPN);
     LI->eraseFromParent();
   }
@@ -1127,9 +1197,8 @@ static bool isSafeSelectToSpeculate(SelectInst &SI, const DataLayout *DL = 0) {
   bool TDerefable = TValue->isDereferenceablePointer();
   bool FDerefable = FValue->isDereferenceablePointer();
 
-  for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end(); UI != UE;
-       ++UI) {
-    LoadInst *LI = dyn_cast<LoadInst>(*UI);
+  for (User *U : SI.users()) {
+    LoadInst *LI = dyn_cast<LoadInst>(U);
     if (LI == 0 || !LI->isSimple())
       return false;
 
@@ -1155,7 +1224,7 @@ static void speculateSelectInstLoads(SelectInst &SI) {
   Value *FV = SI.getFalseValue();
   // Replace the loads of the select with a select of two loads.
   while (!SI.use_empty()) {
-    LoadInst *LI = cast<LoadInst>(*SI.use_begin());
+    LoadInst *LI = cast<LoadInst>(SI.user_back());
     assert(LI->isSimple() && "We only speculate simple loads");
 
     IRB.SetInsertPoint(LI);
@@ -1188,7 +1257,7 @@ static void speculateSelectInstLoads(SelectInst &SI) {
 /// This will return the BasePtr if that is valid, or build a new GEP
 /// instruction using the IRBuilder if GEP-ing is needed.
 static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
-                       SmallVectorImpl<Value *> &Indices) {
+                       SmallVectorImpl<Value *> &Indices, Twine NamePrefix) {
   if (Indices.empty())
     return BasePtr;
 
@@ -1197,7 +1266,7 @@ static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
   if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero())
     return BasePtr;
 
-  return IRB.CreateInBoundsGEP(BasePtr, Indices, "idx");
+  return IRB.CreateInBoundsGEP(BasePtr, Indices, NamePrefix + "sroa_idx");
 }
 
 /// \brief Get a natural GEP off of the BasePtr walking through Ty toward
@@ -1211,9 +1280,13 @@ static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
 /// indicated by Indices to have the correct offset.
 static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
                                     Value *BasePtr, Type *Ty, Type *TargetTy,
-                                    SmallVectorImpl<Value *> &Indices) {
+                                    SmallVectorImpl<Value *> &Indices,
+                                    Twine NamePrefix) {
   if (Ty == TargetTy)
-    return buildGEP(IRB, BasePtr, Indices);
+    return buildGEP(IRB, BasePtr, Indices, NamePrefix);
+
+  // Pointer size to use for the indices.
+  unsigned PtrSize = DL.getPointerTypeSizeInBits(BasePtr->getType());
 
   // See if we can descend into a struct and locate a field with the correct
   // type.
@@ -1222,11 +1295,13 @@ static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
   do {
     if (ElementTy->isPointerTy())
       break;
-    if (SequentialType *SeqTy = dyn_cast<SequentialType>(ElementTy)) {
-      ElementTy = SeqTy->getElementType();
-      // Note that we use the default address space as this index is over an
-      // array or a vector, not a pointer.
-      Indices.push_back(IRB.getInt(APInt(DL.getPointerSizeInBits(0), 0)));
+
+    if (ArrayType *ArrayTy = dyn_cast<ArrayType>(ElementTy)) {
+      ElementTy = ArrayTy->getElementType();
+      Indices.push_back(IRB.getIntN(PtrSize, 0));
+    } else if (VectorType *VectorTy = dyn_cast<VectorType>(ElementTy)) {
+      ElementTy = VectorTy->getElementType();
+      Indices.push_back(IRB.getInt32(0));
     } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) {
       if (STy->element_begin() == STy->element_end())
         break; // Nothing left to descend into.
@@ -1240,7 +1315,7 @@ static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
   if (ElementTy != TargetTy)
     Indices.erase(Indices.end() - NumLayers, Indices.end());
 
-  return buildGEP(IRB, BasePtr, Indices);
+  return buildGEP(IRB, BasePtr, Indices, NamePrefix);
 }
 
 /// \brief Recursively compute indices for a natural GEP.
@@ -1250,9 +1325,10 @@ static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
 static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
                                        Value *Ptr, Type *Ty, APInt &Offset,
                                        Type *TargetTy,
-                                       SmallVectorImpl<Value *> &Indices) {
+                                       SmallVectorImpl<Value *> &Indices,
+                                       Twine NamePrefix) {
   if (Offset == 0)
-    return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices);
+    return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices, NamePrefix);
 
   // We can't recurse through pointer types.
   if (Ty->isPointerTy())
@@ -1272,7 +1348,7 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
     Offset -= NumSkippedElements * ElementSize;
     Indices.push_back(IRB.getInt(NumSkippedElements));
     return getNaturalGEPRecursively(IRB, DL, Ptr, VecTy->getElementType(),
-                                    Offset, TargetTy, Indices);
+                                    Offset, TargetTy, Indices, NamePrefix);
   }
 
   if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
@@ -1285,7 +1361,7 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
     Offset -= NumSkippedElements * ElementSize;
     Indices.push_back(IRB.getInt(NumSkippedElements));
     return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
-                                    Indices);
+                                    Indices, NamePrefix);
   }
 
   StructType *STy = dyn_cast<StructType>(Ty);
@@ -1304,7 +1380,7 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
 
   Indices.push_back(IRB.getInt32(Index));
   return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
-                                  Indices);
+                                  Indices, NamePrefix);
 }
 
 /// \brief Get a natural GEP from a base pointer to a particular offset and
@@ -1319,12 +1395,13 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
 /// If no natural GEP can be constructed, this function returns null.
 static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
                                       Value *Ptr, APInt Offset, Type *TargetTy,
-                                      SmallVectorImpl<Value *> &Indices) {
+                                      SmallVectorImpl<Value *> &Indices,
+                                      Twine NamePrefix) {
   PointerType *Ty = cast<PointerType>(Ptr->getType());
 
   // Don't consider any GEPs through an i8* as natural unless the TargetTy is
   // an i8.
-  if (Ty == IRB.getInt8PtrTy() && TargetTy->isIntegerTy(8))
+  if (Ty == IRB.getInt8PtrTy(Ty->getAddressSpace()) && TargetTy->isIntegerTy(8))
     return 0;
 
   Type *ElementTy = Ty->getElementType();
@@ -1338,7 +1415,7 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
   Offset -= NumSkippedElements * ElementSize;
   Indices.push_back(IRB.getInt(NumSkippedElements));
   return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
-                                  Indices);
+                                  Indices, NamePrefix);
 }
 
 /// \brief Compute an adjusted pointer from Ptr by Offset bytes where the
@@ -1356,8 +1433,9 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
 /// properties. The algorithm tries to fold as many constant indices into
 /// a single GEP as possible, thus making each GEP more independent of the
 /// surrounding code.
-static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL,
-                             Value *Ptr, APInt Offset, Type *PointerTy) {
+static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
+                             APInt Offset, Type *PointerTy,
+                             Twine NamePrefix) {
   // Even though we don't look through PHI nodes, we could be called on an
   // instruction in an unreachable block, which may be on a cycle.
   SmallPtrSet<Value *, 4> Visited;
@@ -1391,7 +1469,7 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL,
     // See if we can perform a natural GEP here.
     Indices.clear();
     if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy,
-                                           Indices)) {
+                                           Indices, NamePrefix)) {
       if (P->getType() == PointerTy) {
         // Zap any offset pointer that we ended up computing in previous rounds.
         if (OffsetPtr && OffsetPtr->use_empty())
@@ -1425,20 +1503,21 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL,
 
   if (!OffsetPtr) {
     if (!Int8Ptr) {
-      Int8Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy(),
-                                  "raw_cast");
+      Int8Ptr = IRB.CreateBitCast(
+          Ptr, IRB.getInt8PtrTy(PointerTy->getPointerAddressSpace()),
+          NamePrefix + "sroa_raw_cast");
       Int8PtrOffset = Offset;
     }
 
     OffsetPtr = Int8PtrOffset == 0 ? Int8Ptr :
       IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset),
-                            "raw_idx");
+                            NamePrefix + "sroa_raw_idx");
   }
   Ptr = OffsetPtr;
 
   // On the off chance we were targeting i8*, guard the bitcast here.
   if (Ptr->getType() != PointerTy)
-    Ptr = IRB.CreateBitCast(Ptr, PointerTy, "cast");
+    Ptr = IRB.CreateBitCast(Ptr, PointerTy, NamePrefix + "sroa_cast");
 
   return Ptr;
 }
@@ -1931,16 +2010,22 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
   // integer type will be stored here for easy access during rewriting.
   IntegerType *IntTy;
 
-  // The offset of the slice currently being rewritten.
+  // The original offset of the slice currently being rewritten relative to
+  // the original alloca.
   uint64_t BeginOffset, EndOffset;
+  // The new offsets of the slice currently being rewritten relative to the
+  // original alloca.
+  uint64_t NewBeginOffset, NewEndOffset;
+
+  uint64_t SliceSize;
   bool IsSplittable;
   bool IsSplit;
   Use *OldUse;
   Instruction *OldPtr;
 
-  // Output members carrying state about the result of visiting and rewriting
-  // the slice of the alloca.
-  bool IsUsedByRewrittenSpeculatableInstructions;
+  // Track post-rewrite users which are PHI nodes and Selects.
+  SmallPtrSetImpl<PHINode *> &PHIUsers;
+  SmallPtrSetImpl<SelectInst *> &SelectUsers;
 
   // Utility IR builder, whose name prefix is setup for each visited use, and
   // the insertion point is set to point to the user.
@@ -1949,11 +2034,14 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
 public:
   AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &S, SROA &Pass,
                       AllocaInst &OldAI, AllocaInst &NewAI,
-                      uint64_t NewBeginOffset, uint64_t NewEndOffset,
-                      bool IsVectorPromotable = false,
-                      bool IsIntegerPromotable = false)
+                      uint64_t NewAllocaBeginOffset,
+                      uint64_t NewAllocaEndOffset, bool IsVectorPromotable,
+                      bool IsIntegerPromotable,
+                      SmallPtrSetImpl<PHINode *> &PHIUsers,
+                      SmallPtrSetImpl<SelectInst *> &SelectUsers)
       : DL(DL), S(S), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
-        NewAllocaBeginOffset(NewBeginOffset), NewAllocaEndOffset(NewEndOffset),
+        NewAllocaBeginOffset(NewAllocaBeginOffset),
+        NewAllocaEndOffset(NewAllocaEndOffset),
         NewAllocaTy(NewAI.getAllocatedType()),
         VecTy(IsVectorPromotable ? cast<VectorType>(NewAllocaTy) : 0),
         ElementTy(VecTy ? VecTy->getElementType() : 0),
@@ -1964,7 +2052,7 @@ public:
                         DL.getTypeSizeInBits(NewAI.getAllocatedType()))
                   : 0),
         BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(),
-        OldPtr(), IsUsedByRewrittenSpeculatableInstructions(false),
+        OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers),
         IRB(NewAI.getContext(), ConstantFolder()) {
     if (VecTy) {
       assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 &&
@@ -1983,6 +2071,14 @@ public:
     IsSplit =
         BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
 
+    // Compute the intersecting offset range.
+    assert(BeginOffset < NewAllocaEndOffset);
+    assert(EndOffset > NewAllocaBeginOffset);
+    NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
+    NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
+
+    SliceSize = NewEndOffset - NewBeginOffset;
+
     OldUse = I->getUse();
     OldPtr = cast<Instruction>(OldUse->get());
 
@@ -1997,20 +2093,6 @@ public:
     return CanSROA;
   }
 
-  /// \brief Query whether this slice is used by speculatable instructions after
-  /// rewriting.
-  ///
-  /// These instructions (PHIs and Selects currently) require the alloca slice
-  /// to run back through the rewriter. Thus, they are promotable, but not on
-  /// this iteration. This is distinct from a slice which is unpromotable for
-  /// some other reason, in which case we don't even want to perform the
-  /// speculation. This can be querried at any time and reflects whether (at
-  /// that point) a visit call has rewritten a speculatable instruction on the
-  /// current slice.
-  bool isUsedByRewrittenSpeculatableInstructions() const {
-    return IsUsedByRewrittenSpeculatableInstructions;
-  }
-
 private:
   // Make sure the other visit overloads are visible.
   using Base::visit;
@@ -2021,30 +2103,53 @@ private:
     llvm_unreachable("No rewrite rule for this instruction!");
   }
 
-  Value *getAdjustedAllocaPtr(IRBuilderTy &IRB, uint64_t Offset,
-                              Type *PointerTy) {
-    assert(Offset >= NewAllocaBeginOffset);
-    return getAdjustedPtr(IRB, DL, &NewAI, APInt(DL.getPointerSizeInBits(),
-                                                 Offset - NewAllocaBeginOffset),
-                          PointerTy);
+  Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
+    // Note that the offset computation can use BeginOffset or NewBeginOffset
+    // interchangeably for unsplit slices.
+    assert(IsSplit || BeginOffset == NewBeginOffset);
+    uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+
+#ifndef NDEBUG
+    StringRef OldName = OldPtr->getName();
+    // Skip through the last '.sroa.' component of the name.
+    size_t LastSROAPrefix = OldName.rfind(".sroa.");
+    if (LastSROAPrefix != StringRef::npos) {
+      OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
+      // Look for an SROA slice index.
+      size_t IndexEnd = OldName.find_first_not_of("0123456789");
+      if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
+        // Strip the index and look for the offset.
+        OldName = OldName.substr(IndexEnd + 1);
+        size_t OffsetEnd = OldName.find_first_not_of("0123456789");
+        if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
+          // Strip the offset.
+          OldName = OldName.substr(OffsetEnd + 1);
+      }
+    }
+    // Strip any SROA suffixes as well.
+    OldName = OldName.substr(0, OldName.find(".sroa_"));
+#endif
+
+    return getAdjustedPtr(IRB, DL, &NewAI,
+                          APInt(DL.getPointerSizeInBits(), Offset), PointerTy,
+#ifndef NDEBUG
+                          Twine(OldName) + "."
+#else
+                          Twine()
+#endif
+                          );
   }
 
-  /// \brief Compute suitable alignment to access an offset into the new alloca.
-  unsigned getOffsetAlign(uint64_t Offset) {
+  /// \brief Compute suitable alignment to access this slice of the *new* alloca.
+  ///
+  /// You can optionally pass a type to this routine and if that type's ABI
+  /// alignment is itself suitable, this will return zero.
+  unsigned getSliceAlign(Type *Ty = 0) {
     unsigned NewAIAlign = NewAI.getAlignment();
     if (!NewAIAlign)
       NewAIAlign = DL.getABITypeAlignment(NewAI.getAllocatedType());
-    return MinAlign(NewAIAlign, Offset);
-  }
-
-  /// \brief Compute suitable alignment to access a type at an offset of the
-  /// new alloca.
-  ///
-  /// \returns zero if the type's ABI alignment is a suitable alignment,
-  /// otherwise returns the maximal suitable alignment.
-  unsigned getOffsetTypeAlign(Type *Ty, uint64_t Offset) {
-    unsigned Align = getOffsetAlign(Offset);
-    return Align == DL.getABITypeAlignment(Ty) ? 0 : Align;
+    unsigned Align = MinAlign(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset);
+    return (Ty && Align == DL.getABITypeAlignment(Ty)) ? 0 : Align;
   }
 
   unsigned getIndex(uint64_t Offset) {
@@ -2062,8 +2167,7 @@ private:
       Pass.DeadInsts.insert(I);
   }
 
-  Value *rewriteVectorizedLoadInst(uint64_t NewBeginOffset,
-                                   uint64_t NewEndOffset) {
+  Value *rewriteVectorizedLoadInst() {
     unsigned BeginIndex = getIndex(NewBeginOffset);
     unsigned EndIndex = getIndex(NewEndOffset);
     assert(EndIndex > BeginIndex && "Empty vector!");
@@ -2073,8 +2177,7 @@ private:
     return extractVector(IRB, V, BeginIndex, EndIndex, "vec");
   }
 
-  Value *rewriteIntegerLoad(LoadInst &LI, uint64_t NewBeginOffset,
-                            uint64_t NewEndOffset) {
+  Value *rewriteIntegerLoad(LoadInst &LI) {
     assert(IntTy && "We cannot insert an integer to the alloca");
     assert(!LI.isVolatile());
     Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
@@ -2093,32 +2196,23 @@ private:
     Value *OldOp = LI.getOperand(0);
     assert(OldOp == OldPtr);
 
-    // Compute the intersecting offset range.
-    assert(BeginOffset < NewAllocaEndOffset);
-    assert(EndOffset > NewAllocaBeginOffset);
-    uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
-    uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
-
-    uint64_t Size = NewEndOffset - NewBeginOffset;
-
-    Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), Size * 8)
+    Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
                              : LI.getType();
     bool IsPtrAdjusted = false;
     Value *V;
     if (VecTy) {
-      V = rewriteVectorizedLoadInst(NewBeginOffset, NewEndOffset);
+      V = rewriteVectorizedLoadInst();
     } else if (IntTy && LI.getType()->isIntegerTy()) {
-      V = rewriteIntegerLoad(LI, NewBeginOffset, NewEndOffset);
+      V = rewriteIntegerLoad(LI);
     } else if (NewBeginOffset == NewAllocaBeginOffset &&
                canConvertValue(DL, NewAllocaTy, LI.getType())) {
       V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
-                                LI.isVolatile(), "load");
+                                LI.isVolatile(), LI.getName());
     } else {
       Type *LTy = TargetTy->getPointerTo();
-      V = IRB.CreateAlignedLoad(
-          getAdjustedAllocaPtr(IRB, NewBeginOffset, LTy),
-          getOffsetTypeAlign(TargetTy, NewBeginOffset - NewAllocaBeginOffset),
-          LI.isVolatile(), "load");
+      V = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy),
+                                getSliceAlign(TargetTy), LI.isVolatile(),
+                                LI.getName());
       IsPtrAdjusted = true;
     }
     V = convertValue(DL, IRB, V, TargetTy);
@@ -2127,13 +2221,13 @@ private:
       assert(!LI.isVolatile());
       assert(LI.getType()->isIntegerTy() &&
              "Only integer type loads and stores are split");
-      assert(Size < DL.getTypeStoreSize(LI.getType()) &&
+      assert(SliceSize < DL.getTypeStoreSize(LI.getType()) &&
              "Split load isn't smaller than original load");
       assert(LI.getType()->getIntegerBitWidth() ==
              DL.getTypeStoreSizeInBits(LI.getType()) &&
              "Non-byte-multiple bit width");
       // Move the insertion point just past the load so that we can refer to it.
-      IRB.SetInsertPoint(llvm::next(BasicBlock::iterator(&LI)));
+      IRB.SetInsertPoint(std::next(BasicBlock::iterator(&LI)));
       // Create a placeholder value with the same type as LI to use as the
       // basis for the new value. This allows us to replace the uses of LI with
       // the computed value, and then replace the placeholder with LI, leaving
@@ -2155,9 +2249,7 @@ private:
     return !LI.isVolatile() && !IsPtrAdjusted;
   }
 
-  bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
-                                  uint64_t NewBeginOffset,
-                                  uint64_t NewEndOffset) {
+  bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp) {
     if (V->getType() != VecTy) {
       unsigned BeginIndex = getIndex(NewBeginOffset);
       unsigned EndIndex = getIndex(NewEndOffset);
@@ -2183,8 +2275,7 @@ private:
     return true;
   }
 
-  bool rewriteIntegerStore(Value *V, StoreInst &SI,
-                           uint64_t NewBeginOffset, uint64_t NewEndOffset) {
+  bool rewriteIntegerStore(Value *V, StoreInst &SI) {
     assert(IntTy && "We cannot extract an integer from the alloca");
     assert(!SI.isVolatile());
     if (DL.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) {
@@ -2217,30 +2308,22 @@ private:
       if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
         Pass.PostPromotionWorklist.insert(AI);
 
-    // Compute the intersecting offset range.
-    assert(BeginOffset < NewAllocaEndOffset);
-    assert(EndOffset > NewAllocaBeginOffset);
-    uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
-    uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
-
-    uint64_t Size = NewEndOffset - NewBeginOffset;
-    if (Size < DL.getTypeStoreSize(V->getType())) {
+    if (SliceSize < DL.getTypeStoreSize(V->getType())) {
       assert(!SI.isVolatile());
       assert(V->getType()->isIntegerTy() &&
              "Only integer type loads and stores are split");
       assert(V->getType()->getIntegerBitWidth() ==
              DL.getTypeStoreSizeInBits(V->getType()) &&
              "Non-byte-multiple bit width");
-      IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), Size * 8);
+      IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
       V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset,
                          "extract");
     }
 
     if (VecTy)
-      return rewriteVectorizedStoreInst(V, SI, OldOp, NewBeginOffset,
-                                        NewEndOffset);
+      return rewriteVectorizedStoreInst(V, SI, OldOp);
     if (IntTy && V->getType()->isIntegerTy())
-      return rewriteIntegerStore(V, SI, NewBeginOffset, NewEndOffset);
+      return rewriteIntegerStore(V, SI);
 
     StoreInst *NewSI;
     if (NewBeginOffset == NewAllocaBeginOffset &&
@@ -2250,12 +2333,9 @@ private:
       NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
                                      SI.isVolatile());
     } else {
-      Value *NewPtr = getAdjustedAllocaPtr(IRB, NewBeginOffset,
-                                           V->getType()->getPointerTo());
-      NewSI = IRB.CreateAlignedStore(
-          V, NewPtr, getOffsetTypeAlign(
-                         V->getType(), NewBeginOffset - NewAllocaBeginOffset),
-          SI.isVolatile());
+      Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo());
+      NewSI = IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(V->getType()),
+                                     SI.isVolatile());
     }
     (void)NewSI;
     Pass.DeadInsts.insert(&SI);
@@ -2307,11 +2387,10 @@ private:
     // pointer to the new alloca.
     if (!isa<Constant>(II.getLength())) {
       assert(!IsSplit);
-      assert(BeginOffset >= NewAllocaBeginOffset);
-      II.setDest(
-          getAdjustedAllocaPtr(IRB, BeginOffset, II.getRawDest()->getType()));
+      assert(NewBeginOffset == BeginOffset);
+      II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
       Type *CstTy = II.getAlignmentCst()->getType();
-      II.setAlignment(ConstantInt::get(CstTy, getOffsetAlign(BeginOffset)));
+      II.setAlignment(ConstantInt::get(CstTy, getSliceAlign()));
 
       deleteIfTriviallyDead(OldPtr);
       return false;
@@ -2323,13 +2402,6 @@ private:
     Type *AllocaTy = NewAI.getAllocatedType();
     Type *ScalarTy = AllocaTy->getScalarType();
 
-    // Compute the intersecting offset range.
-    assert(BeginOffset < NewAllocaEndOffset);
-    assert(EndOffset > NewAllocaBeginOffset);
-    uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
-    uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
-    uint64_t SliceOffset = NewBeginOffset - NewAllocaBeginOffset;
-
     // If this doesn't map cleanly onto the alloca type, and that type isn't
     // a single value type, just emit a memset.
     if (!VecTy && !IntTy &&
@@ -2341,8 +2413,8 @@ private:
       Type *SizeTy = II.getLength()->getType();
       Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
       CallInst *New = IRB.CreateMemSet(
-          getAdjustedAllocaPtr(IRB, NewBeginOffset, II.getRawDest()->getType()),
-          II.getValue(), Size, getOffsetAlign(SliceOffset), II.isVolatile());
+          getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
+          getSliceAlign(), II.isVolatile());
       (void)New;
       DEBUG(dbgs() << "          to: " << *New << "\n");
       return false;
@@ -2419,25 +2491,11 @@ private:
 
     DEBUG(dbgs() << "    original: " << II << "\n");
 
-    // Compute the intersecting offset range.
-    assert(BeginOffset < NewAllocaEndOffset);
-    assert(EndOffset > NewAllocaBeginOffset);
-    uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
-    uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
-
-    assert(II.getRawSource() == OldPtr || II.getRawDest() == OldPtr);
-    bool IsDest = II.getRawDest() == OldPtr;
-
-    // Compute the relative offset within the transfer.
-    unsigned IntPtrWidth = DL.getPointerSizeInBits();
-    APInt RelOffset(IntPtrWidth, NewBeginOffset - BeginOffset);
+    bool IsDest = &II.getRawDestUse() == OldUse;
+    assert((IsDest && II.getRawDest() == OldPtr) ||
+           (!IsDest && II.getRawSource() == OldPtr));
 
-    unsigned Align = II.getAlignment();
-    uint64_t SliceOffset = NewBeginOffset - NewAllocaBeginOffset;
-    if (Align > 1)
-      Align =
-          MinAlign(RelOffset.zextOrTrunc(64).getZExtValue(),
-                   MinAlign(II.getAlignment(), getOffsetAlign(SliceOffset)));
+    unsigned SliceAlign = getSliceAlign();
 
     // For unsplit intrinsics, we simply modify the source and destination
     // pointers in place. This isn't just an optimization, it is a matter of
@@ -2447,19 +2505,20 @@ private:
     // memcpy, and so simply updating the pointers is the necessary for us to
     // update both source and dest of a single call.
     if (!IsSplittable) {
-      Value *OldOp = IsDest ? II.getRawDest() : II.getRawSource();
+      Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
       if (IsDest)
-        II.setDest(
-            getAdjustedAllocaPtr(IRB, BeginOffset, II.getRawDest()->getType()));
+        II.setDest(AdjustedPtr);
       else
-        II.setSource(getAdjustedAllocaPtr(IRB, BeginOffset,
-                                          II.getRawSource()->getType()));
+        II.setSource(AdjustedPtr);
 
-      Type *CstTy = II.getAlignmentCst()->getType();
-      II.setAlignment(ConstantInt::get(CstTy, Align));
+      if (II.getAlignment() > SliceAlign) {
+        Type *CstTy = II.getAlignmentCst()->getType();
+        II.setAlignment(
+            ConstantInt::get(CstTy, MinAlign(II.getAlignment(), SliceAlign)));
+      }
 
       DEBUG(dbgs() << "          to: " << II << "\n");
-      deleteIfTriviallyDead(OldOp);
+      deleteIfTriviallyDead(OldPtr);
       return false;
     }
     // For split transfer intrinsics we have an incredibly useful assurance:
@@ -2495,37 +2554,39 @@ private:
     // alloca that should be re-examined after rewriting this instruction.
     Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
     if (AllocaInst *AI
-          = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets()))
+          = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
+      assert(AI != &OldAI && AI != &NewAI &&
+             "Splittable transfers cannot reach the same alloca on both ends.");
       Pass.Worklist.insert(AI);
+    }
 
-    if (EmitMemCpy) {
-      Type *OtherPtrTy = IsDest ? II.getRawSource()->getType()
-                                : II.getRawDest()->getType();
+    Type *OtherPtrTy = OtherPtr->getType();
+    unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
+
+    // Compute the relative offset for the other pointer within the transfer.
+    unsigned IntPtrWidth = DL.getPointerSizeInBits(OtherAS);
+    APInt OtherOffset(IntPtrWidth, NewBeginOffset - BeginOffset);
+    unsigned OtherAlign = MinAlign(II.getAlignment() ? II.getAlignment() : 1,
+                                   OtherOffset.zextOrTrunc(64).getZExtValue());
 
+    if (EmitMemCpy) {
       // Compute the other pointer, folding as much as possible to produce
       // a single, simple GEP in most cases.
-      OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, RelOffset, OtherPtrTy);
+      OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
+                                OtherPtr->getName() + ".");
 
-      Value *OurPtr = getAdjustedAllocaPtr(
-          IRB, NewBeginOffset,
-          IsDest ? II.getRawDest()->getType() : II.getRawSource()->getType());
+      Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
       Type *SizeTy = II.getLength()->getType();
       Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
 
-      CallInst *New = IRB.CreateMemCpy(IsDest ? OurPtr : OtherPtr,
-                                       IsDest ? OtherPtr : OurPtr,
-                                       Size, Align, II.isVolatile());
+      CallInst *New = IRB.CreateMemCpy(
+          IsDest ? OurPtr : OtherPtr, IsDest ? OtherPtr : OurPtr, Size,
+          MinAlign(SliceAlign, OtherAlign), II.isVolatile());
       (void)New;
       DEBUG(dbgs() << "          to: " << *New << "\n");
       return false;
     }
 
-    // Note that we clamp the alignment to 1 here as a 0 alignment for a memcpy
-    // is equivalent to 1, but that isn't true if we end up rewriting this as
-    // a load or store.
-    if (!Align)
-      Align = 1;
-
     bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
                          NewEndOffset == NewAllocaEndOffset;
     uint64_t Size = NewEndOffset - NewBeginOffset;
@@ -2535,22 +2596,30 @@ private:
     IntegerType *SubIntTy
       = IntTy ? Type::getIntNTy(IntTy->getContext(), Size*8) : 0;
 
-    Type *OtherPtrTy = NewAI.getType();
+    // Reset the other pointer type to match the register type we're going to
+    // use, but using the address space of the original other pointer.
     if (VecTy && !IsWholeAlloca) {
       if (NumElements == 1)
         OtherPtrTy = VecTy->getElementType();
       else
         OtherPtrTy = VectorType::get(VecTy->getElementType(), NumElements);
 
-      OtherPtrTy = OtherPtrTy->getPointerTo();
+      OtherPtrTy = OtherPtrTy->getPointerTo(OtherAS);
     } else if (IntTy && !IsWholeAlloca) {
-      OtherPtrTy = SubIntTy->getPointerTo();
+      OtherPtrTy = SubIntTy->getPointerTo(OtherAS);
+    } else {
+      OtherPtrTy = NewAllocaTy->getPointerTo(OtherAS);
     }
 
-    Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, RelOffset, OtherPtrTy);
+    Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
+                                   OtherPtr->getName() + ".");
+    unsigned SrcAlign = OtherAlign;
     Value *DstPtr = &NewAI;
-    if (!IsDest)
+    unsigned DstAlign = SliceAlign;
+    if (!IsDest) {
       std::swap(SrcPtr, DstPtr);
+      std::swap(SrcAlign, DstAlign);
+    }
 
     Value *Src;
     if (VecTy && !IsWholeAlloca && !IsDest) {
@@ -2564,7 +2633,7 @@ private:
       uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
       Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
     } else {
-      Src = IRB.CreateAlignedLoad(SrcPtr, Align, II.isVolatile(),
+      Src = IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(),
                                   "copyload");
     }
 
@@ -2582,7 +2651,7 @@ private:
     }
 
     StoreInst *Store = cast<StoreInst>(
-      IRB.CreateAlignedStore(Src, DstPtr, Align, II.isVolatile()));
+        IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
     (void)Store;
     DEBUG(dbgs() << "          to: " << *Store << "\n");
     return !II.isVolatile();
@@ -2594,20 +2663,13 @@ private:
     DEBUG(dbgs() << "    original: " << II << "\n");
     assert(II.getArgOperand(1) == OldPtr);
 
-    // Compute the intersecting offset range.
-    assert(BeginOffset < NewAllocaEndOffset);
-    assert(EndOffset > NewAllocaBeginOffset);
-    uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
-    uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
-
     // Record this instruction for deletion.
     Pass.DeadInsts.insert(&II);
 
     ConstantInt *Size
       = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
                          NewEndOffset - NewBeginOffset);
-    Value *Ptr =
-        getAdjustedAllocaPtr(IRB, NewBeginOffset, II.getArgOperand(1)->getType());
+    Value *Ptr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
     Value *New;
     if (II.getIntrinsicID() == Intrinsic::lifetime_start)
       New = IRB.CreateLifetimeStart(Ptr, Size);
@@ -2628,28 +2690,22 @@ private:
     // as local as possible to the PHI. To do that, we re-use the location of
     // the old pointer, which necessarily must be in the right position to
     // dominate the PHI.
-    IRBuilderTy PtrBuilder(OldPtr);
-    PtrBuilder.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) +
-                             ".");
+    IRBuilderTy PtrBuilder(IRB);
+    PtrBuilder.SetInsertPoint(OldPtr);
+    PtrBuilder.SetCurrentDebugLocation(OldPtr->getDebugLoc());
 
-    Value *NewPtr =
-        getAdjustedAllocaPtr(PtrBuilder, BeginOffset, OldPtr->getType());
+    Value *NewPtr = getNewAllocaSlicePtr(PtrBuilder, OldPtr->getType());
     // Replace the operands which were using the old pointer.
     std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
 
     DEBUG(dbgs() << "          to: " << PN << "\n");
     deleteIfTriviallyDead(OldPtr);
 
-    // Check whether we can speculate this PHI node, and if so remember that
-    // fact and queue it up for another iteration after the speculation
-    // occurs.
-    if (isSafePHIToSpeculate(PN, &DL)) {
-      Pass.SpeculatablePHIs.insert(&PN);
-      IsUsedByRewrittenSpeculatableInstructions = true;
-      return true;
-    }
-
-    return false; // PHIs can't be promoted on their own.
+    // PHIs can't be promoted on their own, but often can be speculated. We
+    // check the speculation outside of the rewriter so that we see the
+    // fully-rewritten alloca.
+    PHIUsers.insert(&PN);
+    return true;
   }
 
   bool visitSelectInst(SelectInst &SI) {
@@ -2659,7 +2715,7 @@ private:
     assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
     assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
 
-    Value *NewPtr = getAdjustedAllocaPtr(IRB, BeginOffset, OldPtr->getType());
+    Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
     // Replace the operands which were using the old pointer.
     if (SI.getOperand(1) == OldPtr)
       SI.setOperand(1, NewPtr);
@@ -2669,16 +2725,11 @@ private:
     DEBUG(dbgs() << "          to: " << SI << "\n");
     deleteIfTriviallyDead(OldPtr);
 
-    // Check whether we can speculate this select instruction, and if so
-    // remember that fact and queue it up for another iteration after the
-    // speculation occurs.
-    if (isSafeSelectToSpeculate(SI, &DL)) {
-      Pass.SpeculatableSelects.insert(&SI);
-      IsUsedByRewrittenSpeculatableInstructions = true;
-      return true;
-    }
-
-    return false; // Selects can't be promoted on their own.
+    // Selects can't be promoted on their own, but often can be speculated. We
+    // check the speculation outside of the rewriter so that we see the
+    // fully-rewritten alloca.
+    SelectUsers.insert(&SI);
+    return true;
   }
 
 };
@@ -2726,10 +2777,9 @@ private:
   /// Enqueue all the users of the given instruction for further processing.
   /// This uses a set to de-duplicate users.
   void enqueueUsers(Instruction &I) {
-    for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;
-         ++UI)
-      if (Visited.insert(*UI))
-        Queue.push_back(&UI.getUse());
+    for (Use &U : I.uses())
+      if (Visited.insert(U.getUser()))
+        Queue.push_back(&U);
   }
 
   // Conservative default is to not rewrite anything.
@@ -3114,17 +3164,17 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S,
                << "[" << BeginOffset << "," << EndOffset << ") to: " << *NewAI
                << "\n");
 
-  // Track the high watermark on several worklists that are only relevant for
+  // Track the high watermark on the worklist as it is only relevant for
   // promoted allocas. We will reset it to this point if the alloca is not in
   // fact scheduled for promotion.
   unsigned PPWOldSize = PostPromotionWorklist.size();
-  unsigned SPOldSize = SpeculatablePHIs.size();
-  unsigned SSOldSize = SpeculatableSelects.size();
   unsigned NumUses = 0;
+  SmallPtrSet<PHINode *, 8> PHIUsers;
+  SmallPtrSet<SelectInst *, 8> SelectUsers;
 
   AllocaSliceRewriter Rewriter(*DL, S, *this, AI, *NewAI, BeginOffset,
                                EndOffset, IsVectorPromotable,
-                               IsIntegerPromotable);
+                               IsIntegerPromotable, PHIUsers, SelectUsers);
   bool Promotable = true;
   for (ArrayRef<AllocaSlices::iterator>::const_iterator SUI = SplitUses.begin(),
                                                         SUE = SplitUses.end();
@@ -3145,50 +3195,60 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S,
   MaxUsesPerAllocaPartition =
       std::max<unsigned>(NumUses, MaxUsesPerAllocaPartition);
 
-  if (Promotable && !Rewriter.isUsedByRewrittenSpeculatableInstructions()) {
-    DEBUG(dbgs() << "  and queuing for promotion\n");
-    PromotableAllocas.push_back(NewAI);
-  } else if (NewAI != &AI ||
-             (Promotable &&
-              Rewriter.isUsedByRewrittenSpeculatableInstructions())) {
+  // Now that we've processed all the slices in the new partition, check if any
+  // PHIs or Selects would block promotion.
+  for (SmallPtrSetImpl<PHINode *>::iterator I = PHIUsers.begin(),
+                                            E = PHIUsers.end();
+       I != E; ++I)
+    if (!isSafePHIToSpeculate(**I, DL)) {
+      Promotable = false;
+      PHIUsers.clear();
+      SelectUsers.clear();
+      break;
+    }
+  for (SmallPtrSetImpl<SelectInst *>::iterator I = SelectUsers.begin(),
+                                               E = SelectUsers.end();
+       I != E; ++I)
+    if (!isSafeSelectToSpeculate(**I, DL)) {
+      Promotable = false;
+      PHIUsers.clear();
+      SelectUsers.clear();
+      break;
+    }
+
+  if (Promotable) {
+    if (PHIUsers.empty() && SelectUsers.empty()) {
+      // Promote the alloca.
+      PromotableAllocas.push_back(NewAI);
+    } else {
+      // If we have either PHIs or Selects to speculate, add them to those
+      // worklists and re-queue the new alloca so that we promote in on the
+      // next iteration.
+      for (SmallPtrSetImpl<PHINode *>::iterator I = PHIUsers.begin(),
+                                                E = PHIUsers.end();
+           I != E; ++I)
+        SpeculatablePHIs.insert(*I);
+      for (SmallPtrSetImpl<SelectInst *>::iterator I = SelectUsers.begin(),
+                                                   E = SelectUsers.end();
+           I != E; ++I)
+        SpeculatableSelects.insert(*I);
+      Worklist.insert(NewAI);
+    }
+  } else {
     // If we can't promote the alloca, iterate on it to check for new
     // refinements exposed by splitting the current alloca. Don't iterate on an
     // alloca which didn't actually change and didn't get promoted.
-    //
-    // Alternatively, if we could promote the alloca but have speculatable
-    // instructions then we will speculate them after finishing our processing
-    // of the original alloca. Mark the new one for re-visiting in the next
-    // iteration so the speculated operations can be rewritten.
-    //
-    // FIXME: We should actually track whether the rewriter changed anything.
-    Worklist.insert(NewAI);
-  }
-
-  // Drop any post-promotion work items if promotion didn't happen.
-  if (!Promotable) {
+    if (NewAI != &AI)
+      Worklist.insert(NewAI);
+
+    // Drop any post-promotion work items if promotion didn't happen.
     while (PostPromotionWorklist.size() > PPWOldSize)
       PostPromotionWorklist.pop_back();
-    while (SpeculatablePHIs.size() > SPOldSize)
-      SpeculatablePHIs.pop_back();
-    while (SpeculatableSelects.size() > SSOldSize)
-      SpeculatableSelects.pop_back();
   }
 
   return true;
 }
 
-namespace {
-struct IsSliceEndLessOrEqualTo {
-  uint64_t UpperBound;
-
-  IsSliceEndLessOrEqualTo(uint64_t UpperBound) : UpperBound(UpperBound) {}
-
-  bool operator()(const AllocaSlices::iterator &I) {
-    return I->endOffset() <= UpperBound;
-  }
-};
-}
-
 static void
 removeFinishedSplitUses(SmallVectorImpl<AllocaSlices::iterator> &SplitUses,
                         uint64_t &MaxSplitUseEndOffset, uint64_t Offset) {
@@ -3200,7 +3260,9 @@ removeFinishedSplitUses(SmallVectorImpl<AllocaSlices::iterator> &SplitUses,
 
   size_t SplitUsesOldSize = SplitUses.size();
   SplitUses.erase(std::remove_if(SplitUses.begin(), SplitUses.end(),
-                                 IsSliceEndLessOrEqualTo(Offset)),
+                                 [Offset](const AllocaSlices::iterator &I) {
+                    return I->endOffset() <= Offset;
+                  }),
                   SplitUses.end());
   if (SplitUsesOldSize == SplitUses.size())
     return;
@@ -3227,7 +3289,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &S) {
 
   uint64_t BeginOffset = S.begin()->beginOffset();
 
-  for (AllocaSlices::iterator SI = S.begin(), SJ = llvm::next(SI), SE = S.end();
+  for (AllocaSlices::iterator SI = S.begin(), SJ = std::next(SI), SE = S.end();
        SI != SE; SI = SJ) {
     uint64_t MaxEndOffset = SI->endOffset();
 
@@ -3326,6 +3388,21 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &S) {
   return Changed;
 }
 
+/// \brief Clobber a use with undef, deleting the used value if it becomes dead.
+void SROA::clobberUse(Use &U) {
+  Value *OldV = U;
+  // Replace the use with an undef value.
+  U = UndefValue::get(OldV->getType());
+
+  // Check for this making an instruction dead. We have to garbage collect
+  // all the dead instructions to ensure the uses of any alloca end up being
+  // minimal.
+  if (Instruction *OldI = dyn_cast<Instruction>(OldV))
+    if (isInstructionTriviallyDead(OldI)) {
+      DeadInsts.insert(OldI);
+    }
+}
+
 /// \brief Analyze an alloca for SROA.
 ///
 /// This analyzes the alloca to ensure we can reason about it, builds
@@ -3363,21 +3440,22 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
   for (AllocaSlices::dead_user_iterator DI = S.dead_user_begin(),
                                         DE = S.dead_user_end();
        DI != DE; ++DI) {
-    Changed = true;
+    // Free up everything used by this instruction.
+    for (Use &DeadOp : (*DI)->operands())
+      clobberUse(DeadOp);
+
+    // Now replace the uses of this instruction.
     (*DI)->replaceAllUsesWith(UndefValue::get((*DI)->getType()));
+
+    // And mark it for deletion.
     DeadInsts.insert(*DI);
+    Changed = true;
   }
   for (AllocaSlices::dead_op_iterator DO = S.dead_op_begin(),
                                       DE = S.dead_op_end();
        DO != DE; ++DO) {
-    Value *OldV = **DO;
-    // Clobber the use with an undef value.
-    **DO = UndefValue::get(OldV->getType());
-    if (Instruction *OldI = dyn_cast<Instruction>(OldV))
-      if (isInstructionTriviallyDead(OldI)) {
-        Changed = true;
-        DeadInsts.insert(OldI);
-      }
+    clobberUse(**DO);
+    Changed = true;
   }
 
   // No slices to split. Leave the dead alloca for a later pass to clean up.
@@ -3413,10 +3491,10 @@ void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) {
 
     I->replaceAllUsesWith(UndefValue::get(I->getType()));
 
-    for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
-      if (Instruction *U = dyn_cast<Instruction>(*OI)) {
+    for (Use &Operand : I->operands())
+      if (Instruction *U = dyn_cast<Instruction>(Operand)) {
         // Zero out the operand and see if it becomes trivially dead.
-        *OI = 0;
+        Operand = 0;
         if (isInstructionTriviallyDead(U))
           DeadInsts.insert(U);
       }
@@ -3432,10 +3510,9 @@ void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) {
 static void enqueueUsersInWorklist(Instruction &I,
                                    SmallVectorImpl<Instruction *> &Worklist,
                                    SmallPtrSet<Instruction *, 8> &Visited) {
-  for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;
-       ++UI)
-    if (Visited.insert(cast<Instruction>(*UI)))
-      Worklist.push_back(cast<Instruction>(*UI));
+  for (User *U : I.users())
+    if (Visited.insert(cast<Instruction>(U)))
+      Worklist.push_back(cast<Instruction>(U));
 }
 
 /// \brief Promote the allocas, using the best available technique.
@@ -3521,32 +3598,24 @@ bool SROA::promoteAllocas(Function &F) {
   return true;
 }
 
-namespace {
-  /// \brief A predicate to test whether an alloca belongs to a set.
-  class IsAllocaInSet {
-    typedef SmallPtrSet<AllocaInst *, 4> SetType;
-    const SetType &Set;
-
-  public:
-    typedef AllocaInst *argument_type;
-
-    IsAllocaInSet(const SetType &Set) : Set(Set) {}
-    bool operator()(AllocaInst *AI) const { return Set.count(AI); }
-  };
-}
-
 bool SROA::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
   DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
   C = &F.getContext();
-  DL = getAnalysisIfAvailable<DataLayout>();
-  if (!DL) {
+  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+  if (!DLP) {
     DEBUG(dbgs() << "  Skipping SROA -- no target data!\n");
     return false;
   }
-  DT = getAnalysisIfAvailable<DominatorTree>();
+  DL = &DLP->getDataLayout();
+  DominatorTreeWrapperPass *DTWP =
+      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+  DT = DTWP ? &DTWP->getDomTree() : 0;
 
   BasicBlock &EntryBB = F.getEntryBlock();
-  for (BasicBlock::iterator I = EntryBB.begin(), E = llvm::prior(EntryBB.end());
+  for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
        I != E; ++I)
     if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
       Worklist.insert(AI);
@@ -3564,11 +3633,14 @@ bool SROA::runOnFunction(Function &F) {
       // Remove the deleted allocas from various lists so that we don't try to
       // continue processing them.
       if (!DeletedAllocas.empty()) {
-        Worklist.remove_if(IsAllocaInSet(DeletedAllocas));
-        PostPromotionWorklist.remove_if(IsAllocaInSet(DeletedAllocas));
+        auto IsInSet = [&](AllocaInst *AI) {
+          return DeletedAllocas.count(AI);
+        };
+        Worklist.remove_if(IsInSet);
+        PostPromotionWorklist.remove_if(IsInSet);
         PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(),
                                                PromotableAllocas.end(),
-                                               IsAllocaInSet(DeletedAllocas)),
+                                               IsInSet),
                                 PromotableAllocas.end());
         DeletedAllocas.clear();
       }
@@ -3585,6 +3657,6 @@ bool SROA::runOnFunction(Function &F) {
 
 void SROA::getAnalysisUsage(AnalysisUsage &AU) const {
   if (RequiresDomTree)
-    AU.addRequired<DominatorTree>();
+    AU.addRequired<DominatorTreeWrapperPass>();
   AU.setPreservesCFG();
 }
diff --git a/lib/Transforms/Scalar/SampleProfile.cpp b/lib/Transforms/Scalar/SampleProfile.cpp
index 9bcd702..20d6daa 100644
--- a/lib/Transforms/Scalar/SampleProfile.cpp
+++ b/lib/Transforms/Scalar/SampleProfile.cpp
@@ -24,26 +24,33 @@
 
 #define DEBUG_TYPE "sample-profile"
 
+#include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/DebugInfo/DIContext.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/PostDominators.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
 #include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/LineIterator.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Regex.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
+#include <cctype>
 
 using namespace llvm;
 
@@ -52,8 +59,161 @@ using namespace llvm;
 static cl::opt<std::string> SampleProfileFile(
     "sample-profile-file", cl::init(""), cl::value_desc("filename"),
     cl::desc("Profile file loaded by -sample-profile"), cl::Hidden);
+static cl::opt<unsigned> SampleProfileMaxPropagateIterations(
+    "sample-profile-max-propagate-iterations", cl::init(100),
+    cl::desc("Maximum number of iterations to go through when propagating "
+             "sample block/edge weights through the CFG."));
 
 namespace {
+/// \brief Represents the relative location of an instruction.
+///
+/// Instruction locations are specified by the line offset from the
+/// beginning of the function (marked by the line where the function
+/// header is) and the discriminator value within that line.
+///
+/// The discriminator value is useful to distinguish instructions
+/// that are on the same line but belong to different basic blocks
+/// (e.g., the two post-increment instructions in "if (p) x++; else y++;").
+struct InstructionLocation {
+  InstructionLocation(int L, unsigned D) : LineOffset(L), Discriminator(D) {}
+  int LineOffset;
+  unsigned Discriminator;
+};
+}
+
+namespace llvm {
+template <> struct DenseMapInfo<InstructionLocation> {
+  typedef DenseMapInfo<int> OffsetInfo;
+  typedef DenseMapInfo<unsigned> DiscriminatorInfo;
+  static inline InstructionLocation getEmptyKey() {
+    return InstructionLocation(OffsetInfo::getEmptyKey(),
+                               DiscriminatorInfo::getEmptyKey());
+  }
+  static inline InstructionLocation getTombstoneKey() {
+    return InstructionLocation(OffsetInfo::getTombstoneKey(),
+                               DiscriminatorInfo::getTombstoneKey());
+  }
+  static inline unsigned getHashValue(InstructionLocation Val) {
+    return DenseMapInfo<std::pair<int, unsigned>>::getHashValue(
+        std::pair<int, unsigned>(Val.LineOffset, Val.Discriminator));
+  }
+  static inline bool isEqual(InstructionLocation LHS, InstructionLocation RHS) {
+    return LHS.LineOffset == RHS.LineOffset &&
+           LHS.Discriminator == RHS.Discriminator;
+  }
+};
+}
+
+namespace {
+typedef DenseMap<InstructionLocation, unsigned> BodySampleMap;
+typedef DenseMap<BasicBlock *, unsigned> BlockWeightMap;
+typedef DenseMap<BasicBlock *, BasicBlock *> EquivalenceClassMap;
+typedef std::pair<BasicBlock *, BasicBlock *> Edge;
+typedef DenseMap<Edge, unsigned> EdgeWeightMap;
+typedef DenseMap<BasicBlock *, SmallVector<BasicBlock *, 8>> BlockEdgeMap;
+
+/// \brief Representation of the runtime profile for a function.
+///
+/// This data structure contains the runtime profile for a given
+/// function. It contains the total number of samples collected
+/// in the function and a map of samples collected in every statement.
+class SampleFunctionProfile {
+public:
+  SampleFunctionProfile()
+      : TotalSamples(0), TotalHeadSamples(0), HeaderLineno(0), DT(0), PDT(0),
+        LI(0), Ctx(0) {}
+
+  unsigned getFunctionLoc(Function &F);
+  bool emitAnnotations(Function &F, DominatorTree *DomTree,
+                       PostDominatorTree *PostDomTree, LoopInfo *Loops);
+  unsigned getInstWeight(Instruction &I);
+  unsigned getBlockWeight(BasicBlock *B);
+  void addTotalSamples(unsigned Num) { TotalSamples += Num; }
+  void addHeadSamples(unsigned Num) { TotalHeadSamples += Num; }
+  void addBodySamples(int LineOffset, unsigned Discriminator, unsigned Num) {
+    assert(LineOffset >= 0);
+    BodySamples[InstructionLocation(LineOffset, Discriminator)] += Num;
+  }
+  void print(raw_ostream &OS);
+  void printEdgeWeight(raw_ostream &OS, Edge E);
+  void printBlockWeight(raw_ostream &OS, BasicBlock *BB);
+  void printBlockEquivalence(raw_ostream &OS, BasicBlock *BB);
+  bool computeBlockWeights(Function &F);
+  void findEquivalenceClasses(Function &F);
+  void findEquivalencesFor(BasicBlock *BB1,
+                           SmallVector<BasicBlock *, 8> Descendants,
+                           DominatorTreeBase<BasicBlock> *DomTree);
+  void propagateWeights(Function &F);
+  unsigned visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge);
+  void buildEdges(Function &F);
+  bool propagateThroughEdges(Function &F);
+  bool empty() { return BodySamples.empty(); }
+
+protected:
+  /// \brief Total number of samples collected inside this function.
+  ///
+  /// Samples are cumulative, they include all the samples collected
+  /// inside this function and all its inlined callees.
+  unsigned TotalSamples;
+
+  /// \brief Total number of samples collected at the head of the function.
+  /// FIXME: Use head samples to estimate a cold/hot attribute for the function.
+  unsigned TotalHeadSamples;
+
+  /// \brief Line number for the function header. Used to compute relative
+  /// line numbers from the absolute line LOCs found in instruction locations.
+  /// The relative line numbers are needed to address the samples from the
+  /// profile file.
+  unsigned HeaderLineno;
+
+  /// \brief Map line offsets to collected samples.
+  ///
+  /// Each entry in this map contains the number of samples
+  /// collected at the corresponding line offset. All line locations
+  /// are an offset from the start of the function.
+  BodySampleMap BodySamples;
+
+  /// \brief Map basic blocks to their computed weights.
+  ///
+  /// The weight of a basic block is defined to be the maximum
+  /// of all the instruction weights in that block.
+  BlockWeightMap BlockWeights;
+
+  /// \brief Map edges to their computed weights.
+  ///
+  /// Edge weights are computed by propagating basic block weights in
+  /// SampleProfile::propagateWeights.
+  EdgeWeightMap EdgeWeights;
+
+  /// \brief Set of visited blocks during propagation.
+  SmallPtrSet<BasicBlock *, 128> VisitedBlocks;
+
+  /// \brief Set of visited edges during propagation.
+  SmallSet<Edge, 128> VisitedEdges;
+
+  /// \brief Equivalence classes for block weights.
+  ///
+  /// Two blocks BB1 and BB2 are in the same equivalence class if they
+  /// dominate and post-dominate each other, and they are in the same loop
+  /// nest. When this happens, the two blocks are guaranteed to execute
+  /// the same number of times.
+  EquivalenceClassMap EquivalenceClass;
+
+  /// \brief Dominance, post-dominance and loop information.
+  DominatorTree *DT;
+  PostDominatorTree *PDT;
+  LoopInfo *LI;
+
+  /// \brief Predecessors for each basic block in the CFG.
+  BlockEdgeMap Predecessors;
+
+  /// \brief Successors for each basic block in the CFG.
+  BlockEdgeMap Successors;
+
+  /// \brief LLVM context holding the debug data we need.
+  LLVMContext *Ctx;
+};
+
 /// \brief Sample-based profile reader.
 ///
 /// Each profile contains sample counts for all the functions
@@ -77,61 +237,33 @@ namespace {
 ///   2. The samples collected at each line in F. To provide some
 ///      protection against source code shuffling, line numbers should
 ///      be relative to the start of the function.
-class SampleProfile {
+class SampleModuleProfile {
 public:
-  SampleProfile(StringRef F) : Profiles(0), Filename(F) {}
+  SampleModuleProfile(const Module &M, StringRef F)
+      : Profiles(0), Filename(F), M(M) {}
 
   void dump();
-  void loadText();
+  bool loadText();
   void loadNative() { llvm_unreachable("not implemented"); }
-  bool emitAnnotations(Function &F);
   void printFunctionProfile(raw_ostream &OS, StringRef FName);
   void dumpFunctionProfile(StringRef FName);
+  SampleFunctionProfile &getProfile(const Function &F) {
+    return Profiles[F.getName()];
+  }
 
-protected:
-  typedef DenseMap<uint32_t, uint32_t> BodySampleMap;
-  typedef DenseMap<BasicBlock *, uint32_t> BlockWeightMap;
-
-  /// \brief Representation of the runtime profile for a function.
-  ///
-  /// This data structure contains the runtime profile for a given
-  /// function. It contains the total number of samples collected
-  /// in the function and a map of samples collected in every statement.
-  struct FunctionProfile {
-    /// \brief Total number of samples collected inside this function.
-    ///
-    /// Samples are cumulative, they include all the samples collected
-    /// inside this function and all its inlined callees.
-    unsigned TotalSamples;
-
-    // \brief Total number of samples collected at the head of the function.
-    unsigned TotalHeadSamples;
-
-    /// \brief Map line offsets to collected samples.
-    ///
-    /// Each entry in this map contains the number of samples
-    /// collected at the corresponding line offset. All line locations
-    /// are an offset from the start of the function.
-    BodySampleMap BodySamples;
-
-    /// \brief Map basic blocks to their computed weights.
-    ///
-    /// The weight of a basic block is defined to be the maximum
-    /// of all the instruction weights in that block.
-    BlockWeightMap BlockWeights;
-  };
-
-  uint32_t getInstWeight(Instruction &I, unsigned FirstLineno,
-                         BodySampleMap &BodySamples);
-  uint32_t computeBlockWeight(BasicBlock *B, unsigned FirstLineno,
-                              BodySampleMap &BodySamples);
+  /// \brief Report a parse error message.
+  void reportParseError(int64_t LineNumber, Twine Msg) const {
+    DiagnosticInfoSampleProfile Diag(Filename.data(), LineNumber, Msg);
+    M.getContext().diagnose(Diag);
+  }
 
+protected:
   /// \brief Map every function to its associated profile.
   ///
   /// The profile of every function executed at runtime is collected
-  /// in the structure FunctionProfile. This maps function objects
+  /// in the structure SampleFunctionProfile. This maps function objects
   /// to their corresponding profiles.
-  StringMap<FunctionProfile> Profiles;
+  StringMap<SampleFunctionProfile> Profiles;
 
   /// \brief Path name to the file holding the profile data.
   ///
@@ -140,63 +272,10 @@ protected:
   /// version of the profile format to be used in constructing test
   /// cases and debugging.
   StringRef Filename;
-};
 
-/// \brief Loader class for text-based profiles.
-///
-/// This class defines a simple interface to read text files containing
-/// profiles. It keeps track of line number information and location of
-/// the file pointer. Users of this class are responsible for actually
-/// parsing the lines returned by the readLine function.
-///
-/// TODO - This does not really belong here. It is a generic text file
-/// reader. It should be moved to the Support library and made more general.
-class ExternalProfileTextLoader {
-public:
-  ExternalProfileTextLoader(StringRef F) : Filename(F) {
-    error_code EC;
-    EC = MemoryBuffer::getFile(Filename, Buffer);
-    if (EC)
-      report_fatal_error("Could not open profile file " + Filename + ": " +
-                         EC.message());
-    FP = Buffer->getBufferStart();
-    Lineno = 0;
-  }
-
-  /// \brief Read a line from the mapped file.
-  StringRef readLine() {
-    size_t Length = 0;
-    const char *start = FP;
-    while (FP != Buffer->getBufferEnd() && *FP != '\n') {
-      Length++;
-      FP++;
-    }
-    if (FP != Buffer->getBufferEnd())
-      FP++;
-    Lineno++;
-    return StringRef(start, Length);
-  }
-
-  /// \brief Return true, if we've reached EOF.
-  bool atEOF() const { return FP == Buffer->getBufferEnd(); }
-
-  /// \brief Report a parse error message and stop compilation.
-  void reportParseError(Twine Msg) const {
-    report_fatal_error(Filename + ":" + Twine(Lineno) + ": " + Msg + "\n");
-  }
-
-private:
-  /// \brief Memory buffer holding the text file.
-  OwningPtr<MemoryBuffer> Buffer;
-
-  /// \brief Current position into the memory buffer.
-  const char *FP;
-
-  /// \brief Current line number.
-  int64_t Lineno;
-
-  /// \brief Path name where to the profile file.
-  StringRef Filename;
+  /// \brief Module being compiled. Used mainly to access the current
+  /// LLVM context for diagnostics.
+  const Module &M;
 };
 
 /// \brief Sample profile pass.
@@ -210,148 +289,241 @@ public:
   static char ID;
 
   SampleProfileLoader(StringRef Name = SampleProfileFile)
-      : FunctionPass(ID), Profiler(0), Filename(Name) {
+      : FunctionPass(ID), Profiler(), Filename(Name), ProfileIsValid(false) {
     initializeSampleProfileLoaderPass(*PassRegistry::getPassRegistry());
   }
 
-  virtual bool doInitialization(Module &M);
+  bool doInitialization(Module &M) override;
 
   void dump() { Profiler->dump(); }
 
-  virtual const char *getPassName() const { return "Sample profile pass"; }
+  const char *getPassName() const override { return "Sample profile pass"; }
 
-  virtual bool runOnFunction(Function &F);
+  bool runOnFunction(Function &F) override;
 
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
+    AU.addRequired<LoopInfo>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<PostDominatorTree>();
   }
 
 protected:
   /// \brief Profile reader object.
-  OwningPtr<SampleProfile> Profiler;
+  std::unique_ptr<SampleModuleProfile> Profiler;
 
   /// \brief Name of the profile file to load.
   StringRef Filename;
+
+  /// \brief Flag indicating whether the profile input loaded succesfully.
+  bool ProfileIsValid;
 };
 }
 
-/// \brief Print the function profile for \p FName on stream \p OS.
+/// \brief Print this function profile on stream \p OS.
 ///
 /// \param OS Stream to emit the output to.
-/// \param FName Name of the function to print.
-void SampleProfile::printFunctionProfile(raw_ostream &OS, StringRef FName) {
-  FunctionProfile FProfile = Profiles[FName];
-  OS << "Function: " << FName << ", " << FProfile.TotalSamples << ", "
-     << FProfile.TotalHeadSamples << ", " << FProfile.BodySamples.size()
+void SampleFunctionProfile::print(raw_ostream &OS) {
+  OS << TotalSamples << ", " << TotalHeadSamples << ", " << BodySamples.size()
      << " sampled lines\n";
-  for (BodySampleMap::const_iterator SI = FProfile.BodySamples.begin(),
-                                     SE = FProfile.BodySamples.end();
+  for (BodySampleMap::const_iterator SI = BodySamples.begin(),
+                                     SE = BodySamples.end();
        SI != SE; ++SI)
-    OS << "\tline offset: " << SI->first
+    OS << "\tline offset: " << SI->first.LineOffset
+       << ", discriminator: " << SI->first.Discriminator
        << ", number of samples: " << SI->second << "\n";
   OS << "\n";
 }
 
+/// \brief Print the weight of edge \p E on stream \p OS.
+///
+/// \param OS  Stream to emit the output to.
+/// \param E  Edge to print.
+void SampleFunctionProfile::printEdgeWeight(raw_ostream &OS, Edge E) {
+  OS << "weight[" << E.first->getName() << "->" << E.second->getName()
+     << "]: " << EdgeWeights[E] << "\n";
+}
+
+/// \brief Print the equivalence class of block \p BB on stream \p OS.
+///
+/// \param OS  Stream to emit the output to.
+/// \param BB  Block to print.
+void SampleFunctionProfile::printBlockEquivalence(raw_ostream &OS,
+                                                  BasicBlock *BB) {
+  BasicBlock *Equiv = EquivalenceClass[BB];
+  OS << "equivalence[" << BB->getName()
+     << "]: " << ((Equiv) ? EquivalenceClass[BB]->getName() : "NONE") << "\n";
+}
+
+/// \brief Print the weight of block \p BB on stream \p OS.
+///
+/// \param OS  Stream to emit the output to.
+/// \param BB  Block to print.
+void SampleFunctionProfile::printBlockWeight(raw_ostream &OS, BasicBlock *BB) {
+  OS << "weight[" << BB->getName() << "]: " << BlockWeights[BB] << "\n";
+}
+
+/// \brief Print the function profile for \p FName on stream \p OS.
+///
+/// \param OS Stream to emit the output to.
+/// \param FName Name of the function to print.
+void SampleModuleProfile::printFunctionProfile(raw_ostream &OS,
+                                               StringRef FName) {
+  OS << "Function: " << FName << ":\n";
+  Profiles[FName].print(OS);
+}
+
 /// \brief Dump the function profile for \p FName.
 ///
 /// \param FName Name of the function to print.
-void SampleProfile::dumpFunctionProfile(StringRef FName) {
+void SampleModuleProfile::dumpFunctionProfile(StringRef FName) {
   printFunctionProfile(dbgs(), FName);
 }
 
 /// \brief Dump all the function profiles found.
-void SampleProfile::dump() {
-  for (StringMap<FunctionProfile>::const_iterator I = Profiles.begin(),
-                                                  E = Profiles.end();
+void SampleModuleProfile::dump() {
+  for (StringMap<SampleFunctionProfile>::const_iterator I = Profiles.begin(),
+                                                        E = Profiles.end();
        I != E; ++I)
     dumpFunctionProfile(I->getKey());
 }
 
 /// \brief Load samples from a text file.
 ///
-/// The file is divided in two segments:
-///
-/// Symbol table (represented with the string "symbol table")
-///    Number of symbols in the table
-///    symbol 1
-///    symbol 2
-///    ...
-///    symbol N
+/// The file contains a list of samples for every function executed at
+/// runtime. Each function profile has the following format:
 ///
-/// Function body profiles
-///    function1:total_samples:total_head_samples:number_of_locations
-///    location_offset_1: number_of_samples
-///    location_offset_2: number_of_samples
+///    function1:total_samples:total_head_samples
+///    offset1[.discriminator]: number_of_samples [fn1:num fn2:num ... ]
+///    offset2[.discriminator]: number_of_samples [fn3:num fn4:num ... ]
 ///    ...
-///    location_offset_N: number_of_samples
+///    offsetN[.discriminator]: number_of_samples [fn5:num fn6:num ... ]
 ///
 /// Function names must be mangled in order for the profile loader to
-/// match them in the current translation unit.
+/// match them in the current translation unit. The two numbers in the
+/// function header specify how many total samples were accumulated in
+/// the function (first number), and the total number of samples accumulated
+/// at the prologue of the function (second number). This head sample
+/// count provides an indicator of how frequent is the function invoked.
+///
+/// Each sampled line may contain several items. Some are optional
+/// (marked below):
+///
+/// a- Source line offset. This number represents the line number
+///    in the function where the sample was collected. The line number
+///    is always relative to the line where symbol of the function
+///    is defined. So, if the function has its header at line 280,
+///    the offset 13 is at line 293 in the file.
+///
+/// b- [OPTIONAL] Discriminator. This is used if the sampled program
+///    was compiled with DWARF discriminator support
+///    (http://wiki.dwarfstd.org/index.php?title=Path_Discriminators)
+///
+/// c- Number of samples. This is the number of samples collected by
+///    the profiler at this source location.
+///
+/// d- [OPTIONAL] Potential call targets and samples. If present, this
+///    line contains a call instruction. This models both direct and
+///    indirect calls. Each called target is listed together with the
+///    number of samples. For example,
+///
+///    130: 7  foo:3  bar:2  baz:7
+///
+///    The above means that at relative line offset 130 there is a
+///    call instruction that calls one of foo(), bar() and baz(). With
+///    baz() being the relatively more frequent call target.
+///
+///    FIXME: This is currently unhandled, but it has a lot of
+///           potential for aiding the inliner.
+///
 ///
 /// Since this is a flat profile, a function that shows up more than
 /// once gets all its samples aggregated across all its instances.
-/// TODO - flat profiles are too imprecise to provide good optimization
-/// opportunities. Convert them to context-sensitive profile.
+///
+/// FIXME: flat profiles are too imprecise to provide good optimization
+///        opportunities. Convert them to context-sensitive profile.
 ///
 /// This textual representation is useful to generate unit tests and
 /// for debugging purposes, but it should not be used to generate
 /// profiles for large programs, as the representation is extremely
 /// inefficient.
-void SampleProfile::loadText() {
-  ExternalProfileTextLoader Loader(Filename);
-
-  // Read the symbol table.
-  StringRef Line = Loader.readLine();
-  if (Line != "symbol table")
-    Loader.reportParseError("Expected 'symbol table', found " + Line);
-  int NumSymbols;
-  Line = Loader.readLine();
-  if (Line.getAsInteger(10, NumSymbols))
-    Loader.reportParseError("Expected a number, found " + Line);
-  for (int I = 0; I < NumSymbols; I++) {
-    StringRef FName = Loader.readLine();
-    FunctionProfile &FProfile = Profiles[FName];
-    FProfile.BodySamples.clear();
-    FProfile.TotalSamples = 0;
-    FProfile.TotalHeadSamples = 0;
+///
+/// \returns true if the file was loaded successfully, false otherwise.
+bool SampleModuleProfile::loadText() {
+  std::unique_ptr<MemoryBuffer> Buffer;
+  error_code EC = MemoryBuffer::getFile(Filename, Buffer);
+  if (EC) {
+    std::string Msg(EC.message());
+    M.getContext().diagnose(DiagnosticInfoSampleProfile(Filename.data(), Msg));
+    return false;
   }
+  line_iterator LineIt(*Buffer, '#');
 
   // Read the profile of each function. Since each function may be
   // mentioned more than once, and we are collecting flat profiles,
   // accumulate samples as we parse them.
-  Regex HeadRE("^([^:]+):([0-9]+):([0-9]+):([0-9]+)$");
-  Regex LineSample("^([0-9]+): ([0-9]+)$");
-  while (!Loader.atEOF()) {
-    SmallVector<StringRef, 4> Matches;
-    Line = Loader.readLine();
-    if (!HeadRE.match(Line, &Matches))
-      Loader.reportParseError("Expected 'mangled_name:NUM:NUM:NUM', found " +
-                              Line);
-    assert(Matches.size() == 5);
+  Regex HeadRE("^([^0-9].*):([0-9]+):([0-9]+)$");
+  Regex LineSample("^([0-9]+)\\.?([0-9]+)?: ([0-9]+)(.*)$");
+  while (!LineIt.is_at_eof()) {
+    // Read the header of each function.
+    //
+    // Note that for function identifiers we are actually expecting
+    // mangled names, but we may not always get them. This happens when
+    // the compiler decides not to emit the function (e.g., it was inlined
+    // and removed). In this case, the binary will not have the linkage
+    // name for the function, so the profiler will emit the function's
+    // unmangled name, which may contain characters like ':' and '>' in its
+    // name (member functions, templates, etc).
+    //
+    // The only requirement we place on the identifier, then, is that it
+    // should not begin with a number.
+    SmallVector<StringRef, 3> Matches;
+    if (!HeadRE.match(*LineIt, &Matches)) {
+      reportParseError(LineIt.line_number(),
+                       "Expected 'mangled_name:NUM:NUM', found " + *LineIt);
+      return false;
+    }
+    assert(Matches.size() == 4);
     StringRef FName = Matches[1];
-    unsigned NumSamples, NumHeadSamples, NumSampledLines;
+    unsigned NumSamples, NumHeadSamples;
     Matches[2].getAsInteger(10, NumSamples);
     Matches[3].getAsInteger(10, NumHeadSamples);
-    Matches[4].getAsInteger(10, NumSampledLines);
-    FunctionProfile &FProfile = Profiles[FName];
-    FProfile.TotalSamples += NumSamples;
-    FProfile.TotalHeadSamples += NumHeadSamples;
-    BodySampleMap &SampleMap = FProfile.BodySamples;
-    unsigned I;
-    for (I = 0; I < NumSampledLines && !Loader.atEOF(); I++) {
-      Line = Loader.readLine();
-      if (!LineSample.match(Line, &Matches))
-        Loader.reportParseError("Expected 'NUM: NUM', found " + Line);
-      assert(Matches.size() == 3);
-      unsigned LineOffset, NumSamples;
+    Profiles[FName] = SampleFunctionProfile();
+    SampleFunctionProfile &FProfile = Profiles[FName];
+    FProfile.addTotalSamples(NumSamples);
+    FProfile.addHeadSamples(NumHeadSamples);
+    ++LineIt;
+
+    // Now read the body. The body of the function ends when we reach
+    // EOF or when we see the start of the next function.
+    while (!LineIt.is_at_eof() && isdigit((*LineIt)[0])) {
+      if (!LineSample.match(*LineIt, &Matches)) {
+        reportParseError(
+            LineIt.line_number(),
+            "Expected 'NUM[.NUM]: NUM[ mangled_name:NUM]*', found " + *LineIt);
+        return false;
+      }
+      assert(Matches.size() == 5);
+      unsigned LineOffset, NumSamples, Discriminator = 0;
       Matches[1].getAsInteger(10, LineOffset);
-      Matches[2].getAsInteger(10, NumSamples);
-      SampleMap[LineOffset] += NumSamples;
-    }
+      if (Matches[2] != "")
+        Matches[2].getAsInteger(10, Discriminator);
+      Matches[3].getAsInteger(10, NumSamples);
+
+      // FIXME: Handle called targets (in Matches[4]).
 
-    if (I < NumSampledLines)
-      Loader.reportParseError("Unexpected end of file");
+      // When dealing with instruction weights, we use the value
+      // zero to indicate the absence of a sample. If we read an
+      // actual zero from the profile file, return it as 1 to
+      // avoid the confusion later on.
+      if (NumSamples == 0)
+        NumSamples = 1;
+      FProfile.addBodySamples(LineOffset, Discriminator, NumSamples);
+      ++LineIt;
+    }
   }
+
+  return true;
 }
 
 /// \brief Get the weight for an instruction.
@@ -359,46 +531,49 @@ void SampleProfile::loadText() {
 /// The "weight" of an instruction \p Inst is the number of samples
 /// collected on that instruction at runtime. To retrieve it, we
 /// need to compute the line number of \p Inst relative to the start of its
-/// function. We use \p FirstLineno to compute the offset. We then
-/// look up the samples collected for \p Inst using \p BodySamples.
+/// function. We use HeaderLineno to compute the offset. We then
+/// look up the samples collected for \p Inst using BodySamples.
 ///
 /// \param Inst Instruction to query.
-/// \param FirstLineno Line number of the first instruction in the function.
-/// \param BodySamples Map of relative source line locations to samples.
 ///
 /// \returns The profiled weight of I.
-uint32_t SampleProfile::getInstWeight(Instruction &Inst, unsigned FirstLineno,
-                                      BodySampleMap &BodySamples) {
-  unsigned LOffset = Inst.getDebugLoc().getLine() - FirstLineno + 1;
-  return BodySamples.lookup(LOffset);
+unsigned SampleFunctionProfile::getInstWeight(Instruction &Inst) {
+  DebugLoc DLoc = Inst.getDebugLoc();
+  unsigned Lineno = DLoc.getLine();
+  if (Lineno < HeaderLineno)
+    return 0;
+
+  DILocation DIL(DLoc.getAsMDNode(*Ctx));
+  int LOffset = Lineno - HeaderLineno;
+  unsigned Discriminator = DIL.getDiscriminator();
+  unsigned Weight =
+      BodySamples.lookup(InstructionLocation(LOffset, Discriminator));
+  DEBUG(dbgs() << "    " << Lineno << "." << Discriminator << ":" << Inst
+               << " (line offset: " << LOffset << "." << Discriminator
+               << " - weight: " << Weight << ")\n");
+  return Weight;
 }
 
 /// \brief Compute the weight of a basic block.
 ///
 /// The weight of basic block \p B is the maximum weight of all the
-/// instructions in B.
+/// instructions in B. The weight of \p B is computed and cached in
+/// the BlockWeights map.
 ///
 /// \param B The basic block to query.
-/// \param FirstLineno The line number for the first line in the
-///     function holding B.
-/// \param BodySamples The map containing all the samples collected in that
-///     function.
 ///
 /// \returns The computed weight of B.
-uint32_t SampleProfile::computeBlockWeight(BasicBlock *B, unsigned FirstLineno,
-                                           BodySampleMap &BodySamples) {
+unsigned SampleFunctionProfile::getBlockWeight(BasicBlock *B) {
   // If we've computed B's weight before, return it.
-  Function *F = B->getParent();
-  FunctionProfile &FProfile = Profiles[F->getName()];
   std::pair<BlockWeightMap::iterator, bool> Entry =
-      FProfile.BlockWeights.insert(std::make_pair(B, 0));
+      BlockWeights.insert(std::make_pair(B, 0));
   if (!Entry.second)
     return Entry.first->second;
 
   // Otherwise, compute and cache B's weight.
-  uint32_t Weight = 0;
+  unsigned Weight = 0;
   for (BasicBlock::iterator I = B->begin(), E = B->end(); I != E; ++I) {
-    uint32_t InstWeight = getInstWeight(*I, FirstLineno, BodySamples);
+    unsigned InstWeight = getInstWeight(*I);
     if (InstWeight > Weight)
       Weight = InstWeight;
   }
@@ -406,31 +581,344 @@ uint32_t SampleProfile::computeBlockWeight(BasicBlock *B, unsigned FirstLineno,
   return Weight;
 }
 
-/// \brief Generate branch weight metadata for all branches in \p F.
+/// \brief Compute and store the weights of every basic block.
 ///
-/// For every branch instruction B in \p F, we compute the weight of the
-/// target block for each of the edges out of B. This is the weight
-/// that we associate with that branch.
+/// This populates the BlockWeights map by computing
+/// the weights of every basic block in the CFG.
 ///
-/// TODO - This weight assignment will most likely be wrong if the
-/// target branch has more than two predecessors. This needs to be done
-/// using some form of flow propagation.
+/// \param F The function to query.
+bool SampleFunctionProfile::computeBlockWeights(Function &F) {
+  bool Changed = false;
+  DEBUG(dbgs() << "Block weights\n");
+  for (Function::iterator B = F.begin(), E = F.end(); B != E; ++B) {
+    unsigned Weight = getBlockWeight(B);
+    Changed |= (Weight > 0);
+    DEBUG(printBlockWeight(dbgs(), B));
+  }
+
+  return Changed;
+}
+
+/// \brief Find equivalence classes for the given block.
 ///
-/// Once all the branch weights are computed, we emit the MD_prof
-/// metadata on B using the computed values.
+/// This finds all the blocks that are guaranteed to execute the same
+/// number of times as \p BB1. To do this, it traverses all the the
+/// descendants of \p BB1 in the dominator or post-dominator tree.
+///
+/// A block BB2 will be in the same equivalence class as \p BB1 if
+/// the following holds:
+///
+/// 1- \p BB1 is a descendant of BB2 in the opposite tree. So, if BB2
+///    is a descendant of \p BB1 in the dominator tree, then BB2 should
+///    dominate BB1 in the post-dominator tree.
+///
+/// 2- Both BB2 and \p BB1 must be in the same loop.
+///
+/// For every block BB2 that meets those two requirements, we set BB2's
+/// equivalence class to \p BB1.
+///
+/// \param BB1  Block to check.
+/// \param Descendants  Descendants of \p BB1 in either the dom or pdom tree.
+/// \param DomTree  Opposite dominator tree. If \p Descendants is filled
+///                 with blocks from \p BB1's dominator tree, then
+///                 this is the post-dominator tree, and vice versa.
+void SampleFunctionProfile::findEquivalencesFor(
+    BasicBlock *BB1, SmallVector<BasicBlock *, 8> Descendants,
+    DominatorTreeBase<BasicBlock> *DomTree) {
+  for (SmallVectorImpl<BasicBlock *>::iterator I = Descendants.begin(),
+                                               E = Descendants.end();
+       I != E; ++I) {
+    BasicBlock *BB2 = *I;
+    bool IsDomParent = DomTree->dominates(BB2, BB1);
+    bool IsInSameLoop = LI->getLoopFor(BB1) == LI->getLoopFor(BB2);
+    if (BB1 != BB2 && VisitedBlocks.insert(BB2) && IsDomParent &&
+        IsInSameLoop) {
+      EquivalenceClass[BB2] = BB1;
+
+      // If BB2 is heavier than BB1, make BB2 have the same weight
+      // as BB1.
+      //
+      // Note that we don't worry about the opposite situation here
+      // (when BB2 is lighter than BB1). We will deal with this
+      // during the propagation phase. Right now, we just want to
+      // make sure that BB1 has the largest weight of all the
+      // members of its equivalence set.
+      unsigned &BB1Weight = BlockWeights[BB1];
+      unsigned &BB2Weight = BlockWeights[BB2];
+      BB1Weight = std::max(BB1Weight, BB2Weight);
+    }
+  }
+}
+
+/// \brief Find equivalence classes.
+///
+/// Since samples may be missing from blocks, we can fill in the gaps by setting
+/// the weights of all the blocks in the same equivalence class to the same
+/// weight. To compute the concept of equivalence, we use dominance and loop
+/// information. Two blocks B1 and B2 are in the same equivalence class if B1
+/// dominates B2, B2 post-dominates B1 and both are in the same loop.
 ///
 /// \param F The function to query.
-bool SampleProfile::emitAnnotations(Function &F) {
+void SampleFunctionProfile::findEquivalenceClasses(Function &F) {
+  SmallVector<BasicBlock *, 8> DominatedBBs;
+  DEBUG(dbgs() << "\nBlock equivalence classes\n");
+  // Find equivalence sets based on dominance and post-dominance information.
+  for (Function::iterator B = F.begin(), E = F.end(); B != E; ++B) {
+    BasicBlock *BB1 = B;
+
+    // Compute BB1's equivalence class once.
+    if (EquivalenceClass.count(BB1)) {
+      DEBUG(printBlockEquivalence(dbgs(), BB1));
+      continue;
+    }
+
+    // By default, blocks are in their own equivalence class.
+    EquivalenceClass[BB1] = BB1;
+
+    // Traverse all the blocks dominated by BB1. We are looking for
+    // every basic block BB2 such that:
+    //
+    // 1- BB1 dominates BB2.
+    // 2- BB2 post-dominates BB1.
+    // 3- BB1 and BB2 are in the same loop nest.
+    //
+    // If all those conditions hold, it means that BB2 is executed
+    // as many times as BB1, so they are placed in the same equivalence
+    // class by making BB2's equivalence class be BB1.
+    DominatedBBs.clear();
+    DT->getDescendants(BB1, DominatedBBs);
+    findEquivalencesFor(BB1, DominatedBBs, PDT->DT);
+
+    // Repeat the same logic for all the blocks post-dominated by BB1.
+    // We are looking for every basic block BB2 such that:
+    //
+    // 1- BB1 post-dominates BB2.
+    // 2- BB2 dominates BB1.
+    // 3- BB1 and BB2 are in the same loop nest.
+    //
+    // If all those conditions hold, BB2's equivalence class is BB1.
+    DominatedBBs.clear();
+    PDT->getDescendants(BB1, DominatedBBs);
+    findEquivalencesFor(BB1, DominatedBBs, DT);
+
+    DEBUG(printBlockEquivalence(dbgs(), BB1));
+  }
+
+  // Assign weights to equivalence classes.
+  //
+  // All the basic blocks in the same equivalence class will execute
+  // the same number of times. Since we know that the head block in
+  // each equivalence class has the largest weight, assign that weight
+  // to all the blocks in that equivalence class.
+  DEBUG(dbgs() << "\nAssign the same weight to all blocks in the same class\n");
+  for (Function::iterator B = F.begin(), E = F.end(); B != E; ++B) {
+    BasicBlock *BB = B;
+    BasicBlock *EquivBB = EquivalenceClass[BB];
+    if (BB != EquivBB)
+      BlockWeights[BB] = BlockWeights[EquivBB];
+    DEBUG(printBlockWeight(dbgs(), BB));
+  }
+}
+
+/// \brief Visit the given edge to decide if it has a valid weight.
+///
+/// If \p E has not been visited before, we copy to \p UnknownEdge
+/// and increment the count of unknown edges.
+///
+/// \param E  Edge to visit.
+/// \param NumUnknownEdges  Current number of unknown edges.
+/// \param UnknownEdge  Set if E has not been visited before.
+///
+/// \returns E's weight, if known. Otherwise, return 0.
+unsigned SampleFunctionProfile::visitEdge(Edge E, unsigned *NumUnknownEdges,
+                                          Edge *UnknownEdge) {
+  if (!VisitedEdges.count(E)) {
+    (*NumUnknownEdges)++;
+    *UnknownEdge = E;
+    return 0;
+  }
+
+  return EdgeWeights[E];
+}
+
+/// \brief Propagate weights through incoming/outgoing edges.
+///
+/// If the weight of a basic block is known, and there is only one edge
+/// with an unknown weight, we can calculate the weight of that edge.
+///
+/// Similarly, if all the edges have a known count, we can calculate the
+/// count of the basic block, if needed.
+///
+/// \param F  Function to process.
+///
+/// \returns  True if new weights were assigned to edges or blocks.
+bool SampleFunctionProfile::propagateThroughEdges(Function &F) {
   bool Changed = false;
-  FunctionProfile &FProfile = Profiles[F.getName()];
-  unsigned FirstLineno = inst_begin(F)->getDebugLoc().getLine();
-  MDBuilder MDB(F.getContext());
+  DEBUG(dbgs() << "\nPropagation through edges\n");
+  for (Function::iterator BI = F.begin(), EI = F.end(); BI != EI; ++BI) {
+    BasicBlock *BB = BI;
+
+    // Visit all the predecessor and successor edges to determine
+    // which ones have a weight assigned already. Note that it doesn't
+    // matter that we only keep track of a single unknown edge. The
+    // only case we are interested in handling is when only a single
+    // edge is unknown (see setEdgeOrBlockWeight).
+    for (unsigned i = 0; i < 2; i++) {
+      unsigned TotalWeight = 0;
+      unsigned NumUnknownEdges = 0;
+      Edge UnknownEdge, SelfReferentialEdge;
+
+      if (i == 0) {
+        // First, visit all predecessor edges.
+        for (size_t I = 0; I < Predecessors[BB].size(); I++) {
+          Edge E = std::make_pair(Predecessors[BB][I], BB);
+          TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge);
+          if (E.first == E.second)
+            SelfReferentialEdge = E;
+        }
+      } else {
+        // On the second round, visit all successor edges.
+        for (size_t I = 0; I < Successors[BB].size(); I++) {
+          Edge E = std::make_pair(BB, Successors[BB][I]);
+          TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge);
+        }
+      }
+
+      // After visiting all the edges, there are three cases that we
+      // can handle immediately:
+      //
+      // - All the edge weights are known (i.e., NumUnknownEdges == 0).
+      //   In this case, we simply check that the sum of all the edges
+      //   is the same as BB's weight. If not, we change BB's weight
+      //   to match. Additionally, if BB had not been visited before,
+      //   we mark it visited.
+      //
+      // - Only one edge is unknown and BB has already been visited.
+      //   In this case, we can compute the weight of the edge by
+      //   subtracting the total block weight from all the known
+      //   edge weights. If the edges weight more than BB, then the
+      //   edge of the last remaining edge is set to zero.
+      //
+      // - There exists a self-referential edge and the weight of BB is
+      //   known. In this case, this edge can be based on BB's weight.
+      //   We add up all the other known edges and set the weight on
+      //   the self-referential edge as we did in the previous case.
+      //
+      // In any other case, we must continue iterating. Eventually,
+      // all edges will get a weight, or iteration will stop when
+      // it reaches SampleProfileMaxPropagateIterations.
+      if (NumUnknownEdges <= 1) {
+        unsigned &BBWeight = BlockWeights[BB];
+        if (NumUnknownEdges == 0) {
+          // If we already know the weight of all edges, the weight of the
+          // basic block can be computed. It should be no larger than the sum
+          // of all edge weights.
+          if (TotalWeight > BBWeight) {
+            BBWeight = TotalWeight;
+            Changed = true;
+            DEBUG(dbgs() << "All edge weights for " << BB->getName()
+                         << " known. Set weight for block: ";
+                  printBlockWeight(dbgs(), BB););
+          }
+          if (VisitedBlocks.insert(BB))
+            Changed = true;
+        } else if (NumUnknownEdges == 1 && VisitedBlocks.count(BB)) {
+          // If there is a single unknown edge and the block has been
+          // visited, then we can compute E's weight.
+          if (BBWeight >= TotalWeight)
+            EdgeWeights[UnknownEdge] = BBWeight - TotalWeight;
+          else
+            EdgeWeights[UnknownEdge] = 0;
+          VisitedEdges.insert(UnknownEdge);
+          Changed = true;
+          DEBUG(dbgs() << "Set weight for edge: ";
+                printEdgeWeight(dbgs(), UnknownEdge));
+        }
+      } else if (SelfReferentialEdge.first && VisitedBlocks.count(BB)) {
+        unsigned &BBWeight = BlockWeights[BB];
+        // We have a self-referential edge and the weight of BB is known.
+        if (BBWeight >= TotalWeight)
+          EdgeWeights[SelfReferentialEdge] = BBWeight - TotalWeight;
+        else
+          EdgeWeights[SelfReferentialEdge] = 0;
+        VisitedEdges.insert(SelfReferentialEdge);
+        Changed = true;
+        DEBUG(dbgs() << "Set self-referential edge weight to: ";
+              printEdgeWeight(dbgs(), SelfReferentialEdge));
+      }
+    }
+  }
+
+  return Changed;
+}
+
+/// \brief Build in/out edge lists for each basic block in the CFG.
+///
+/// We are interested in unique edges. If a block B1 has multiple
+/// edges to another block B2, we only add a single B1->B2 edge.
+void SampleFunctionProfile::buildEdges(Function &F) {
+  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+    BasicBlock *B1 = I;
 
-  // Clear the block weights cache.
-  FProfile.BlockWeights.clear();
+    // Add predecessors for B1.
+    SmallPtrSet<BasicBlock *, 16> Visited;
+    if (!Predecessors[B1].empty())
+      llvm_unreachable("Found a stale predecessors list in a basic block.");
+    for (pred_iterator PI = pred_begin(B1), PE = pred_end(B1); PI != PE; ++PI) {
+      BasicBlock *B2 = *PI;
+      if (Visited.insert(B2))
+        Predecessors[B1].push_back(B2);
+    }
+
+    // Add successors for B1.
+    Visited.clear();
+    if (!Successors[B1].empty())
+      llvm_unreachable("Found a stale successors list in a basic block.");
+    for (succ_iterator SI = succ_begin(B1), SE = succ_end(B1); SI != SE; ++SI) {
+      BasicBlock *B2 = *SI;
+      if (Visited.insert(B2))
+        Successors[B1].push_back(B2);
+    }
+  }
+}
+
+/// \brief Propagate weights into edges
+///
+/// The following rules are applied to every block B in the CFG:
+///
+/// - If B has a single predecessor/successor, then the weight
+///   of that edge is the weight of the block.
+///
+/// - If all incoming or outgoing edges are known except one, and the
+///   weight of the block is already known, the weight of the unknown
+///   edge will be the weight of the block minus the sum of all the known
+///   edges. If the sum of all the known edges is larger than B's weight,
+///   we set the unknown edge weight to zero.
+///
+/// - If there is a self-referential edge, and the weight of the block is
+///   known, the weight for that edge is set to the weight of the block
+///   minus the weight of the other incoming edges to that block (if
+///   known).
+void SampleFunctionProfile::propagateWeights(Function &F) {
+  bool Changed = true;
+  unsigned i = 0;
+
+  // Before propagation starts, build, for each block, a list of
+  // unique predecessors and successors. This is necessary to handle
+  // identical edges in multiway branches. Since we visit all blocks and all
+  // edges of the CFG, it is cleaner to build these lists once at the start
+  // of the pass.
+  buildEdges(F);
 
-  // When we find a branch instruction: For each edge E out of the branch,
-  // the weight of E is the weight of the target block.
+  // Propagate until we converge or we go past the iteration limit.
+  while (Changed && i++ < SampleProfileMaxPropagateIterations) {
+    Changed = propagateThroughEdges(F);
+  }
+
+  // Generate MD_prof metadata for every branch instruction using the
+  // edge weights computed during propagation.
+  DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n");
+  MDBuilder MDB(F.getContext());
   for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
     BasicBlock *B = I;
     TerminatorInst *TI = B->getTerminator();
@@ -439,34 +927,155 @@ bool SampleProfile::emitAnnotations(Function &F) {
     if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI))
       continue;
 
-    SmallVector<uint32_t, 4> Weights;
-    unsigned NSuccs = TI->getNumSuccessors();
-    for (unsigned I = 0; I < NSuccs; ++I) {
+    DEBUG(dbgs() << "\nGetting weights for branch at line "
+                 << TI->getDebugLoc().getLine() << ".\n");
+    SmallVector<unsigned, 4> Weights;
+    bool AllWeightsZero = true;
+    for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
       BasicBlock *Succ = TI->getSuccessor(I);
-      uint32_t Weight =
-          computeBlockWeight(Succ, FirstLineno, FProfile.BodySamples);
+      Edge E = std::make_pair(B, Succ);
+      unsigned Weight = EdgeWeights[E];
+      DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E));
       Weights.push_back(Weight);
+      if (Weight != 0)
+        AllWeightsZero = false;
     }
 
-    TI->setMetadata(llvm::LLVMContext::MD_prof,
-                    MDB.createBranchWeights(Weights));
-    Changed = true;
+    // Only set weights if there is at least one non-zero weight.
+    // In any other case, let the analyzer set weights.
+    if (!AllWeightsZero) {
+      DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
+      TI->setMetadata(llvm::LLVMContext::MD_prof,
+                      MDB.createBranchWeights(Weights));
+    } else {
+      DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");
+    }
   }
+}
 
-  return Changed;
+/// \brief Get the line number for the function header.
+///
+/// This looks up function \p F in the current compilation unit and
+/// retrieves the line number where the function is defined. This is
+/// line 0 for all the samples read from the profile file. Every line
+/// number is relative to this line.
+///
+/// \param F  Function object to query.
+///
+/// \returns the line number where \p F is defined. If it returns 0,
+///          it means that there is no debug information available for \p F.
+unsigned SampleFunctionProfile::getFunctionLoc(Function &F) {
+  NamedMDNode *CUNodes = F.getParent()->getNamedMetadata("llvm.dbg.cu");
+  if (CUNodes) {
+    for (unsigned I = 0, E1 = CUNodes->getNumOperands(); I != E1; ++I) {
+      DICompileUnit CU(CUNodes->getOperand(I));
+      DIArray Subprograms = CU.getSubprograms();
+      for (unsigned J = 0, E2 = Subprograms.getNumElements(); J != E2; ++J) {
+        DISubprogram Subprogram(Subprograms.getElement(J));
+        if (Subprogram.describes(&F))
+          return Subprogram.getLineNumber();
+      }
+    }
+  }
+
+  F.getContext().diagnose(DiagnosticInfoSampleProfile(
+      "No debug information found in function " + F.getName()));
+  return 0;
 }
 
-char SampleProfileLoader::ID = 0;
-INITIALIZE_PASS(SampleProfileLoader, "sample-profile", "Sample Profile loader",
-                false, false)
+/// \brief Generate branch weight metadata for all branches in \p F.
+///
+/// Branch weights are computed out of instruction samples using a
+/// propagation heuristic. Propagation proceeds in 3 phases:
+///
+/// 1- Assignment of block weights. All the basic blocks in the function
+///    are initial assigned the same weight as their most frequently
+///    executed instruction.
+///
+/// 2- Creation of equivalence classes. Since samples may be missing from
+///    blocks, we can fill in the gaps by setting the weights of all the
+///    blocks in the same equivalence class to the same weight. To compute
+///    the concept of equivalence, we use dominance and loop information.
+///    Two blocks B1 and B2 are in the same equivalence class if B1
+///    dominates B2, B2 post-dominates B1 and both are in the same loop.
+///
+/// 3- Propagation of block weights into edges. This uses a simple
+///    propagation heuristic. The following rules are applied to every
+///    block B in the CFG:
+///
+///    - If B has a single predecessor/successor, then the weight
+///      of that edge is the weight of the block.
+///
+///    - If all the edges are known except one, and the weight of the
+///      block is already known, the weight of the unknown edge will
+///      be the weight of the block minus the sum of all the known
+///      edges. If the sum of all the known edges is larger than B's weight,
+///      we set the unknown edge weight to zero.
+///
+///    - If there is a self-referential edge, and the weight of the block is
+///      known, the weight for that edge is set to the weight of the block
+///      minus the weight of the other incoming edges to that block (if
+///      known).
+///
+/// Since this propagation is not guaranteed to finalize for every CFG, we
+/// only allow it to proceed for a limited number of iterations (controlled
+/// by -sample-profile-max-propagate-iterations).
+///
+/// FIXME: Try to replace this propagation heuristic with a scheme
+/// that is guaranteed to finalize. A work-list approach similar to
+/// the standard value propagation algorithm used by SSA-CCP might
+/// work here.
+///
+/// Once all the branch weights are computed, we emit the MD_prof
+/// metadata on B using the computed values for each of its branches.
+///
+/// \param F The function to query.
+///
+/// \returns true if \p F was modified. Returns false, otherwise.
+bool SampleFunctionProfile::emitAnnotations(Function &F, DominatorTree *DomTree,
+                                            PostDominatorTree *PostDomTree,
+                                            LoopInfo *Loops) {
+  bool Changed = false;
 
-bool SampleProfileLoader::runOnFunction(Function &F) {
-  return Profiler->emitAnnotations(F);
+  // Initialize invariants used during computation and propagation.
+  HeaderLineno = getFunctionLoc(F);
+  if (HeaderLineno == 0)
+    return false;
+
+  DEBUG(dbgs() << "Line number for the first instruction in " << F.getName()
+               << ": " << HeaderLineno << "\n");
+  DT = DomTree;
+  PDT = PostDomTree;
+  LI = Loops;
+  Ctx = &F.getParent()->getContext();
+
+  // Compute basic block weights.
+  Changed |= computeBlockWeights(F);
+
+  if (Changed) {
+    // Find equivalence classes.
+    findEquivalenceClasses(F);
+
+    // Propagate weights to all edges.
+    propagateWeights(F);
+  }
+
+  return Changed;
 }
 
+char SampleProfileLoader::ID = 0;
+INITIALIZE_PASS_BEGIN(SampleProfileLoader, "sample-profile",
+                      "Sample Profile loader", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(AddDiscriminators)
+INITIALIZE_PASS_END(SampleProfileLoader, "sample-profile",
+                    "Sample Profile loader", false, false)
+
 bool SampleProfileLoader::doInitialization(Module &M) {
-  Profiler.reset(new SampleProfile(Filename));
-  Profiler->loadText();
+  Profiler.reset(new SampleModuleProfile(M, Filename));
+  ProfileIsValid = Profiler->loadText();
   return true;
 }
 
@@ -477,3 +1086,15 @@ FunctionPass *llvm::createSampleProfileLoaderPass() {
 FunctionPass *llvm::createSampleProfileLoaderPass(StringRef Name) {
   return new SampleProfileLoader(Name);
 }
+
+bool SampleProfileLoader::runOnFunction(Function &F) {
+  if (!ProfileIsValid)
+    return false;
+  DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  PostDominatorTree *PDT = &getAnalysis<PostDominatorTree>();
+  LoopInfo *LI = &getAnalysis<LoopInfo>();
+  SampleFunctionProfile &FunctionProfile = Profiler->getProfile(F);
+  if (!FunctionProfile.empty())
+    return FunctionProfile.emitAnnotations(F, DT, PDT, LI);
+  return false;
+}
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index 857597e..e950eba 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -17,8 +17,8 @@
 #include "llvm-c/Initialization.h"
 #include "llvm-c/Transforms/Scalar.h"
 #include "llvm/Analysis/Passes.h"
-#include "llvm/Analysis/Verifier.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Verifier.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/PassManager.h"
 
@@ -29,11 +29,12 @@ using namespace llvm;
 void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeADCEPass(Registry);
   initializeSampleProfileLoaderPass(Registry);
-  initializeCodeGenPreparePass(Registry);
+  initializeConstantHoistingPass(Registry);
   initializeConstantPropagationPass(Registry);
   initializeCorrelatedValuePropagationPass(Registry);
   initializeDCEPass(Registry);
   initializeDeadInstEliminationPass(Registry);
+  initializeScalarizerPass(Registry);
   initializeDSEPass(Registry);
   initializeGVNPass(Registry);
   initializeEarlyCSEPass(Registry);
@@ -81,6 +82,10 @@ void LLVMAddDeadStoreEliminationPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createDeadStoreEliminationPass());
 }
 
+void LLVMAddScalarizerPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createScalarizerPass());
+}
+
 void LLVMAddGVNPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createGVNPass());
 }
diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
index 57b290e..e7b5ab2 100644
--- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp
+++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -24,15 +24,17 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/ValueTracking.h"
-#include "llvm/DIBuilder.h"
-#include "llvm/DebugInfo.h"
+#include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
@@ -41,10 +43,8 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/Pass.h"
-#include "llvm/Support/CallSite.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/GetElementPtrTypeIterator.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -80,14 +80,14 @@ namespace {
         ScalarLoadThreshold = SLT;
     }
 
-    bool runOnFunction(Function &F);
+    bool runOnFunction(Function &F) override;
 
     bool performScalarRepl(Function &F);
     bool performPromotion(Function &F);
 
   private:
     bool HasDomTree;
-    DataLayout *TD;
+    const DataLayout *DL;
 
     /// DeadInsts - Keep track of instructions we have made dead, so that
     /// we can remove them after we are done working.
@@ -195,8 +195,8 @@ namespace {
 
     // getAnalysisUsage - This pass does not require any passes, but we know it
     // will not alter the CFG, so say so.
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.addRequired<DominatorTree>();
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<DominatorTreeWrapperPass>();
       AU.setPreservesCFG();
     }
   };
@@ -212,7 +212,7 @@ namespace {
 
     // getAnalysisUsage - This pass does not require any passes, but we know it
     // will not alter the CFG, so say so.
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesCFG();
     }
   };
@@ -224,7 +224,7 @@ char SROA_SSAUp::ID = 0;
 
 INITIALIZE_PASS_BEGIN(SROA_DT, "scalarrepl",
                 "Scalar Replacement of Aggregates (DT)", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_END(SROA_DT, "scalarrepl",
                 "Scalar Replacement of Aggregates (DT)", false, false)
 
@@ -258,7 +258,7 @@ namespace {
 class ConvertToScalarInfo {
   /// AllocaSize - The size of the alloca being considered in bytes.
   unsigned AllocaSize;
-  const DataLayout &TD;
+  const DataLayout &DL;
   unsigned ScalarLoadThreshold;
 
   /// IsNotTrivial - This is set to true if there is some access to the object
@@ -301,9 +301,9 @@ class ConvertToScalarInfo {
   bool HadDynamicAccess;
 
 public:
-  explicit ConvertToScalarInfo(unsigned Size, const DataLayout &td,
+  explicit ConvertToScalarInfo(unsigned Size, const DataLayout &DL,
                                unsigned SLT)
-    : AllocaSize(Size), TD(td), ScalarLoadThreshold(SLT), IsNotTrivial(false),
+    : AllocaSize(Size), DL(DL), ScalarLoadThreshold(SLT), IsNotTrivial(false),
     ScalarKind(Unknown), VectorTy(0), HadNonMemTransferAccess(false),
     HadDynamicAccess(false) { }
 
@@ -364,7 +364,7 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) {
       return 0;
 
     if ((ScalarKind == ImplicitVector || ScalarKind == Integer) &&
-        !HadNonMemTransferAccess && !TD.fitsInLegalInteger(BitWidth))
+        !HadNonMemTransferAccess && !DL.fitsInLegalInteger(BitWidth))
       return 0;
     // Dynamic accesses on integers aren't yet supported.  They need us to shift
     // by a dynamic amount which could be difficult to work out as we might not
@@ -466,10 +466,10 @@ bool ConvertToScalarInfo::MergeInVectorType(VectorType *VInTy,
 /// SawVec flag.
 bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset,
                                              Value* NonConstantIdx) {
-  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) {
-    Instruction *User = cast<Instruction>(*UI);
+  for (User *U : V->users()) {
+    Instruction *UI = cast<Instruction>(U);
 
-    if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+    if (LoadInst *LI = dyn_cast<LoadInst>(UI)) {
       // Don't break volatile loads.
       if (!LI->isSimple())
         return false;
@@ -481,7 +481,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset,
       continue;
     }
 
-    if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+    if (StoreInst *SI = dyn_cast<StoreInst>(UI)) {
       // Storing the pointer, not into the value?
       if (SI->getOperand(0) == V || !SI->isSimple()) return false;
       // Don't touch MMX operations.
@@ -492,7 +492,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset,
       continue;
     }
 
-    if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) {
+    if (BitCastInst *BCI = dyn_cast<BitCastInst>(UI)) {
       if (!onlyUsedByLifetimeMarkers(BCI))
         IsNotTrivial = true;  // Can't be mem2reg'd.
       if (!CanConvertToScalar(BCI, Offset, NonConstantIdx))
@@ -500,7 +500,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset,
       continue;
     }
 
-    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) {
+    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UI)) {
       // If this is a GEP with a variable indices, we can't handle it.
       PointerType* PtrTy = dyn_cast<PointerType>(GEP->getPointerOperandType());
       if (!PtrTy)
@@ -520,7 +520,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset,
         HadDynamicAccess = true;
       } else
         GEPNonConstantIdx = NonConstantIdx;
-      uint64_t GEPOffset = TD.getIndexedOffset(PtrTy,
+      uint64_t GEPOffset = DL.getIndexedOffset(PtrTy,
                                                Indices);
       // See if all uses can be converted.
       if (!CanConvertToScalar(GEP, Offset+GEPOffset, GEPNonConstantIdx))
@@ -532,7 +532,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset,
 
     // If this is a constant sized memset of a constant value (e.g. 0) we can
     // handle it.
-    if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) {
+    if (MemSetInst *MSI = dyn_cast<MemSetInst>(UI)) {
       // Store to dynamic index.
       if (NonConstantIdx)
         return false;
@@ -559,7 +559,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset,
 
     // If this is a memcpy or memmove into or out of the whole allocation, we
     // can handle it like a load or store of the scalar type.
-    if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) {
+    if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(UI)) {
       // Store to dynamic index.
       if (NonConstantIdx)
         return false;
@@ -572,7 +572,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset,
     }
 
     // If this is a lifetime intrinsic, we can handle it.
-    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) {
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(UI)) {
       if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
           II->getIntrinsicID() == Intrinsic::lifetime_end) {
         continue;
@@ -597,7 +597,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
                                               uint64_t Offset,
                                               Value* NonConstantIdx) {
   while (!Ptr->use_empty()) {
-    Instruction *User = cast<Instruction>(Ptr->use_back());
+    Instruction *User = cast<Instruction>(Ptr->user_back());
 
     if (BitCastInst *CI = dyn_cast<BitCastInst>(User)) {
       ConvertUsesToScalar(CI, NewAI, Offset, NonConstantIdx);
@@ -615,7 +615,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
         GEPNonConstantIdx = Indices.pop_back_val();
       } else
         GEPNonConstantIdx = NonConstantIdx;
-      uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(),
+      uint64_t GEPOffset = DL.getIndexedOffset(GEP->getPointerOperandType(),
                                                Indices);
       ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8, GEPNonConstantIdx);
       GEP->eraseFromParent();
@@ -692,9 +692,9 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
       // If the source and destination are both to the same alloca, then this is
       // a noop copy-to-self, just delete it.  Otherwise, emit a load and store
       // as appropriate.
-      AllocaInst *OrigAI = cast<AllocaInst>(GetUnderlyingObject(Ptr, &TD, 0));
+      AllocaInst *OrigAI = cast<AllocaInst>(GetUnderlyingObject(Ptr, &DL, 0));
 
-      if (GetUnderlyingObject(MTI->getSource(), &TD, 0) != OrigAI) {
+      if (GetUnderlyingObject(MTI->getSource(), &DL, 0) != OrigAI) {
         // Dest must be OrigAI, change this to be a load from the original
         // pointer (bitcasted), then a store to our new alloca.
         assert(MTI->getRawDest() == Ptr && "Neither use is of pointer?");
@@ -710,7 +710,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
         LoadInst *SrcVal = Builder.CreateLoad(SrcPtr, "srcval");
         SrcVal->setAlignment(MTI->getAlignment());
         Builder.CreateStore(SrcVal, NewAI);
-      } else if (GetUnderlyingObject(MTI->getDest(), &TD, 0) != OrigAI) {
+      } else if (GetUnderlyingObject(MTI->getDest(), &DL, 0) != OrigAI) {
         // Src must be OrigAI, change this to be a load from NewAI then a store
         // through the original dest pointer (bitcasted).
         assert(MTI->getRawSource() == Ptr && "Neither use is of pointer?");
@@ -770,15 +770,15 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType,
   // If the result alloca is a vector type, this is either an element
   // access or a bitcast to another vector type of the same size.
   if (VectorType *VTy = dyn_cast<VectorType>(FromType)) {
-    unsigned FromTypeSize = TD.getTypeAllocSize(FromType);
-    unsigned ToTypeSize = TD.getTypeAllocSize(ToType);
+    unsigned FromTypeSize = DL.getTypeAllocSize(FromType);
+    unsigned ToTypeSize = DL.getTypeAllocSize(ToType);
     if (FromTypeSize == ToTypeSize)
         return Builder.CreateBitCast(FromVal, ToType);
 
     // Otherwise it must be an element access.
     unsigned Elt = 0;
     if (Offset) {
-      unsigned EltSize = TD.getTypeAllocSizeInBits(VTy->getElementType());
+      unsigned EltSize = DL.getTypeAllocSizeInBits(VTy->getElementType());
       Elt = Offset/EltSize;
       assert(EltSize*Elt == Offset && "Invalid modulus in validity checking");
     }
@@ -804,7 +804,7 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType,
   if (StructType *ST = dyn_cast<StructType>(ToType)) {
     assert(!NonConstantIdx &&
            "Dynamic indexing into struct types not supported");
-    const StructLayout &Layout = *TD.getStructLayout(ST);
+    const StructLayout &Layout = *DL.getStructLayout(ST);
     Value *Res = UndefValue::get(ST);
     for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
       Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i),
@@ -818,7 +818,7 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType,
   if (ArrayType *AT = dyn_cast<ArrayType>(ToType)) {
     assert(!NonConstantIdx &&
            "Dynamic indexing into array types not supported");
-    uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType());
+    uint64_t EltSize = DL.getTypeAllocSizeInBits(AT->getElementType());
     Value *Res = UndefValue::get(AT);
     for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
       Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(),
@@ -834,12 +834,12 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType,
   // If this is a big-endian system and the load is narrower than the
   // full alloca type, we need to do a shift to get the right bits.
   int ShAmt = 0;
-  if (TD.isBigEndian()) {
+  if (DL.isBigEndian()) {
     // On big-endian machines, the lowest bit is stored at the bit offset
     // from the pointer given by getTypeStoreSizeInBits.  This matters for
     // integers with a bitwidth that is not a multiple of 8.
-    ShAmt = TD.getTypeStoreSizeInBits(NTy) -
-            TD.getTypeStoreSizeInBits(ToType) - Offset;
+    ShAmt = DL.getTypeStoreSizeInBits(NTy) -
+            DL.getTypeStoreSizeInBits(ToType) - Offset;
   } else {
     ShAmt = Offset;
   }
@@ -855,7 +855,7 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType,
                                 ConstantInt::get(FromVal->getType(), -ShAmt));
 
   // Finally, unconditionally truncate the integer to the right width.
-  unsigned LIBitWidth = TD.getTypeSizeInBits(ToType);
+  unsigned LIBitWidth = DL.getTypeSizeInBits(ToType);
   if (LIBitWidth < NTy->getBitWidth())
     FromVal =
       Builder.CreateTrunc(FromVal, IntegerType::get(FromVal->getContext(),
@@ -902,8 +902,8 @@ ConvertScalar_InsertValue(Value *SV, Value *Old,
   LLVMContext &Context = Old->getContext();
 
   if (VectorType *VTy = dyn_cast<VectorType>(AllocaType)) {
-    uint64_t VecSize = TD.getTypeAllocSizeInBits(VTy);
-    uint64_t ValSize = TD.getTypeAllocSizeInBits(SV->getType());
+    uint64_t VecSize = DL.getTypeAllocSizeInBits(VTy);
+    uint64_t ValSize = DL.getTypeAllocSizeInBits(SV->getType());
 
     // Changing the whole vector with memset or with an access of a different
     // vector type?
@@ -914,7 +914,7 @@ ConvertScalar_InsertValue(Value *SV, Value *Old,
     Type *EltTy = VTy->getElementType();
     if (SV->getType() != EltTy)
       SV = Builder.CreateBitCast(SV, EltTy);
-    uint64_t EltSize = TD.getTypeAllocSizeInBits(EltTy);
+    uint64_t EltSize = DL.getTypeAllocSizeInBits(EltTy);
     unsigned Elt = Offset/EltSize;
     Value *Idx;
     if (NonConstantIdx) {
@@ -933,7 +933,7 @@ ConvertScalar_InsertValue(Value *SV, Value *Old,
   if (StructType *ST = dyn_cast<StructType>(SV->getType())) {
     assert(!NonConstantIdx &&
            "Dynamic indexing into struct types not supported");
-    const StructLayout &Layout = *TD.getStructLayout(ST);
+    const StructLayout &Layout = *DL.getStructLayout(ST);
     for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
       Value *Elt = Builder.CreateExtractValue(SV, i);
       Old = ConvertScalar_InsertValue(Elt, Old,
@@ -946,7 +946,7 @@ ConvertScalar_InsertValue(Value *SV, Value *Old,
   if (ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) {
     assert(!NonConstantIdx &&
            "Dynamic indexing into array types not supported");
-    uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType());
+    uint64_t EltSize = DL.getTypeAllocSizeInBits(AT->getElementType());
     for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
       Value *Elt = Builder.CreateExtractValue(SV, i);
       Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, 0, Builder);
@@ -956,14 +956,14 @@ ConvertScalar_InsertValue(Value *SV, Value *Old,
 
   // If SV is a float, convert it to the appropriate integer type.
   // If it is a pointer, do the same.
-  unsigned SrcWidth = TD.getTypeSizeInBits(SV->getType());
-  unsigned DestWidth = TD.getTypeSizeInBits(AllocaType);
-  unsigned SrcStoreWidth = TD.getTypeStoreSizeInBits(SV->getType());
-  unsigned DestStoreWidth = TD.getTypeStoreSizeInBits(AllocaType);
+  unsigned SrcWidth = DL.getTypeSizeInBits(SV->getType());
+  unsigned DestWidth = DL.getTypeSizeInBits(AllocaType);
+  unsigned SrcStoreWidth = DL.getTypeStoreSizeInBits(SV->getType());
+  unsigned DestStoreWidth = DL.getTypeStoreSizeInBits(AllocaType);
   if (SV->getType()->isFloatingPointTy() || SV->getType()->isVectorTy())
     SV = Builder.CreateBitCast(SV, IntegerType::get(SV->getContext(),SrcWidth));
   else if (SV->getType()->isPointerTy())
-    SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getType()));
+    SV = Builder.CreatePtrToInt(SV, DL.getIntPtrType(SV->getType()));
 
   // Zero extend or truncate the value if needed.
   if (SV->getType() != AllocaType) {
@@ -982,7 +982,7 @@ ConvertScalar_InsertValue(Value *SV, Value *Old,
   // If this is a big-endian system and the store is narrower than the
   // full alloca type, we need to do a shift to get the right bits.
   int ShAmt = 0;
-  if (TD.isBigEndian()) {
+  if (DL.isBigEndian()) {
     // On big-endian machines, the lowest bit is stored at the bit offset
     // from the pointer given by getTypeStoreSizeInBits.  This matters for
     // integers with a bitwidth that is not a multiple of 8.
@@ -1020,7 +1020,11 @@ ConvertScalar_InsertValue(Value *SV, Value *Old,
 
 
 bool SROA::runOnFunction(Function &F) {
-  TD = getAnalysisIfAvailable<DataLayout>();
+  if (skipOptnoneFunction(F))
+    return false;
+
+  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+  DL = DLP ? &DLP->getDataLayout() : 0;
 
   bool Changed = performPromotion(F);
 
@@ -1028,7 +1032,7 @@ bool SROA::runOnFunction(Function &F) {
   // theoretically needs to. It should be refactored in order to support
   // target-independent IR. Until this is done, just skip the actual
   // scalar-replacement portion of this pass.
-  if (!TD) return Changed;
+  if (!DL) return Changed;
 
   while (1) {
     bool LocalChange = performScalarRepl(F);
@@ -1056,11 +1060,10 @@ public:
     // Remember which alloca we're promoting (for isInstInList).
     this->AI = AI;
     if (MDNode *DebugNode = MDNode::getIfExists(AI->getContext(), AI)) {
-      for (Value::use_iterator UI = DebugNode->use_begin(),
-             E = DebugNode->use_end(); UI != E; ++UI)
-        if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI))
+      for (User *U : DebugNode->users())
+        if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U))
           DDIs.push_back(DDI);
-        else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(*UI))
+        else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U))
           DVIs.push_back(DVI);
     }
 
@@ -1078,14 +1081,14 @@ public:
     }
   }
 
-  virtual bool isInstInList(Instruction *I,
-                            const SmallVectorImpl<Instruction*> &Insts) const {
+  bool isInstInList(Instruction *I,
+                    const SmallVectorImpl<Instruction*> &Insts) const override {
     if (LoadInst *LI = dyn_cast<LoadInst>(I))
       return LI->getOperand(0) == AI;
     return cast<StoreInst>(I)->getPointerOperand() == AI;
   }
 
-  virtual void updateDebugInfo(Instruction *Inst) const {
+  void updateDebugInfo(Instruction *Inst) const override {
     for (SmallVectorImpl<DbgDeclareInst *>::const_iterator I = DDIs.begin(),
            E = DDIs.end(); I != E; ++I) {
       DbgDeclareInst *DDI = *I;
@@ -1134,22 +1137,21 @@ public:
 ///
 /// We can do this to a select if its only uses are loads and if the operand to
 /// the select can be loaded unconditionally.
-static bool isSafeSelectToSpeculate(SelectInst *SI, const DataLayout *TD) {
+static bool isSafeSelectToSpeculate(SelectInst *SI, const DataLayout *DL) {
   bool TDerefable = SI->getTrueValue()->isDereferenceablePointer();
   bool FDerefable = SI->getFalseValue()->isDereferenceablePointer();
 
-  for (Value::use_iterator UI = SI->use_begin(), UE = SI->use_end();
-       UI != UE; ++UI) {
-    LoadInst *LI = dyn_cast<LoadInst>(*UI);
+  for (User *U : SI->users()) {
+    LoadInst *LI = dyn_cast<LoadInst>(U);
     if (LI == 0 || !LI->isSimple()) return false;
 
     // Both operands to the select need to be dereferencable, either absolutely
     // (e.g. allocas) or at this point because we can see other accesses to it.
     if (!TDerefable && !isSafeToLoadUnconditionally(SI->getTrueValue(), LI,
-                                                    LI->getAlignment(), TD))
+                                                    LI->getAlignment(), DL))
       return false;
     if (!FDerefable && !isSafeToLoadUnconditionally(SI->getFalseValue(), LI,
-                                                    LI->getAlignment(), TD))
+                                                    LI->getAlignment(), DL))
       return false;
   }
 
@@ -1172,16 +1174,15 @@ static bool isSafeSelectToSpeculate(SelectInst *SI, const DataLayout *TD) {
 ///
 /// We can do this to a select if its only uses are loads and if the operand to
 /// the select can be loaded unconditionally.
-static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *TD) {
+static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *DL) {
   // For now, we can only do this promotion if the load is in the same block as
   // the PHI, and if there are no stores between the phi and load.
   // TODO: Allow recursive phi users.
   // TODO: Allow stores.
   BasicBlock *BB = PN->getParent();
   unsigned MaxAlign = 0;
-  for (Value::use_iterator UI = PN->use_begin(), UE = PN->use_end();
-       UI != UE; ++UI) {
-    LoadInst *LI = dyn_cast<LoadInst>(*UI);
+  for (User *U : PN->users()) {
+    LoadInst *LI = dyn_cast<LoadInst>(U);
     if (LI == 0 || !LI->isSimple()) return false;
 
     // For now we only allow loads in the same block as the PHI.  This is a
@@ -1222,7 +1223,7 @@ static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *TD) {
     // If this pointer is always safe to load, or if we can prove that there is
     // already a load in the block, then we can move the load to the pred block.
     if (InVal->isDereferenceablePointer() ||
-        isSafeToLoadUnconditionally(InVal, Pred->getTerminator(), MaxAlign, TD))
+        isSafeToLoadUnconditionally(InVal, Pred->getTerminator(), MaxAlign, DL))
       continue;
 
     return false;
@@ -1236,13 +1237,10 @@ static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *TD) {
 /// direct (non-volatile) loads and stores to it.  If the alloca is close but
 /// not quite there, this will transform the code to allow promotion.  As such,
 /// it is a non-pure predicate.
-static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *TD) {
+static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *DL) {
   SetVector<Instruction*, SmallVector<Instruction*, 4>,
             SmallPtrSet<Instruction*, 4> > InstsToRewrite;
-
-  for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end();
-       UI != UE; ++UI) {
-    User *U = *UI;
+  for (User *U : AI->users()) {
     if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
       if (!LI->isSimple())
         return false;
@@ -1265,12 +1263,12 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *TD) {
 
         // This is very rare and we just scrambled the use list of AI, start
         // over completely.
-        return tryToMakeAllocaBePromotable(AI, TD);
+        return tryToMakeAllocaBePromotable(AI, DL);
       }
 
       // If it is safe to turn "load (select c, AI, ptr)" into a select of two
       // loads, then we can transform this by rewriting the select.
-      if (!isSafeSelectToSpeculate(SI, TD))
+      if (!isSafeSelectToSpeculate(SI, DL))
         return false;
 
       InstsToRewrite.insert(SI);
@@ -1285,7 +1283,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *TD) {
 
       // If it is safe to turn "load (phi [AI, ptr, ...])" into a PHI of loads
       // in the pred blocks, then we can transform this by rewriting the PHI.
-      if (!isSafePHIToSpeculate(PN, TD))
+      if (!isSafePHIToSpeculate(PN, DL))
         return false;
 
       InstsToRewrite.insert(PN);
@@ -1312,12 +1310,9 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *TD) {
   for (unsigned i = 0, e = InstsToRewrite.size(); i != e; ++i) {
     if (BitCastInst *BCI = dyn_cast<BitCastInst>(InstsToRewrite[i])) {
       // This could only be a bitcast used by nothing but lifetime intrinsics.
-      for (BitCastInst::use_iterator I = BCI->use_begin(), E = BCI->use_end();
-           I != E;) {
-        Use &U = I.getUse();
-        ++I;
-        cast<Instruction>(U.getUser())->eraseFromParent();
-      }
+      for (BitCastInst::user_iterator I = BCI->user_begin(), E = BCI->user_end();
+           I != E;)
+        cast<Instruction>(*I++)->eraseFromParent();
       BCI->eraseFromParent();
       continue;
     }
@@ -1326,7 +1321,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *TD) {
       // Selects in InstsToRewrite only have load uses.  Rewrite each as two
       // loads with a new select.
       while (!SI->use_empty()) {
-        LoadInst *LI = cast<LoadInst>(SI->use_back());
+        LoadInst *LI = cast<LoadInst>(SI->user_back());
 
         IRBuilder<> Builder(LI);
         LoadInst *TrueLoad =
@@ -1367,13 +1362,13 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *TD) {
 
     // Get the TBAA tag and alignment to use from one of the loads.  It doesn't
     // matter which one we get and if any differ, it doesn't matter.
-    LoadInst *SomeLoad = cast<LoadInst>(PN->use_back());
+    LoadInst *SomeLoad = cast<LoadInst>(PN->user_back());
     MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa);
     unsigned Align = SomeLoad->getAlignment();
 
     // Rewrite all loads of the PN to use the new PHI.
     while (!PN->use_empty()) {
-      LoadInst *LI = cast<LoadInst>(PN->use_back());
+      LoadInst *LI = cast<LoadInst>(PN->user_back());
       LI->replaceAllUsesWith(NewPN);
       LI->eraseFromParent();
     }
@@ -1407,7 +1402,7 @@ bool SROA::performPromotion(Function &F) {
   std::vector<AllocaInst*> Allocas;
   DominatorTree *DT = 0;
   if (HasDomTree)
-    DT = &getAnalysis<DominatorTree>();
+    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
 
   BasicBlock &BB = F.getEntryBlock();  // Get the entry node for the function
   DIBuilder DIB(*F.getParent());
@@ -1420,7 +1415,7 @@ bool SROA::performPromotion(Function &F) {
     // the entry node
     for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
       if (AllocaInst *AI = dyn_cast<AllocaInst>(I))       // Is it an alloca?
-        if (tryToMakeAllocaBePromotable(AI, TD))
+        if (tryToMakeAllocaBePromotable(AI, DL))
           Allocas.push_back(AI);
 
     if (Allocas.empty()) break;
@@ -1433,9 +1428,8 @@ bool SROA::performPromotion(Function &F) {
         AllocaInst *AI = Allocas[i];
 
         // Build list of instructions to promote.
-        for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end();
-             UI != E; ++UI)
-          Insts.push_back(cast<Instruction>(*UI));
+        for (User *U : AI->users())
+          Insts.push_back(cast<Instruction>(U));
         AllocaPromoter(Insts, SSA, &DIB).run(AI, Insts);
         Insts.clear();
       }
@@ -1496,7 +1490,7 @@ bool SROA::performScalarRepl(Function &F) {
     // transform the allocation instruction if it is an array allocation
     // (allocations OF arrays are ok though), and an allocation of a scalar
     // value cannot be decomposed at all.
-    uint64_t AllocaSize = TD->getTypeAllocSize(AI->getAllocatedType());
+    uint64_t AllocaSize = DL->getTypeAllocSize(AI->getAllocatedType());
 
     // Do not promote [0 x %struct].
     if (AllocaSize == 0) continue;
@@ -1520,7 +1514,7 @@ bool SROA::performScalarRepl(Function &F) {
     // that we can't just check based on the type: the alloca may be of an i32
     // but that has pointer arithmetic to set byte 3 of it or something.
     if (AllocaInst *NewAI = ConvertToScalarInfo(
-              (unsigned)AllocaSize, *TD, ScalarLoadThreshold).TryConvert(AI)) {
+              (unsigned)AllocaSize, *DL, ScalarLoadThreshold).TryConvert(AI)) {
       NewAI->takeName(AI);
       AI->eraseFromParent();
       ++NumConverted;
@@ -1598,8 +1592,8 @@ void SROA::DeleteDeadInstructions() {
 /// referenced by this instruction.
 void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset,
                                AllocaInfo &Info) {
-  for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E; ++UI) {
-    Instruction *User = cast<Instruction>(*UI);
+  for (Use &U : I->uses()) {
+    Instruction *User = cast<Instruction>(U.getUser());
 
     if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) {
       isSafeForScalarRepl(BC, Offset, Info);
@@ -1616,13 +1610,13 @@ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset,
         return MarkUnsafe(Info, User);
 
       isSafeMemAccess(Offset, Length->getZExtValue(), 0,
-                      UI.getOperandNo() == 0, Info, MI,
+                      U.getOperandNo() == 0, Info, MI,
                       true /*AllowWholeAccess*/);
     } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
       if (!LI->isSimple())
         return MarkUnsafe(Info, User);
       Type *LIType = LI->getType();
-      isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType),
+      isSafeMemAccess(Offset, DL->getTypeAllocSize(LIType),
                       LIType, false, Info, LI, true /*AllowWholeAccess*/);
       Info.hasALoadOrStore = true;
 
@@ -1632,7 +1626,7 @@ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset,
         return MarkUnsafe(Info, User);
 
       Type *SIType = SI->getOperand(0)->getType();
-      isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType),
+      isSafeMemAccess(Offset, DL->getTypeAllocSize(SIType),
                       SIType, true, Info, SI, true /*AllowWholeAccess*/);
       Info.hasALoadOrStore = true;
     } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) {
@@ -1665,39 +1659,39 @@ void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset,
     if (!Info.CheckedPHIs.insert(PN))
       return;
 
-  for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E; ++UI) {
-    Instruction *User = cast<Instruction>(*UI);
+  for (User *U : I->users()) {
+    Instruction *UI = cast<Instruction>(U);
 
-    if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) {
+    if (BitCastInst *BC = dyn_cast<BitCastInst>(UI)) {
       isSafePHISelectUseForScalarRepl(BC, Offset, Info);
-    } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) {
+    } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(UI)) {
       // Only allow "bitcast" GEPs for simplicity.  We could generalize this,
       // but would have to prove that we're staying inside of an element being
       // promoted.
       if (!GEPI->hasAllZeroIndices())
-        return MarkUnsafe(Info, User);
+        return MarkUnsafe(Info, UI);
       isSafePHISelectUseForScalarRepl(GEPI, Offset, Info);
-    } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+    } else if (LoadInst *LI = dyn_cast<LoadInst>(UI)) {
       if (!LI->isSimple())
-        return MarkUnsafe(Info, User);
+        return MarkUnsafe(Info, UI);
       Type *LIType = LI->getType();
-      isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType),
+      isSafeMemAccess(Offset, DL->getTypeAllocSize(LIType),
                       LIType, false, Info, LI, false /*AllowWholeAccess*/);
       Info.hasALoadOrStore = true;
 
-    } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(UI)) {
       // Store is ok if storing INTO the pointer, not storing the pointer
       if (!SI->isSimple() || SI->getOperand(0) == I)
-        return MarkUnsafe(Info, User);
+        return MarkUnsafe(Info, UI);
 
       Type *SIType = SI->getOperand(0)->getType();
-      isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType),
+      isSafeMemAccess(Offset, DL->getTypeAllocSize(SIType),
                       SIType, true, Info, SI, false /*AllowWholeAccess*/);
       Info.hasALoadOrStore = true;
-    } else if (isa<PHINode>(User) || isa<SelectInst>(User)) {
-      isSafePHISelectUseForScalarRepl(User, Offset, Info);
+    } else if (isa<PHINode>(UI) || isa<SelectInst>(UI)) {
+      isSafePHISelectUseForScalarRepl(UI, Offset, Info);
     } else {
-      return MarkUnsafe(Info, User);
+      return MarkUnsafe(Info, UI);
     }
     if (Info.isUnsafe) return;
   }
@@ -1731,12 +1725,12 @@ void SROA::isSafeGEP(GetElementPtrInst *GEPI,
   // Compute the offset due to this GEP and check if the alloca has a
   // component element at that offset.
   SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end());
-  // If this GEP is non constant then the last operand must have been a
+  // If this GEP is non-constant then the last operand must have been a
   // dynamic index into a vector.  Pop this now as it has no impact on the
   // constant part of the offset.
   if (NonConstant)
     Indices.pop_back();
-  Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), Indices);
+  Offset += DL->getIndexedOffset(GEPI->getPointerOperandType(), Indices);
   if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset,
                         NonConstantIdxSize))
     MarkUnsafe(Info, GEPI);
@@ -1795,7 +1789,7 @@ void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize,
                            bool AllowWholeAccess) {
   // Check if this is a load/store of the entire alloca.
   if (Offset == 0 && AllowWholeAccess &&
-      MemSize == TD->getTypeAllocSize(Info.AI->getAllocatedType())) {
+      MemSize == DL->getTypeAllocSize(Info.AI->getAllocatedType())) {
     // This can be safe for MemIntrinsics (where MemOpType is 0) and integer
     // loads/stores (which are essentially the same as the MemIntrinsics with
     // regard to copying padding between elements).  But, if an alloca is
@@ -1832,20 +1826,20 @@ bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size) {
   Type *EltTy;
   uint64_t EltSize;
   if (StructType *ST = dyn_cast<StructType>(T)) {
-    const StructLayout *Layout = TD->getStructLayout(ST);
+    const StructLayout *Layout = DL->getStructLayout(ST);
     unsigned EltIdx = Layout->getElementContainingOffset(Offset);
     EltTy = ST->getContainedType(EltIdx);
-    EltSize = TD->getTypeAllocSize(EltTy);
+    EltSize = DL->getTypeAllocSize(EltTy);
     Offset -= Layout->getElementOffset(EltIdx);
   } else if (ArrayType *AT = dyn_cast<ArrayType>(T)) {
     EltTy = AT->getElementType();
-    EltSize = TD->getTypeAllocSize(EltTy);
+    EltSize = DL->getTypeAllocSize(EltTy);
     if (Offset >= AT->getNumElements() * EltSize)
       return false;
     Offset %= EltSize;
   } else if (VectorType *VT = dyn_cast<VectorType>(T)) {
     EltTy = VT->getElementType();
-    EltSize = TD->getTypeAllocSize(EltTy);
+    EltSize = DL->getTypeAllocSize(EltTy);
     if (Offset >= VT->getNumElements() * EltSize)
       return false;
     Offset %= EltSize;
@@ -1867,8 +1861,8 @@ bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size) {
 void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
                                 SmallVectorImpl<AllocaInst *> &NewElts) {
   for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E;) {
-    Use &TheUse = UI.getUse();
-    Instruction *User = cast<Instruction>(*UI++);
+    Use &TheUse = *UI++;
+    Instruction *User = cast<Instruction>(TheUse.getUser());
 
     if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) {
       RewriteBitCast(BC, AI, Offset, NewElts);
@@ -1884,7 +1878,7 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
       ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength());
       uint64_t MemSize = Length->getZExtValue();
       if (Offset == 0 &&
-          MemSize == TD->getTypeAllocSize(AI->getAllocatedType()))
+          MemSize == DL->getTypeAllocSize(AI->getAllocatedType()))
         RewriteMemIntrinUserOfAlloca(MI, I, AI, NewElts);
       // Otherwise the intrinsic can only touch a single element and the
       // address operand will be updated, so nothing else needs to be done.
@@ -1920,8 +1914,8 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
         LI->replaceAllUsesWith(Insert);
         DeadInsts.push_back(LI);
       } else if (LIType->isIntegerTy() &&
-                 TD->getTypeAllocSize(LIType) ==
-                 TD->getTypeAllocSize(AI->getAllocatedType())) {
+                 DL->getTypeAllocSize(LIType) ==
+                 DL->getTypeAllocSize(AI->getAllocatedType())) {
         // If this is a load of the entire alloca to an integer, rewrite it.
         RewriteLoadUserOfWholeAlloca(LI, AI, NewElts);
       }
@@ -1947,8 +1941,8 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
         }
         DeadInsts.push_back(SI);
       } else if (SIType->isIntegerTy() &&
-                 TD->getTypeAllocSize(SIType) ==
-                 TD->getTypeAllocSize(AI->getAllocatedType())) {
+                 DL->getTypeAllocSize(SIType) ==
+                 DL->getTypeAllocSize(AI->getAllocatedType())) {
         // If this is a store of the entire alloca from an integer, rewrite it.
         RewriteStoreUserOfWholeAlloca(SI, AI, NewElts);
       }
@@ -2010,7 +2004,7 @@ uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset,
                                     Type *&IdxTy) {
   uint64_t Idx = 0;
   if (StructType *ST = dyn_cast<StructType>(T)) {
-    const StructLayout *Layout = TD->getStructLayout(ST);
+    const StructLayout *Layout = DL->getStructLayout(ST);
     Idx = Layout->getElementContainingOffset(Offset);
     T = ST->getContainedType(Idx);
     Offset -= Layout->getElementOffset(Idx);
@@ -2018,7 +2012,7 @@ uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset,
     return Idx;
   } else if (ArrayType *AT = dyn_cast<ArrayType>(T)) {
     T = AT->getElementType();
-    uint64_t EltSize = TD->getTypeAllocSize(T);
+    uint64_t EltSize = DL->getTypeAllocSize(T);
     Idx = Offset / EltSize;
     Offset -= Idx * EltSize;
     IdxTy = Type::getInt64Ty(T->getContext());
@@ -2026,7 +2020,7 @@ uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset,
   }
   VectorType *VT = cast<VectorType>(T);
   T = VT->getElementType();
-  uint64_t EltSize = TD->getTypeAllocSize(T);
+  uint64_t EltSize = DL->getTypeAllocSize(T);
   Idx = Offset / EltSize;
   Offset -= Idx * EltSize;
   IdxTy = Type::getInt64Ty(T->getContext());
@@ -2047,7 +2041,7 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
   Value* NonConstantIdx = 0;
   if (!GEPI->hasAllConstantIndices())
     NonConstantIdx = Indices.pop_back_val();
-  Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), Indices);
+  Offset += DL->getIndexedOffset(GEPI->getPointerOperandType(), Indices);
 
   RewriteForScalarRepl(GEPI, AI, Offset, NewElts);
 
@@ -2118,7 +2112,7 @@ void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI,
     V = Builder.CreateGEP(V, Builder.getInt64(NewOffset));
 
     IdxTy = NewElts[Idx]->getAllocatedType();
-    uint64_t EltSize = TD->getTypeAllocSize(IdxTy) - NewOffset;
+    uint64_t EltSize = DL->getTypeAllocSize(IdxTy) - NewOffset;
     if (EltSize > Size) {
       EltSize = Size;
       Size = 0;
@@ -2134,7 +2128,7 @@ void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI,
 
   for (; Idx != NewElts.size() && Size; ++Idx) {
     IdxTy = NewElts[Idx]->getAllocatedType();
-    uint64_t EltSize = TD->getTypeAllocSize(IdxTy);
+    uint64_t EltSize = DL->getTypeAllocSize(IdxTy);
     if (EltSize > Size) {
       EltSize = Size;
       Size = 0;
@@ -2226,10 +2220,10 @@ SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
       PointerType *OtherPtrTy = cast<PointerType>(OtherPtr->getType());
       Type *OtherTy = OtherPtrTy->getElementType();
       if (StructType *ST = dyn_cast<StructType>(OtherTy)) {
-        EltOffset = TD->getStructLayout(ST)->getElementOffset(i);
+        EltOffset = DL->getStructLayout(ST)->getElementOffset(i);
       } else {
         Type *EltTy = cast<SequentialType>(OtherTy)->getElementType();
-        EltOffset = TD->getTypeAllocSize(EltTy)*i;
+        EltOffset = DL->getTypeAllocSize(EltTy)*i;
       }
 
       // The alignment of the other pointer is the guaranteed alignment of the
@@ -2270,7 +2264,7 @@ SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
           Type *ValTy = EltTy->getScalarType();
 
           // Construct an integer with the right value.
-          unsigned EltSize = TD->getTypeSizeInBits(ValTy);
+          unsigned EltSize = DL->getTypeSizeInBits(ValTy);
           APInt OneVal(EltSize, CI->getZExtValue());
           APInt TotalVal(OneVal);
           // Set each byte.
@@ -2300,7 +2294,7 @@ SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
       // this element.
     }
 
-    unsigned EltSize = TD->getTypeAllocSize(EltTy);
+    unsigned EltSize = DL->getTypeAllocSize(EltTy);
     if (!EltSize)
       continue;
 
@@ -2334,12 +2328,12 @@ SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
   // and store the element value to the individual alloca.
   Value *SrcVal = SI->getOperand(0);
   Type *AllocaEltTy = AI->getAllocatedType();
-  uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy);
+  uint64_t AllocaSizeBits = DL->getTypeAllocSizeInBits(AllocaEltTy);
 
   IRBuilder<> Builder(SI);
 
   // Handle tail padding by extending the operand
-  if (TD->getTypeSizeInBits(SrcVal->getType()) != AllocaSizeBits)
+  if (DL->getTypeSizeInBits(SrcVal->getType()) != AllocaSizeBits)
     SrcVal = Builder.CreateZExt(SrcVal,
                             IntegerType::get(SI->getContext(), AllocaSizeBits));
 
@@ -2349,15 +2343,15 @@ SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
   // There are two forms here: AI could be an array or struct.  Both cases
   // have different ways to compute the element offset.
   if (StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) {
-    const StructLayout *Layout = TD->getStructLayout(EltSTy);
+    const StructLayout *Layout = DL->getStructLayout(EltSTy);
 
     for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
       // Get the number of bits to shift SrcVal to get the value.
       Type *FieldTy = EltSTy->getElementType(i);
       uint64_t Shift = Layout->getElementOffsetInBits(i);
 
-      if (TD->isBigEndian())
-        Shift = AllocaSizeBits-Shift-TD->getTypeAllocSizeInBits(FieldTy);
+      if (DL->isBigEndian())
+        Shift = AllocaSizeBits-Shift-DL->getTypeAllocSizeInBits(FieldTy);
 
       Value *EltVal = SrcVal;
       if (Shift) {
@@ -2366,7 +2360,7 @@ SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
       }
 
       // Truncate down to an integer of the right size.
-      uint64_t FieldSizeBits = TD->getTypeSizeInBits(FieldTy);
+      uint64_t FieldSizeBits = DL->getTypeSizeInBits(FieldTy);
 
       // Ignore zero sized fields like {}, they obviously contain no data.
       if (FieldSizeBits == 0) continue;
@@ -2391,12 +2385,12 @@ SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
   } else {
     ArrayType *ATy = cast<ArrayType>(AllocaEltTy);
     Type *ArrayEltTy = ATy->getElementType();
-    uint64_t ElementOffset = TD->getTypeAllocSizeInBits(ArrayEltTy);
-    uint64_t ElementSizeBits = TD->getTypeSizeInBits(ArrayEltTy);
+    uint64_t ElementOffset = DL->getTypeAllocSizeInBits(ArrayEltTy);
+    uint64_t ElementSizeBits = DL->getTypeSizeInBits(ArrayEltTy);
 
     uint64_t Shift;
 
-    if (TD->isBigEndian())
+    if (DL->isBigEndian())
       Shift = AllocaSizeBits-ElementOffset;
     else
       Shift = 0;
@@ -2430,7 +2424,7 @@ SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
       }
       new StoreInst(EltVal, DestField, SI);
 
-      if (TD->isBigEndian())
+      if (DL->isBigEndian())
         Shift -= ElementOffset;
       else
         Shift += ElementOffset;
@@ -2448,7 +2442,7 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
   // Extract each element out of the NewElts according to its structure offset
   // and form the result value.
   Type *AllocaEltTy = AI->getAllocatedType();
-  uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy);
+  uint64_t AllocaSizeBits = DL->getTypeAllocSizeInBits(AllocaEltTy);
 
   DEBUG(dbgs() << "PROMOTING LOAD OF WHOLE ALLOCA: " << *AI << '\n' << *LI
                << '\n');
@@ -2458,10 +2452,10 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
   const StructLayout *Layout = 0;
   uint64_t ArrayEltBitOffset = 0;
   if (StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) {
-    Layout = TD->getStructLayout(EltSTy);
+    Layout = DL->getStructLayout(EltSTy);
   } else {
     Type *ArrayEltTy = cast<ArrayType>(AllocaEltTy)->getElementType();
-    ArrayEltBitOffset = TD->getTypeAllocSizeInBits(ArrayEltTy);
+    ArrayEltBitOffset = DL->getTypeAllocSizeInBits(ArrayEltTy);
   }
 
   Value *ResultVal =
@@ -2473,7 +2467,7 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
     Value *SrcField = NewElts[i];
     Type *FieldTy =
       cast<PointerType>(SrcField->getType())->getElementType();
-    uint64_t FieldSizeBits = TD->getTypeSizeInBits(FieldTy);
+    uint64_t FieldSizeBits = DL->getTypeSizeInBits(FieldTy);
 
     // Ignore zero sized fields like {}, they obviously contain no data.
     if (FieldSizeBits == 0) continue;
@@ -2504,7 +2498,7 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
     else  // Array case.
       Shift = i*ArrayEltBitOffset;
 
-    if (TD->isBigEndian())
+    if (DL->isBigEndian())
       Shift = AllocaSizeBits-Shift-FieldIntTy->getBitWidth();
 
     if (Shift) {
@@ -2521,7 +2515,7 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
   }
 
   // Handle tail padding by truncating the result
-  if (TD->getTypeSizeInBits(LI->getType()) != AllocaSizeBits)
+  if (DL->getTypeSizeInBits(LI->getType()) != AllocaSizeBits)
     ResultVal = new TruncInst(ResultVal, LI->getType(), "", LI);
 
   LI->replaceAllUsesWith(ResultVal);
@@ -2531,15 +2525,15 @@ SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
 /// HasPadding - Return true if the specified type has any structure or
 /// alignment padding in between the elements that would be split apart
 /// by SROA; return false otherwise.
-static bool HasPadding(Type *Ty, const DataLayout &TD) {
+static bool HasPadding(Type *Ty, const DataLayout &DL) {
   if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
     Ty = ATy->getElementType();
-    return TD.getTypeSizeInBits(Ty) != TD.getTypeAllocSizeInBits(Ty);
+    return DL.getTypeSizeInBits(Ty) != DL.getTypeAllocSizeInBits(Ty);
   }
 
   // SROA currently handles only Arrays and Structs.
   StructType *STy = cast<StructType>(Ty);
-  const StructLayout *SL = TD.getStructLayout(STy);
+  const StructLayout *SL = DL.getStructLayout(STy);
   unsigned PrevFieldBitOffset = 0;
   for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
     unsigned FieldBitOffset = SL->getElementOffsetInBits(i);
@@ -2548,7 +2542,7 @@ static bool HasPadding(Type *Ty, const DataLayout &TD) {
     // previous one.
     if (i) {
       unsigned PrevFieldEnd =
-        PrevFieldBitOffset+TD.getTypeSizeInBits(STy->getElementType(i-1));
+        PrevFieldBitOffset+DL.getTypeSizeInBits(STy->getElementType(i-1));
       if (PrevFieldEnd < FieldBitOffset)
         return true;
     }
@@ -2557,7 +2551,7 @@ static bool HasPadding(Type *Ty, const DataLayout &TD) {
   // Check for tail padding.
   if (unsigned EltCount = STy->getNumElements()) {
     unsigned PrevFieldEnd = PrevFieldBitOffset +
-      TD.getTypeSizeInBits(STy->getElementType(EltCount-1));
+      DL.getTypeSizeInBits(STy->getElementType(EltCount-1));
     if (PrevFieldEnd < SL->getSizeInBits())
       return true;
   }
@@ -2584,7 +2578,7 @@ bool SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) {
   // types, but may actually be used.  In these cases, we refuse to promote the
   // struct.
   if (Info.isMemCpySrc && Info.isMemCpyDst &&
-      HasPadding(AI->getAllocatedType(), *TD))
+      HasPadding(AI->getAllocatedType(), *DL))
     return false;
 
   // If the alloca never has an access to just *part* of it, but is accessed
diff --git a/lib/Transforms/Scalar/Scalarizer.cpp b/lib/Transforms/Scalar/Scalarizer.cpp
new file mode 100644
index 0000000..006375c
--- /dev/null
+++ b/lib/Transforms/Scalar/Scalarizer.cpp
@@ -0,0 +1,662 @@
+//===--- Scalarizer.cpp - Scalarize vector operations ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass converts vector operations into scalar operations, in order
+// to expose optimization opportunities on the individual scalar operations.
+// It is mainly intended for targets that do not have vector units, but it
+// may also be useful for revectorizing code to different vector widths.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "scalarizer"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+namespace {
+// Used to store the scattered form of a vector.
+typedef SmallVector<Value *, 8> ValueVector;
+
+// Used to map a vector Value to its scattered form.  We use std::map
+// because we want iterators to persist across insertion and because the
+// values are relatively large.
+typedef std::map<Value *, ValueVector> ScatterMap;
+
+// Lists Instructions that have been replaced with scalar implementations,
+// along with a pointer to their scattered forms.
+typedef SmallVector<std::pair<Instruction *, ValueVector *>, 16> GatherList;
+
+// Provides a very limited vector-like interface for lazily accessing one
+// component of a scattered vector or vector pointer.
+class Scatterer {
+public:
+  Scatterer() {}
+
+  // Scatter V into Size components.  If new instructions are needed,
+  // insert them before BBI in BB.  If Cache is nonnull, use it to cache
+  // the results.
+  Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
+            ValueVector *cachePtr = 0);
+
+  // Return component I, creating a new Value for it if necessary.
+  Value *operator[](unsigned I);
+
+  // Return the number of components.
+  unsigned size() const { return Size; }
+
+private:
+  BasicBlock *BB;
+  BasicBlock::iterator BBI;
+  Value *V;
+  ValueVector *CachePtr;
+  PointerType *PtrTy;
+  ValueVector Tmp;
+  unsigned Size;
+};
+
+// FCmpSpliiter(FCI)(Builder, X, Y, Name) uses Builder to create an FCmp
+// called Name that compares X and Y in the same way as FCI.
+struct FCmpSplitter {
+  FCmpSplitter(FCmpInst &fci) : FCI(fci) {}
+  Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
+                    const Twine &Name) const {
+    return Builder.CreateFCmp(FCI.getPredicate(), Op0, Op1, Name);
+  }
+  FCmpInst &FCI;
+};
+
+// ICmpSpliiter(ICI)(Builder, X, Y, Name) uses Builder to create an ICmp
+// called Name that compares X and Y in the same way as ICI.
+struct ICmpSplitter {
+  ICmpSplitter(ICmpInst &ici) : ICI(ici) {}
+  Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
+                    const Twine &Name) const {
+    return Builder.CreateICmp(ICI.getPredicate(), Op0, Op1, Name);
+  }
+  ICmpInst &ICI;
+};
+
+// BinarySpliiter(BO)(Builder, X, Y, Name) uses Builder to create
+// a binary operator like BO called Name with operands X and Y.
+struct BinarySplitter {
+  BinarySplitter(BinaryOperator &bo) : BO(bo) {}
+  Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
+                    const Twine &Name) const {
+    return Builder.CreateBinOp(BO.getOpcode(), Op0, Op1, Name);
+  }
+  BinaryOperator &BO;
+};
+
+// Information about a load or store that we're scalarizing.
+struct VectorLayout {
+  VectorLayout() : VecTy(0), ElemTy(0), VecAlign(0), ElemSize(0) {}
+
+  // Return the alignment of element I.
+  uint64_t getElemAlign(unsigned I) {
+    return MinAlign(VecAlign, I * ElemSize);
+  }
+
+  // The type of the vector.
+  VectorType *VecTy;
+
+  // The type of each element.
+  Type *ElemTy;
+
+  // The alignment of the vector.
+  uint64_t VecAlign;
+
+  // The size of each element.
+  uint64_t ElemSize;
+};
+
+class Scalarizer : public FunctionPass,
+                   public InstVisitor<Scalarizer, bool> {
+public:
+  static char ID;
+
+  Scalarizer() :
+    FunctionPass(ID) {
+    initializeScalarizerPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool doInitialization(Module &M) override;
+  bool runOnFunction(Function &F) override;
+
+  // InstVisitor methods.  They return true if the instruction was scalarized,
+  // false if nothing changed.
+  bool visitInstruction(Instruction &) { return false; }
+  bool visitSelectInst(SelectInst &SI);
+  bool visitICmpInst(ICmpInst &);
+  bool visitFCmpInst(FCmpInst &);
+  bool visitBinaryOperator(BinaryOperator &);
+  bool visitGetElementPtrInst(GetElementPtrInst &);
+  bool visitCastInst(CastInst &);
+  bool visitBitCastInst(BitCastInst &);
+  bool visitShuffleVectorInst(ShuffleVectorInst &);
+  bool visitPHINode(PHINode &);
+  bool visitLoadInst(LoadInst &);
+  bool visitStoreInst(StoreInst &);
+
+private:
+  Scatterer scatter(Instruction *, Value *);
+  void gather(Instruction *, const ValueVector &);
+  bool canTransferMetadata(unsigned Kind);
+  void transferMetadata(Instruction *, const ValueVector &);
+  bool getVectorLayout(Type *, unsigned, VectorLayout &);
+  bool finish();
+
+  template<typename T> bool splitBinary(Instruction &, const T &);
+
+  ScatterMap Scattered;
+  GatherList Gathered;
+  unsigned ParallelLoopAccessMDKind;
+  const DataLayout *DL;
+};
+
+char Scalarizer::ID = 0;
+} // end anonymous namespace
+
+// This is disabled by default because having separate loads and stores makes
+// it more likely that the -combiner-alias-analysis limits will be reached.
+static cl::opt<bool> ScalarizeLoadStore
+  ("scalarize-load-store", cl::Hidden, cl::init(false),
+   cl::desc("Allow the scalarizer pass to scalarize loads and store"));
+
+INITIALIZE_PASS(Scalarizer, "scalarizer", "Scalarize vector operations",
+                false, false)
+
+Scatterer::Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
+                     ValueVector *cachePtr)
+  : BB(bb), BBI(bbi), V(v), CachePtr(cachePtr) {
+  Type *Ty = V->getType();
+  PtrTy = dyn_cast<PointerType>(Ty);
+  if (PtrTy)
+    Ty = PtrTy->getElementType();
+  Size = Ty->getVectorNumElements();
+  if (!CachePtr)
+    Tmp.resize(Size, 0);
+  else if (CachePtr->empty())
+    CachePtr->resize(Size, 0);
+  else
+    assert(Size == CachePtr->size() && "Inconsistent vector sizes");
+}
+
+// Return component I, creating a new Value for it if necessary.
+Value *Scatterer::operator[](unsigned I) {
+  ValueVector &CV = (CachePtr ? *CachePtr : Tmp);
+  // Try to reuse a previous value.
+  if (CV[I])
+    return CV[I];
+  IRBuilder<> Builder(BB, BBI);
+  if (PtrTy) {
+    if (!CV[0]) {
+      Type *Ty =
+        PointerType::get(PtrTy->getElementType()->getVectorElementType(),
+                         PtrTy->getAddressSpace());
+      CV[0] = Builder.CreateBitCast(V, Ty, V->getName() + ".i0");
+    }
+    if (I != 0)
+      CV[I] = Builder.CreateConstGEP1_32(CV[0], I,
+                                         V->getName() + ".i" + Twine(I));
+  } else {
+    // Search through a chain of InsertElementInsts looking for element I.
+    // Record other elements in the cache.  The new V is still suitable
+    // for all uncached indices.
+    for (;;) {
+      InsertElementInst *Insert = dyn_cast<InsertElementInst>(V);
+      if (!Insert)
+        break;
+      ConstantInt *Idx = dyn_cast<ConstantInt>(Insert->getOperand(2));
+      if (!Idx)
+        break;
+      unsigned J = Idx->getZExtValue();
+      CV[J] = Insert->getOperand(1);
+      V = Insert->getOperand(0);
+      if (I == J)
+        return CV[J];
+    }
+    CV[I] = Builder.CreateExtractElement(V, Builder.getInt32(I),
+                                         V->getName() + ".i" + Twine(I));
+  }
+  return CV[I];
+}
+
+bool Scalarizer::doInitialization(Module &M) {
+  ParallelLoopAccessMDKind =
+    M.getContext().getMDKindID("llvm.mem.parallel_loop_access");
+  return false;
+}
+
+bool Scalarizer::runOnFunction(Function &F) {
+  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+  DL = DLP ? &DLP->getDataLayout() : 0;
+  for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE; ++BBI) {
+    BasicBlock *BB = BBI;
+    for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;) {
+      Instruction *I = II;
+      bool Done = visit(I);
+      ++II;
+      if (Done && I->getType()->isVoidTy())
+        I->eraseFromParent();
+    }
+  }
+  return finish();
+}
+
+// Return a scattered form of V that can be accessed by Point.  V must be a
+// vector or a pointer to a vector.
+Scatterer Scalarizer::scatter(Instruction *Point, Value *V) {
+  if (Argument *VArg = dyn_cast<Argument>(V)) {
+    // Put the scattered form of arguments in the entry block,
+    // so that it can be used everywhere.
+    Function *F = VArg->getParent();
+    BasicBlock *BB = &F->getEntryBlock();
+    return Scatterer(BB, BB->begin(), V, &Scattered[V]);
+  }
+  if (Instruction *VOp = dyn_cast<Instruction>(V)) {
+    // Put the scattered form of an instruction directly after the
+    // instruction.
+    BasicBlock *BB = VOp->getParent();
+    return Scatterer(BB, std::next(BasicBlock::iterator(VOp)),
+                     V, &Scattered[V]);
+  }
+  // In the fallback case, just put the scattered before Point and
+  // keep the result local to Point.
+  return Scatterer(Point->getParent(), Point, V);
+}
+
+// Replace Op with the gathered form of the components in CV.  Defer the
+// deletion of Op and creation of the gathered form to the end of the pass,
+// so that we can avoid creating the gathered form if all uses of Op are
+// replaced with uses of CV.
+void Scalarizer::gather(Instruction *Op, const ValueVector &CV) {
+  // Since we're not deleting Op yet, stub out its operands, so that it
+  // doesn't make anything live unnecessarily.
+  for (unsigned I = 0, E = Op->getNumOperands(); I != E; ++I)
+    Op->setOperand(I, UndefValue::get(Op->getOperand(I)->getType()));
+
+  transferMetadata(Op, CV);
+
+  // If we already have a scattered form of Op (created from ExtractElements
+  // of Op itself), replace them with the new form.
+  ValueVector &SV = Scattered[Op];
+  if (!SV.empty()) {
+    for (unsigned I = 0, E = SV.size(); I != E; ++I) {
+      Instruction *Old = cast<Instruction>(SV[I]);
+      CV[I]->takeName(Old);
+      Old->replaceAllUsesWith(CV[I]);
+      Old->eraseFromParent();
+    }
+  }
+  SV = CV;
+  Gathered.push_back(GatherList::value_type(Op, &SV));
+}
+
+// Return true if it is safe to transfer the given metadata tag from
+// vector to scalar instructions.
+bool Scalarizer::canTransferMetadata(unsigned Tag) {
+  return (Tag == LLVMContext::MD_tbaa
+          || Tag == LLVMContext::MD_fpmath
+          || Tag == LLVMContext::MD_tbaa_struct
+          || Tag == LLVMContext::MD_invariant_load
+          || Tag == ParallelLoopAccessMDKind);
+}
+
+// Transfer metadata from Op to the instructions in CV if it is known
+// to be safe to do so.
+void Scalarizer::transferMetadata(Instruction *Op, const ValueVector &CV) {
+  SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
+  Op->getAllMetadataOtherThanDebugLoc(MDs);
+  for (unsigned I = 0, E = CV.size(); I != E; ++I) {
+    if (Instruction *New = dyn_cast<Instruction>(CV[I])) {
+      for (SmallVectorImpl<std::pair<unsigned, MDNode *> >::iterator
+             MI = MDs.begin(), ME = MDs.end(); MI != ME; ++MI)
+        if (canTransferMetadata(MI->first))
+          New->setMetadata(MI->first, MI->second);
+      New->setDebugLoc(Op->getDebugLoc());
+    }
+  }
+}
+
+// Try to fill in Layout from Ty, returning true on success.  Alignment is
+// the alignment of the vector, or 0 if the ABI default should be used.
+bool Scalarizer::getVectorLayout(Type *Ty, unsigned Alignment,
+                                 VectorLayout &Layout) {
+  if (!DL)
+    return false;
+
+  // Make sure we're dealing with a vector.
+  Layout.VecTy = dyn_cast<VectorType>(Ty);
+  if (!Layout.VecTy)
+    return false;
+
+  // Check that we're dealing with full-byte elements.
+  Layout.ElemTy = Layout.VecTy->getElementType();
+  if (DL->getTypeSizeInBits(Layout.ElemTy) !=
+      DL->getTypeStoreSizeInBits(Layout.ElemTy))
+    return false;
+
+  if (Alignment)
+    Layout.VecAlign = Alignment;
+  else
+    Layout.VecAlign = DL->getABITypeAlignment(Layout.VecTy);
+  Layout.ElemSize = DL->getTypeStoreSize(Layout.ElemTy);
+  return true;
+}
+
+// Scalarize two-operand instruction I, using Split(Builder, X, Y, Name)
+// to create an instruction like I with operands X and Y and name Name.
+template<typename Splitter>
+bool Scalarizer::splitBinary(Instruction &I, const Splitter &Split) {
+  VectorType *VT = dyn_cast<VectorType>(I.getType());
+  if (!VT)
+    return false;
+
+  unsigned NumElems = VT->getNumElements();
+  IRBuilder<> Builder(I.getParent(), &I);
+  Scatterer Op0 = scatter(&I, I.getOperand(0));
+  Scatterer Op1 = scatter(&I, I.getOperand(1));
+  assert(Op0.size() == NumElems && "Mismatched binary operation");
+  assert(Op1.size() == NumElems && "Mismatched binary operation");
+  ValueVector Res;
+  Res.resize(NumElems);
+  for (unsigned Elem = 0; Elem < NumElems; ++Elem)
+    Res[Elem] = Split(Builder, Op0[Elem], Op1[Elem],
+                      I.getName() + ".i" + Twine(Elem));
+  gather(&I, Res);
+  return true;
+}
+
+bool Scalarizer::visitSelectInst(SelectInst &SI) {
+  VectorType *VT = dyn_cast<VectorType>(SI.getType());
+  if (!VT)
+    return false;
+
+  unsigned NumElems = VT->getNumElements();
+  IRBuilder<> Builder(SI.getParent(), &SI);
+  Scatterer Op1 = scatter(&SI, SI.getOperand(1));
+  Scatterer Op2 = scatter(&SI, SI.getOperand(2));
+  assert(Op1.size() == NumElems && "Mismatched select");
+  assert(Op2.size() == NumElems && "Mismatched select");
+  ValueVector Res;
+  Res.resize(NumElems);
+
+  if (SI.getOperand(0)->getType()->isVectorTy()) {
+    Scatterer Op0 = scatter(&SI, SI.getOperand(0));
+    assert(Op0.size() == NumElems && "Mismatched select");
+    for (unsigned I = 0; I < NumElems; ++I)
+      Res[I] = Builder.CreateSelect(Op0[I], Op1[I], Op2[I],
+                                    SI.getName() + ".i" + Twine(I));
+  } else {
+    Value *Op0 = SI.getOperand(0);
+    for (unsigned I = 0; I < NumElems; ++I)
+      Res[I] = Builder.CreateSelect(Op0, Op1[I], Op2[I],
+                                    SI.getName() + ".i" + Twine(I));
+  }
+  gather(&SI, Res);
+  return true;
+}
+
+bool Scalarizer::visitICmpInst(ICmpInst &ICI) {
+  return splitBinary(ICI, ICmpSplitter(ICI));
+}
+
+bool Scalarizer::visitFCmpInst(FCmpInst &FCI) {
+  return splitBinary(FCI, FCmpSplitter(FCI));
+}
+
+bool Scalarizer::visitBinaryOperator(BinaryOperator &BO) {
+  return splitBinary(BO, BinarySplitter(BO));
+}
+
+bool Scalarizer::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+  VectorType *VT = dyn_cast<VectorType>(GEPI.getType());
+  if (!VT)
+    return false;
+
+  IRBuilder<> Builder(GEPI.getParent(), &GEPI);
+  unsigned NumElems = VT->getNumElements();
+  unsigned NumIndices = GEPI.getNumIndices();
+
+  Scatterer Base = scatter(&GEPI, GEPI.getOperand(0));
+
+  SmallVector<Scatterer, 8> Ops;
+  Ops.resize(NumIndices);
+  for (unsigned I = 0; I < NumIndices; ++I)
+    Ops[I] = scatter(&GEPI, GEPI.getOperand(I + 1));
+
+  ValueVector Res;
+  Res.resize(NumElems);
+  for (unsigned I = 0; I < NumElems; ++I) {
+    SmallVector<Value *, 8> Indices;
+    Indices.resize(NumIndices);
+    for (unsigned J = 0; J < NumIndices; ++J)
+      Indices[J] = Ops[J][I];
+    Res[I] = Builder.CreateGEP(Base[I], Indices,
+                               GEPI.getName() + ".i" + Twine(I));
+    if (GEPI.isInBounds())
+      if (GetElementPtrInst *NewGEPI = dyn_cast<GetElementPtrInst>(Res[I]))
+        NewGEPI->setIsInBounds();
+  }
+  gather(&GEPI, Res);
+  return true;
+}
+
+bool Scalarizer::visitCastInst(CastInst &CI) {
+  VectorType *VT = dyn_cast<VectorType>(CI.getDestTy());
+  if (!VT)
+    return false;
+
+  unsigned NumElems = VT->getNumElements();
+  IRBuilder<> Builder(CI.getParent(), &CI);
+  Scatterer Op0 = scatter(&CI, CI.getOperand(0));
+  assert(Op0.size() == NumElems && "Mismatched cast");
+  ValueVector Res;
+  Res.resize(NumElems);
+  for (unsigned I = 0; I < NumElems; ++I)
+    Res[I] = Builder.CreateCast(CI.getOpcode(), Op0[I], VT->getElementType(),
+                                CI.getName() + ".i" + Twine(I));
+  gather(&CI, Res);
+  return true;
+}
+
+bool Scalarizer::visitBitCastInst(BitCastInst &BCI) {
+  VectorType *DstVT = dyn_cast<VectorType>(BCI.getDestTy());
+  VectorType *SrcVT = dyn_cast<VectorType>(BCI.getSrcTy());
+  if (!DstVT || !SrcVT)
+    return false;
+
+  unsigned DstNumElems = DstVT->getNumElements();
+  unsigned SrcNumElems = SrcVT->getNumElements();
+  IRBuilder<> Builder(BCI.getParent(), &BCI);
+  Scatterer Op0 = scatter(&BCI, BCI.getOperand(0));
+  ValueVector Res;
+  Res.resize(DstNumElems);
+
+  if (DstNumElems == SrcNumElems) {
+    for (unsigned I = 0; I < DstNumElems; ++I)
+      Res[I] = Builder.CreateBitCast(Op0[I], DstVT->getElementType(),
+                                     BCI.getName() + ".i" + Twine(I));
+  } else if (DstNumElems > SrcNumElems) {
+    // <M x t1> -> <N*M x t2>.  Convert each t1 to <N x t2> and copy the
+    // individual elements to the destination.
+    unsigned FanOut = DstNumElems / SrcNumElems;
+    Type *MidTy = VectorType::get(DstVT->getElementType(), FanOut);
+    unsigned ResI = 0;
+    for (unsigned Op0I = 0; Op0I < SrcNumElems; ++Op0I) {
+      Value *V = Op0[Op0I];
+      Instruction *VI;
+      // Look through any existing bitcasts before converting to <N x t2>.
+      // In the best case, the resulting conversion might be a no-op.
+      while ((VI = dyn_cast<Instruction>(V)) &&
+             VI->getOpcode() == Instruction::BitCast)
+        V = VI->getOperand(0);
+      V = Builder.CreateBitCast(V, MidTy, V->getName() + ".cast");
+      Scatterer Mid = scatter(&BCI, V);
+      for (unsigned MidI = 0; MidI < FanOut; ++MidI)
+        Res[ResI++] = Mid[MidI];
+    }
+  } else {
+    // <N*M x t1> -> <M x t2>.  Convert each group of <N x t1> into a t2.
+    unsigned FanIn = SrcNumElems / DstNumElems;
+    Type *MidTy = VectorType::get(SrcVT->getElementType(), FanIn);
+    unsigned Op0I = 0;
+    for (unsigned ResI = 0; ResI < DstNumElems; ++ResI) {
+      Value *V = UndefValue::get(MidTy);
+      for (unsigned MidI = 0; MidI < FanIn; ++MidI)
+        V = Builder.CreateInsertElement(V, Op0[Op0I++], Builder.getInt32(MidI),
+                                        BCI.getName() + ".i" + Twine(ResI)
+                                        + ".upto" + Twine(MidI));
+      Res[ResI] = Builder.CreateBitCast(V, DstVT->getElementType(),
+                                        BCI.getName() + ".i" + Twine(ResI));
+    }
+  }
+  gather(&BCI, Res);
+  return true;
+}
+
+bool Scalarizer::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
+  VectorType *VT = dyn_cast<VectorType>(SVI.getType());
+  if (!VT)
+    return false;
+
+  unsigned NumElems = VT->getNumElements();
+  Scatterer Op0 = scatter(&SVI, SVI.getOperand(0));
+  Scatterer Op1 = scatter(&SVI, SVI.getOperand(1));
+  ValueVector Res;
+  Res.resize(NumElems);
+
+  for (unsigned I = 0; I < NumElems; ++I) {
+    int Selector = SVI.getMaskValue(I);
+    if (Selector < 0)
+      Res[I] = UndefValue::get(VT->getElementType());
+    else if (unsigned(Selector) < Op0.size())
+      Res[I] = Op0[Selector];
+    else
+      Res[I] = Op1[Selector - Op0.size()];
+  }
+  gather(&SVI, Res);
+  return true;
+}
+
+bool Scalarizer::visitPHINode(PHINode &PHI) {
+  VectorType *VT = dyn_cast<VectorType>(PHI.getType());
+  if (!VT)
+    return false;
+
+  unsigned NumElems = VT->getNumElements();
+  IRBuilder<> Builder(PHI.getParent(), &PHI);
+  ValueVector Res;
+  Res.resize(NumElems);
+
+  unsigned NumOps = PHI.getNumOperands();
+  for (unsigned I = 0; I < NumElems; ++I)
+    Res[I] = Builder.CreatePHI(VT->getElementType(), NumOps,
+                               PHI.getName() + ".i" + Twine(I));
+
+  for (unsigned I = 0; I < NumOps; ++I) {
+    Scatterer Op = scatter(&PHI, PHI.getIncomingValue(I));
+    BasicBlock *IncomingBlock = PHI.getIncomingBlock(I);
+    for (unsigned J = 0; J < NumElems; ++J)
+      cast<PHINode>(Res[J])->addIncoming(Op[J], IncomingBlock);
+  }
+  gather(&PHI, Res);
+  return true;
+}
+
+bool Scalarizer::visitLoadInst(LoadInst &LI) {
+  if (!ScalarizeLoadStore)
+    return false;
+  if (!LI.isSimple())
+    return false;
+
+  VectorLayout Layout;
+  if (!getVectorLayout(LI.getType(), LI.getAlignment(), Layout))
+    return false;
+
+  unsigned NumElems = Layout.VecTy->getNumElements();
+  IRBuilder<> Builder(LI.getParent(), &LI);
+  Scatterer Ptr = scatter(&LI, LI.getPointerOperand());
+  ValueVector Res;
+  Res.resize(NumElems);
+
+  for (unsigned I = 0; I < NumElems; ++I)
+    Res[I] = Builder.CreateAlignedLoad(Ptr[I], Layout.getElemAlign(I),
+                                       LI.getName() + ".i" + Twine(I));
+  gather(&LI, Res);
+  return true;
+}
+
+bool Scalarizer::visitStoreInst(StoreInst &SI) {
+  if (!ScalarizeLoadStore)
+    return false;
+  if (!SI.isSimple())
+    return false;
+
+  VectorLayout Layout;
+  Value *FullValue = SI.getValueOperand();
+  if (!getVectorLayout(FullValue->getType(), SI.getAlignment(), Layout))
+    return false;
+
+  unsigned NumElems = Layout.VecTy->getNumElements();
+  IRBuilder<> Builder(SI.getParent(), &SI);
+  Scatterer Ptr = scatter(&SI, SI.getPointerOperand());
+  Scatterer Val = scatter(&SI, FullValue);
+
+  ValueVector Stores;
+  Stores.resize(NumElems);
+  for (unsigned I = 0; I < NumElems; ++I) {
+    unsigned Align = Layout.getElemAlign(I);
+    Stores[I] = Builder.CreateAlignedStore(Val[I], Ptr[I], Align);
+  }
+  transferMetadata(&SI, Stores);
+  return true;
+}
+
+// Delete the instructions that we scalarized.  If a full vector result
+// is still needed, recreate it using InsertElements.
+bool Scalarizer::finish() {
+  if (Gathered.empty())
+    return false;
+  for (GatherList::iterator GMI = Gathered.begin(), GME = Gathered.end();
+       GMI != GME; ++GMI) {
+    Instruction *Op = GMI->first;
+    ValueVector &CV = *GMI->second;
+    if (!Op->use_empty()) {
+      // The value is still needed, so recreate it using a series of
+      // InsertElements.
+      Type *Ty = Op->getType();
+      Value *Res = UndefValue::get(Ty);
+      BasicBlock *BB = Op->getParent();
+      unsigned Count = Ty->getVectorNumElements();
+      IRBuilder<> Builder(BB, Op);
+      if (isa<PHINode>(Op))
+        Builder.SetInsertPoint(BB, BB->getFirstInsertionPt());
+      for (unsigned I = 0; I < Count; ++I)
+        Res = Builder.CreateInsertElement(Res, CV[I], Builder.getInt32(I),
+                                          Op->getName() + ".upto" + Twine(I));
+      Res->takeName(Op);
+      Op->replaceAllUsesWith(Res);
+    }
+    Op->eraseFromParent();
+  }
+  Gathered.clear();
+  Scattered.clear();
+  return true;
+}
+
+FunctionPass *llvm::createScalarizerPass() {
+  return new Scalarizer();
+}
diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 8371f6d..ceae5a7 100644
--- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -28,13 +28,13 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Attributes.h"
+#include "llvm/IR/CFG.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
 #include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
 
@@ -46,9 +46,9 @@ struct CFGSimplifyPass : public FunctionPass {
   CFGSimplifyPass() : FunctionPass(ID) {
     initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
   }
-  virtual bool runOnFunction(Function &F);
+  bool runOnFunction(Function &F) override;
 
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<TargetTransformInfo>();
   }
 };
@@ -145,7 +145,7 @@ static bool mergeEmptyReturnBlocks(Function &F) {
 /// iterativelySimplifyCFG - Call SimplifyCFG on all the blocks in the function,
 /// iterating until no more changes are made.
 static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
-                                   const DataLayout *TD) {
+                                   const DataLayout *DL) {
   bool Changed = false;
   bool LocalChange = true;
   while (LocalChange) {
@@ -154,7 +154,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
     // Loop over all of the basic blocks and remove them if they are unneeded...
     //
     for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) {
-      if (SimplifyCFG(BBIt++, TTI, TD)) {
+      if (SimplifyCFG(BBIt++, TTI, DL)) {
         LocalChange = true;
         ++NumSimpl;
       }
@@ -168,11 +168,15 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
 // simplify the CFG.
 //
 bool CFGSimplifyPass::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
   const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
-  const DataLayout *TD = getAnalysisIfAvailable<DataLayout>();
+  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+  const DataLayout *DL = DLP ? &DLP->getDataLayout() : 0;
   bool EverChanged = removeUnreachableBlocks(F);
   EverChanged |= mergeEmptyReturnBlocks(F);
-  EverChanged |= iterativelySimplifyCFG(F, TTI, TD);
+  EverChanged |= iterativelySimplifyCFG(F, TTI, DL);
 
   // If neither pass changed anything, we're done.
   if (!EverChanged) return false;
@@ -186,7 +190,7 @@ bool CFGSimplifyPass::runOnFunction(Function &F) {
     return true;
 
   do {
-    EverChanged = iterativelySimplifyCFG(F, TTI, TD);
+    EverChanged = iterativelySimplifyCFG(F, TTI, DL);
     EverChanged |= removeUnreachableBlocks(F);
   } while (EverChanged);
 
diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp
index d4595bb..4107374 100644
--- a/lib/Transforms/Scalar/Sink.cpp
+++ b/lib/Transforms/Scalar/Sink.cpp
@@ -16,12 +16,11 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Assembly/Writer.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/IntrinsicInst.h"
-#include "llvm/Support/CFG.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
@@ -41,15 +40,15 @@ namespace {
       initializeSinkingPass(*PassRegistry::getPassRegistry());
     }
 
-    virtual bool runOnFunction(Function &F);
+    bool runOnFunction(Function &F) override;
 
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesCFG();
       FunctionPass::getAnalysisUsage(AU);
       AU.addRequired<AliasAnalysis>();
-      AU.addRequired<DominatorTree>();
+      AU.addRequired<DominatorTreeWrapperPass>();
       AU.addRequired<LoopInfo>();
-      AU.addPreserved<DominatorTree>();
+      AU.addPreserved<DominatorTreeWrapperPass>();
       AU.addPreserved<LoopInfo>();
     }
   private:
@@ -63,7 +62,7 @@ namespace {
 char Sinking::ID = 0;
 INITIALIZE_PASS_BEGIN(Sinking, "sink", "Code sinking", false, false)
 INITIALIZE_PASS_DEPENDENCY(LoopInfo)
-INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_PASS_END(Sinking, "sink", "Code sinking", false, false)
 
@@ -77,15 +76,14 @@ bool Sinking::AllUsesDominatedByBlock(Instruction *Inst,
   // This may leave a referencing dbg_value in the original block, before
   // the definition of the vreg.  Dwarf generator handles this although the
   // user might not get the right info at runtime.
-  for (Value::use_iterator I = Inst->use_begin(),
-       E = Inst->use_end(); I != E; ++I) {
+  for (Use &U : Inst->uses()) {
     // Determine the block of the use.
-    Instruction *UseInst = cast<Instruction>(*I);
+    Instruction *UseInst = cast<Instruction>(U.getUser());
     BasicBlock *UseBlock = UseInst->getParent();
     if (PHINode *PN = dyn_cast<PHINode>(UseInst)) {
       // PHI nodes use the operand in the predecessor block, not the block with
       // the PHI.
-      unsigned Num = PHINode::getIncomingValueNumForOperand(I.getOperandNo());
+      unsigned Num = PHINode::getIncomingValueNumForOperand(U.getOperandNo());
       UseBlock = PN->getIncomingBlock(Num);
     }
     // Check that it dominates.
@@ -96,7 +94,7 @@ bool Sinking::AllUsesDominatedByBlock(Instruction *Inst,
 }
 
 bool Sinking::runOnFunction(Function &F) {
-  DT = &getAnalysis<DominatorTree>();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   LI = &getAnalysis<LoopInfo>();
   AA = &getAnalysis<AliasAnalysis>();
 
@@ -218,6 +216,13 @@ bool Sinking::IsAcceptableTarget(Instruction *Inst,
 /// instruction out of its current block into a successor.
 bool Sinking::SinkInstruction(Instruction *Inst,
                               SmallPtrSet<Instruction *, 8> &Stores) {
+
+  // Don't sink static alloca instructions.  CodeGen assumes allocas outside the
+  // entry block are dynamically sized stack objects.
+  if (AllocaInst *AI = dyn_cast<AllocaInst>(Inst))
+    if (AI->isStaticAlloca())
+      return false;
+
   // Check if it's safe to move the instruction.
   if (!isSafeToMove(Inst, AA, Stores))
     return false;
@@ -259,9 +264,9 @@ bool Sinking::SinkInstruction(Instruction *Inst,
     return false;
 
   DEBUG(dbgs() << "Sink" << *Inst << " (";
-        WriteAsOperand(dbgs(), Inst->getParent(), false);
+        Inst->getParent()->printAsOperand(dbgs(), false);
         dbgs() << " -> ";
-        WriteAsOperand(dbgs(), SuccToSinkTo, false);
+        SuccToSinkTo->printAsOperand(dbgs(), false);
         dbgs() << ")\n");
 
   // Move the instruction.
diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp
index 5045ff8..8fd2268 100644
--- a/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -15,7 +15,7 @@
 #include "llvm/Analysis/RegionIterator.h"
 #include "llvm/Analysis/RegionPass.h"
 #include "llvm/IR/Module.h"
-#include "llvm/Support/PatternMatch.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
 
 using namespace llvm;
@@ -235,18 +235,18 @@ public:
   }
 
   using Pass::doInitialization;
-  virtual bool doInitialization(Region *R, RGPassManager &RGM);
+  bool doInitialization(Region *R, RGPassManager &RGM) override;
 
-  virtual bool runOnRegion(Region *R, RGPassManager &RGM);
+  bool runOnRegion(Region *R, RGPassManager &RGM) override;
 
-  virtual const char *getPassName() const {
+  const char *getPassName() const override {
     return "Structurize control flow";
   }
 
-  void getAnalysisUsage(AnalysisUsage &AU) const {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequiredID(LowerSwitchID);
-    AU.addRequired<DominatorTree>();
-    AU.addPreserved<DominatorTree>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
     RegionPass::getAnalysisUsage(AU);
   }
 };
@@ -258,7 +258,7 @@ char StructurizeCFG::ID = 0;
 INITIALIZE_PASS_BEGIN(StructurizeCFG, "structurizecfg", "Structurize the CFG",
                       false, false)
 INITIALIZE_PASS_DEPENDENCY(LowerSwitch)
-INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(RegionInfo)
 INITIALIZE_PASS_END(StructurizeCFG, "structurizecfg", "Structurize the CFG",
                     false, false)
@@ -277,9 +277,8 @@ bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
 
 /// \brief Build up the general order of nodes
 void StructurizeCFG::orderNodes() {
-  scc_iterator<Region *> I = scc_begin(ParentRegion),
-                         E = scc_end(ParentRegion);
-  for (Order.clear(); I != E; ++I) {
+  scc_iterator<Region *> I = scc_begin(ParentRegion);
+  for (Order.clear(); !I.isAtEnd(); ++I) {
     std::vector<RegionNode *> &Nodes = *I;
     Order.append(Nodes.begin(), Nodes.end());
   }
@@ -326,16 +325,10 @@ Value *StructurizeCFG::invert(Value *Condition) {
   if (Instruction *Inst = dyn_cast<Instruction>(Condition)) {
     // Third: Check all the users for an invert
     BasicBlock *Parent = Inst->getParent();
-    for (Value::use_iterator I = Condition->use_begin(),
-           E = Condition->use_end(); I != E; ++I) {
-
-      Instruction *User = dyn_cast<Instruction>(*I);
-      if (!User || User->getParent() != Parent)
-        continue;
-
-      if (match(*I, m_Not(m_Specific(Condition))))
-        return *I;
-    }
+    for (User *U : Condition->users())
+      if (Instruction *I = dyn_cast<Instruction>(U))
+        if (I->getParent() == Parent && match(I, m_Not(m_Specific(Condition))))
+          return I;
 
     // Last option: Create a new instruction
     return BinaryOperator::CreateNot(Condition, "", Parent->getTerminator());
@@ -830,25 +823,19 @@ void StructurizeCFG::createFlow() {
 /// no longer dominate all their uses. Not sure if this is really nessasary
 void StructurizeCFG::rebuildSSA() {
   SSAUpdater Updater;
-  for (Region::block_iterator I = ParentRegion->block_begin(),
-                              E = ParentRegion->block_end();
-       I != E; ++I) {
-
-    BasicBlock *BB = *I;
+  for (const auto &BB : ParentRegion->blocks())
     for (BasicBlock::iterator II = BB->begin(), IE = BB->end();
          II != IE; ++II) {
 
       bool Initialized = false;
-      for (Use *I = &II->use_begin().getUse(), *Next; I; I = Next) {
-
-        Next = I->getNext();
-
-        Instruction *User = cast<Instruction>(I->getUser());
+      for (auto I = II->use_begin(), E = II->use_end(); I != E;) {
+        Use &U = *I++;
+        Instruction *User = cast<Instruction>(U.getUser());
         if (User->getParent() == BB) {
           continue;
 
         } else if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
-          if (UserPN->getIncomingBlock(*I) == BB)
+          if (UserPN->getIncomingBlock(U) == BB)
             continue;
         }
 
@@ -862,10 +849,9 @@ void StructurizeCFG::rebuildSSA() {
           Updater.AddAvailableValue(BB, II);
           Initialized = true;
         }
-        Updater.RewriteUseAfterInsertions(*I);
+        Updater.RewriteUseAfterInsertions(U);
       }
     }
-  }
 }
 
 /// \brief Run the transformation for each region found
@@ -876,7 +862,7 @@ bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
   Func = R->getEntry()->getParent();
   ParentRegion = R;
 
-  DT = &getAnalysis<DominatorTree>();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
 
   orderNodes();
   collectInfos();
diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 9fb8ddc..6d02777 100644
--- a/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -60,17 +60,17 @@
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/ValueHandle.h"
 #include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Support/CallSite.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/ValueHandle.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -89,9 +89,9 @@ namespace {
       initializeTailCallElimPass(*PassRegistry::getPassRegistry());
     }
 
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+    void getAnalysisUsage(AnalysisUsage &AU) const override;
 
-    virtual bool runOnFunction(Function &F);
+    bool runOnFunction(Function &F) override;
 
   private:
     CallInst *FindTRECandidate(Instruction *I,
@@ -149,16 +149,16 @@ namespace {
 struct AllocaCaptureTracker : public CaptureTracker {
   AllocaCaptureTracker() : Captured(false) {}
 
-  void tooManyUses() LLVM_OVERRIDE { Captured = true; }
+  void tooManyUses() override { Captured = true; }
 
-  bool shouldExplore(Use *U) LLVM_OVERRIDE {
+  bool shouldExplore(const Use *U) override {
     Value *V = U->getUser();
     if (isa<CallInst>(V) || isa<InvokeInst>(V))
       UsesAlloca.insert(V);
     return true;
   }
 
-  bool captured(Use *U) LLVM_OVERRIDE {
+  bool captured(const Use *U) override {
     if (isa<ReturnInst>(U->getUser()))
       return false;
     Captured = true;
@@ -171,6 +171,9 @@ struct AllocaCaptureTracker : public CaptureTracker {
 } // end anonymous namespace
 
 bool TailCallElim::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
   // If this function is a varargs function, we won't be able to PHI the args
   // right, so don't even try to convert it...
   if (F.getFunctionType()->isVarArg()) return false;
@@ -377,13 +380,13 @@ Value *TailCallElim::CanTransformAccumulatorRecursion(Instruction *I,
     return 0;
 
   // The only user of this instruction we allow is a single return instruction.
-  if (!I->hasOneUse() || !isa<ReturnInst>(I->use_back()))
+  if (!I->hasOneUse() || !isa<ReturnInst>(I->user_back()))
     return 0;
 
   // Ok, now we have to check all of the other return instructions in this
   // function.  If they return non-constants or differing values, then we cannot
   // transform the function safely.
-  return getCommonReturnValue(cast<ReturnInst>(I->use_back()), CI);
+  return getCommonReturnValue(cast<ReturnInst>(I->user_back()), CI);
 }
 
 static Instruction *FirstNonDbg(BasicBlock::iterator I) {
@@ -426,7 +429,7 @@ TailCallElim::FindTRECandidate(Instruction *TI,
   // lower the call to fabs into inline code.
   if (BB == &F->getEntryBlock() &&
       FirstNonDbg(BB->front()) == CI &&
-      FirstNonDbg(llvm::next(BB->begin())) == TI &&
+      FirstNonDbg(std::next(BB->begin())) == TI &&
       CI->getCalledFunction() &&
       !TTI->isLoweredToCall(CI->getCalledFunction())) {
     // A single-block function with just a call and a return. Check that
author	Stephen Hines <srhines@google.com>	2014-04-23 16:57:46 -0700
committer	Stephen Hines <srhines@google.com>	2014-04-24 15:53:16 -0700
commit	36b56886974eae4f9c5ebc96befd3e7bfe5de338 (patch)
tree	e6cfb69fbbd937f450eeb83bfb83b9da3b01275a /lib/Transforms/Scalar
parent	69a8640022b04415ae9fac62f8ab090601d8f889 (diff)
download	external_llvm-36b56886974eae4f9c5ebc96befd3e7bfe5de338.zip external_llvm-36b56886974eae4f9c5ebc96befd3e7bfe5de338.tar.gz external_llvm-36b56886974eae4f9c5ebc96befd3e7bfe5de338.tar.bz2