Diffstat (limited to 'lib'): 305 files changed, 7066 insertions, 5924 deletions
diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt index cb1e1eb..2e3ec8b 100644 --- a/lib/Analysis/CMakeLists.txt +++ b/lib/Analysis/CMakeLists.txt @@ -10,6 +10,7 @@ add_llvm_library(LLVMAnalysis BranchProbabilityInfo.cpp CFGPrinter.cpp CaptureTracking.cpp + CodeMetrics.cpp ConstantFolding.cpp DIBuilder.cpp DbgInfoPrinter.cpp diff --git a/lib/Analysis/CodeMetrics.cpp b/lib/Analysis/CodeMetrics.cpp new file mode 100644 index 0000000..6c93f78 --- /dev/null +++ b/lib/Analysis/CodeMetrics.cpp @@ -0,0 +1,176 @@ +//===- CodeMetrics.cpp - Code cost measurements ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements code cost measurement utilities. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Function.h" +#include "llvm/Support/CallSite.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Target/TargetData.h" + +using namespace llvm; + +/// callIsSmall - If a call is likely to lower to a single target instruction, +/// or is otherwise deemed small return true. +/// TODO: Perhaps calls like memcpy, strcpy, etc? +bool llvm::callIsSmall(const Function *F) { + if (!F) return false; + + if (F->hasLocalLinkage()) return false; + + if (!F->hasName()) return false; + + StringRef Name = F->getName(); + + // These will all likely lower to a single selection DAG node. + if (Name == "copysign" || Name == "copysignf" || Name == "copysignl" || + Name == "fabs" || Name == "fabsf" || Name == "fabsl" || + Name == "sin" || Name == "sinf" || Name == "sinl" || + Name == "cos" || Name == "cosf" || Name == "cosl" || + Name == "sqrt" || Name == "sqrtf" || Name == "sqrtl" ) + return true; + + // These are all likely to be optimized into something smaller. + if (Name == "pow" || Name == "powf" || Name == "powl" || + Name == "exp2" || Name == "exp2l" || Name == "exp2f" || + Name == "floor" || Name == "floorf" || Name == "ceil" || + Name == "round" || Name == "ffs" || Name == "ffsl" || + Name == "abs" || Name == "labs" || Name == "llabs") + return true; + + return false; +} + +/// analyzeBasicBlock - Fill in the current structure with information gleaned +/// from the specified block. +void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB, + const TargetData *TD) { + ++NumBlocks; + unsigned NumInstsBeforeThisBB = NumInsts; + for (BasicBlock::const_iterator II = BB->begin(), E = BB->end(); + II != E; ++II) { + if (isa<PHINode>(II)) continue; // PHI nodes don't count. + + // Special handling for calls. + if (isa<CallInst>(II) || isa<InvokeInst>(II)) { + if (const IntrinsicInst *IntrinsicI = dyn_cast<IntrinsicInst>(II)) { + switch (IntrinsicI->getIntrinsicID()) { + default: break; + case Intrinsic::dbg_declare: + case Intrinsic::dbg_value: + case Intrinsic::invariant_start: + case Intrinsic::invariant_end: + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + case Intrinsic::objectsize: + case Intrinsic::ptr_annotation: + case Intrinsic::var_annotation: + // These intrinsics don't count as size. 
+ continue; + } + } + + ImmutableCallSite CS(cast<Instruction>(II)); + + if (const Function *F = CS.getCalledFunction()) { + // If a function is both internal and has a single use, then it is + // extremely likely to get inlined in the future (it was probably + // exposed by an interleaved devirtualization pass). + if (!CS.isNoInline() && F->hasInternalLinkage() && F->hasOneUse()) + ++NumInlineCandidates; + + // If this call is to function itself, then the function is recursive. + // Inlining it into other functions is a bad idea, because this is + // basically just a form of loop peeling, and our metrics aren't useful + // for that case. + if (F == BB->getParent()) + isRecursive = true; + } + + if (!isa<IntrinsicInst>(II) && !callIsSmall(CS.getCalledFunction())) { + // Each argument to a call takes on average one instruction to set up. + NumInsts += CS.arg_size(); + + // We don't want inline asm to count as a call - that would prevent loop + // unrolling. The argument setup cost is still real, though. + if (!isa<InlineAsm>(CS.getCalledValue())) + ++NumCalls; + } + } + + if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) { + if (!AI->isStaticAlloca()) + this->usesDynamicAlloca = true; + } + + if (isa<ExtractElementInst>(II) || II->getType()->isVectorTy()) + ++NumVectorInsts; + + if (const CastInst *CI = dyn_cast<CastInst>(II)) { + // Noop casts, including ptr <-> int, don't count. + if (CI->isLosslessCast() || isa<IntToPtrInst>(CI) || + isa<PtrToIntInst>(CI)) + continue; + // trunc to a native type is free (assuming the target has compare and + // shift-right of the same width). + if (isa<TruncInst>(CI) && TD && + TD->isLegalInteger(TD->getTypeSizeInBits(CI->getType()))) + continue; + // Result of a cmp instruction is often extended (to be used by other + // cmp instructions, logical or return instructions). These are usually + // nop on most sane targets. + if (isa<CmpInst>(CI->getOperand(0))) + continue; + } else if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(II)){ + // If a GEP has all constant indices, it will probably be folded with + // a load/store. + if (GEPI->hasAllConstantIndices()) + continue; + } + + ++NumInsts; + } + + if (isa<ReturnInst>(BB->getTerminator())) + ++NumRets; + + // We never want to inline functions that contain an indirectbr. This is + // incorrect because all the blockaddress's (in static global initializers + // for example) would be referring to the original function, and this indirect + // jump would jump from the inlined copy of the function into the original + // function which is extremely undefined behavior. + // FIXME: This logic isn't really right; we can safely inline functions + // with indirectbr's as long as no other function or global references the + // blockaddress of a block within the current function. And as a QOI issue, + // if someone is using a blockaddress without an indirectbr, and that + // reference somehow ends up in another function or global, we probably + // don't want to inline this function. + if (isa<IndirectBrInst>(BB->getTerminator())) + containsIndirectBr = true; + + // Remember NumInsts for this BB. + NumBBInsts[BB] = NumInsts - NumInstsBeforeThisBB; +} + +void CodeMetrics::analyzeFunction(Function *F, const TargetData *TD) { + // If this function contains a call that "returns twice" (e.g., setjmp or + // _setjmp) and it isn't marked with "returns twice" itself, never inline it. 
+ // This is a hack because we depend on the user marking their local variables + // as volatile if they are live across a setjmp call, and they probably + // won't do this in callers. + exposesReturnsTwice = F->callsFunctionThatReturnsTwice() && + !F->hasFnAttr(Attribute::ReturnsTwice); + + // Look at the size of the callee. + for (Function::const_iterator BB = F->begin(), E = F->end(); BB != E; ++BB) + analyzeBasicBlock(&*BB, TD); +} diff --git a/lib/Analysis/DebugInfo.cpp b/lib/Analysis/DebugInfo.cpp index 585a087..e30c0a9 100644 --- a/lib/Analysis/DebugInfo.cpp +++ b/lib/Analysis/DebugInfo.cpp @@ -68,7 +68,7 @@ uint64_t DIDescriptor::getUInt64Field(unsigned Elt) const { return 0; if (Elt < DbgNode->getNumOperands()) - if (ConstantInt *CI = dyn_cast<ConstantInt>(DbgNode->getOperand(Elt))) + if (ConstantInt *CI = dyn_cast_or_null<ConstantInt>(DbgNode->getOperand(Elt))) return CI->getZExtValue(); return 0; diff --git a/lib/Analysis/IVUsers.cpp b/lib/Analysis/IVUsers.cpp index cad22f8..463584d 100644 --- a/lib/Analysis/IVUsers.cpp +++ b/lib/Analysis/IVUsers.cpp @@ -79,10 +79,39 @@ static bool isInteresting(const SCEV *S, const Instruction *I, const Loop *L, return false; } +/// Return true if all loop headers that dominate this block are in simplified +/// form. +static bool isSimplifiedLoopNest(BasicBlock *BB, const DominatorTree *DT, + const LoopInfo *LI, + SmallPtrSet<Loop*,16> &SimpleLoopNests) { + Loop *NearestLoop = 0; + for (DomTreeNode *Rung = DT->getNode(BB); + Rung; Rung = Rung->getIDom()) { + BasicBlock *DomBB = Rung->getBlock(); + Loop *DomLoop = LI->getLoopFor(DomBB); + if (DomLoop && DomLoop->getHeader() == DomBB) { + // If the domtree walk reaches a loop with no preheader, return false. + if (!DomLoop->isLoopSimplifyForm()) + return false; + // If we have already checked this loop nest, stop checking. + if (SimpleLoopNests.count(DomLoop)) + break; + // If we have not already checked this loop nest, remember the loop + // header nearest to BB. The nearest loop may not contain BB. + if (!NearestLoop) + NearestLoop = DomLoop; + } + } + if (NearestLoop) + SimpleLoopNests.insert(NearestLoop); + return true; +} + /// AddUsersIfInteresting - Inspect the specified instruction. If it is a /// reducible SCEV, recursively add its users to the IVUsesByStride set and /// return true. Otherwise, return false. -bool IVUsers::AddUsersIfInteresting(Instruction *I) { +bool IVUsers::AddUsersIfInteresting(Instruction *I, + SmallPtrSet<Loop*,16> &SimpleLoopNests) { // Add this IV user to the Processed set before returning false to ensure that // all IV users are members of the set. See IVUsers::isIVUserOrOperand. if (!Processed.insert(I)) @@ -117,6 +146,18 @@ bool IVUsers::AddUsersIfInteresting(Instruction *I) { if (isa<PHINode>(User) && Processed.count(User)) continue; + // Only consider IVUsers that are dominated by simplified loop + // headers. Otherwise, SCEVExpander will crash. + BasicBlock *UseBB = User->getParent(); + // A phi's use is live out of its predecessor block. + if (PHINode *PHI = dyn_cast<PHINode>(User)) { + unsigned OperandNo = UI.getOperandNo(); + unsigned ValNo = PHINode::getIncomingValueNumForOperand(OperandNo); + UseBB = PHI->getIncomingBlock(ValNo); + } + if (!isSimplifiedLoopNest(UseBB, DT, LI, SimpleLoopNests)) + return false; + // Descend recursively, but not into PHI nodes outside the current loop. 
// It's important to see the entire expression outside the loop to get // choices that depend on addressing mode use right, although we won't @@ -126,12 +167,13 @@ bool IVUsers::AddUsersIfInteresting(Instruction *I) { bool AddUserToIVUsers = false; if (LI->getLoopFor(User->getParent()) != L) { if (isa<PHINode>(User) || Processed.count(User) || - !AddUsersIfInteresting(User)) { + !AddUsersIfInteresting(User, SimpleLoopNests)) { DEBUG(dbgs() << "FOUND USER in other loop: " << *User << '\n' << " OF SCEV: " << *ISE << '\n'); AddUserToIVUsers = true; } - } else if (Processed.count(User) || !AddUsersIfInteresting(User)) { + } else if (Processed.count(User) + || !AddUsersIfInteresting(User, SimpleLoopNests)) { DEBUG(dbgs() << "FOUND USER: " << *User << '\n' << " OF SCEV: " << *ISE << '\n'); AddUserToIVUsers = true; @@ -180,11 +222,16 @@ bool IVUsers::runOnLoop(Loop *l, LPPassManager &LPM) { SE = &getAnalysis<ScalarEvolution>(); TD = getAnalysisIfAvailable<TargetData>(); + // SCEVExpander can only handle users that are dominated by simplified loop + // entries. Keep track of all loops that are only dominated by other simple + // loops so we don't traverse the domtree for each user. + SmallPtrSet<Loop*,16> SimpleLoopNests; + // Find all uses of induction variables in this loop, and categorize // them by stride. Start by finding all of the PHI nodes in the header for // this loop. If they are induction variables, inspect their uses. for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) - (void)AddUsersIfInteresting(I); + (void)AddUsersIfInteresting(I, SimpleLoopNests); return false; } diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp index b326ba7..dedbfeb 100644 --- a/lib/Analysis/InlineCost.cpp +++ b/lib/Analysis/InlineCost.cpp @@ -20,165 +20,27 @@ using namespace llvm; -/// callIsSmall - If a call is likely to lower to a single target instruction, -/// or is otherwise deemed small return true. -/// TODO: Perhaps calls like memcpy, strcpy, etc? -bool llvm::callIsSmall(const Function *F) { - if (!F) return false; - - if (F->hasLocalLinkage()) return false; - - if (!F->hasName()) return false; - - StringRef Name = F->getName(); - - // These will all likely lower to a single selection DAG node. - if (Name == "copysign" || Name == "copysignf" || Name == "copysignl" || - Name == "fabs" || Name == "fabsf" || Name == "fabsl" || - Name == "sin" || Name == "sinf" || Name == "sinl" || - Name == "cos" || Name == "cosf" || Name == "cosl" || - Name == "sqrt" || Name == "sqrtf" || Name == "sqrtl" ) - return true; - - // These are all likely to be optimized into something smaller. - if (Name == "pow" || Name == "powf" || Name == "powl" || - Name == "exp2" || Name == "exp2l" || Name == "exp2f" || - Name == "floor" || Name == "floorf" || Name == "ceil" || - Name == "round" || Name == "ffs" || Name == "ffsl" || - Name == "abs" || Name == "labs" || Name == "llabs") - return true; - - return false; -} - -/// analyzeBasicBlock - Fill in the current structure with information gleaned -/// from the specified block. -void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB, - const TargetData *TD) { - ++NumBlocks; - unsigned NumInstsBeforeThisBB = NumInsts; - for (BasicBlock::const_iterator II = BB->begin(), E = BB->end(); - II != E; ++II) { - if (isa<PHINode>(II)) continue; // PHI nodes don't count. - - // Special handling for calls. 
- if (isa<CallInst>(II) || isa<InvokeInst>(II)) { - if (const IntrinsicInst *IntrinsicI = dyn_cast<IntrinsicInst>(II)) { - switch (IntrinsicI->getIntrinsicID()) { - default: break; - case Intrinsic::dbg_declare: - case Intrinsic::dbg_value: - case Intrinsic::invariant_start: - case Intrinsic::invariant_end: - case Intrinsic::lifetime_start: - case Intrinsic::lifetime_end: - case Intrinsic::objectsize: - case Intrinsic::ptr_annotation: - case Intrinsic::var_annotation: - // These intrinsics don't count as size. - continue; - } - } - - ImmutableCallSite CS(cast<Instruction>(II)); - - if (const Function *F = CS.getCalledFunction()) { - // If a function is both internal and has a single use, then it is - // extremely likely to get inlined in the future (it was probably - // exposed by an interleaved devirtualization pass). - if (!CS.isNoInline() && F->hasInternalLinkage() && F->hasOneUse()) - ++NumInlineCandidates; - - // If this call is to function itself, then the function is recursive. - // Inlining it into other functions is a bad idea, because this is - // basically just a form of loop peeling, and our metrics aren't useful - // for that case. - if (F == BB->getParent()) - isRecursive = true; - } - - if (!isa<IntrinsicInst>(II) && !callIsSmall(CS.getCalledFunction())) { - // Each argument to a call takes on average one instruction to set up. - NumInsts += CS.arg_size(); - - // We don't want inline asm to count as a call - that would prevent loop - // unrolling. The argument setup cost is still real, though. - if (!isa<InlineAsm>(CS.getCalledValue())) - ++NumCalls; - } - } - - if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) { - if (!AI->isStaticAlloca()) - this->usesDynamicAlloca = true; - } - - if (isa<ExtractElementInst>(II) || II->getType()->isVectorTy()) - ++NumVectorInsts; - - if (const CastInst *CI = dyn_cast<CastInst>(II)) { - // Noop casts, including ptr <-> int, don't count. - if (CI->isLosslessCast() || isa<IntToPtrInst>(CI) || - isa<PtrToIntInst>(CI)) - continue; - // trunc to a native type is free (assuming the target has compare and - // shift-right of the same width). - if (isa<TruncInst>(CI) && TD && - TD->isLegalInteger(TD->getTypeSizeInBits(CI->getType()))) - continue; - // Result of a cmp instruction is often extended (to be used by other - // cmp instructions, logical or return instructions). These are usually - // nop on most sane targets. - if (isa<CmpInst>(CI->getOperand(0))) - continue; - } else if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(II)){ - // If a GEP has all constant indices, it will probably be folded with - // a load/store. - if (GEPI->hasAllConstantIndices()) +unsigned InlineCostAnalyzer::FunctionInfo::countCodeReductionForConstant( + const CodeMetrics &Metrics, Value *V) { + unsigned Reduction = 0; + SmallVector<Value *, 4> Worklist; + Worklist.push_back(V); + do { + Value *V = Worklist.pop_back_val(); + for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;++UI){ + User *U = *UI; + if (isa<BranchInst>(U) || isa<SwitchInst>(U)) { + // We will be able to eliminate all but one of the successors. + const TerminatorInst &TI = cast<TerminatorInst>(*U); + const unsigned NumSucc = TI.getNumSuccessors(); + unsigned Instrs = 0; + for (unsigned I = 0; I != NumSucc; ++I) + Instrs += Metrics.NumBBInsts.lookup(TI.getSuccessor(I)); + // We don't know which blocks will be eliminated, so use the average size. 
+ Reduction += InlineConstants::InstrCost*Instrs*(NumSucc-1)/NumSucc; continue; - } - - ++NumInsts; - } - - if (isa<ReturnInst>(BB->getTerminator())) - ++NumRets; - - // We never want to inline functions that contain an indirectbr. This is - // incorrect because all the blockaddress's (in static global initializers - // for example) would be referring to the original function, and this indirect - // jump would jump from the inlined copy of the function into the original - // function which is extremely undefined behavior. - // FIXME: This logic isn't really right; we can safely inline functions - // with indirectbr's as long as no other function or global references the - // blockaddress of a block within the current function. And as a QOI issue, - // if someone is using a blockaddress without an indirectbr, and that - // reference somehow ends up in another function or global, we probably - // don't want to inline this function. - if (isa<IndirectBrInst>(BB->getTerminator())) - containsIndirectBr = true; - - // Remember NumInsts for this BB. - NumBBInsts[BB] = NumInsts - NumInstsBeforeThisBB; -} + } -// CountCodeReductionForConstant - Figure out an approximation for how many -// instructions will be constant folded if the specified value is constant. -// -unsigned CodeMetrics::CountCodeReductionForConstant(Value *V) { - unsigned Reduction = 0; - for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;++UI){ - User *U = *UI; - if (isa<BranchInst>(U) || isa<SwitchInst>(U)) { - // We will be able to eliminate all but one of the successors. - const TerminatorInst &TI = cast<TerminatorInst>(*U); - const unsigned NumSucc = TI.getNumSuccessors(); - unsigned Instrs = 0; - for (unsigned I = 0; I != NumSucc; ++I) - Instrs += NumBBInsts[TI.getSuccessor(I)]; - // We don't know which blocks will be eliminated, so use the average size. - Reduction += InlineConstants::InstrCost*Instrs*(NumSucc-1)/NumSucc; - } else { // Figure out if this instruction will be removed due to simple constant // propagation. Instruction &Inst = cast<Instruction>(*U); @@ -200,33 +62,186 @@ unsigned CodeMetrics::CountCodeReductionForConstant(Value *V) { AllOperandsConstant = false; break; } + if (!AllOperandsConstant) + continue; - if (AllOperandsConstant) { - // We will get to remove this instruction... - Reduction += InlineConstants::InstrCost; + // We will get to remove this instruction... + Reduction += InlineConstants::InstrCost; - // And any other instructions that use it which become constants - // themselves. - Reduction += CountCodeReductionForConstant(&Inst); + // And any other instructions that use it which become constants + // themselves. + Worklist.push_back(&Inst); + } + } while (!Worklist.empty()); + return Reduction; +} + +static unsigned countCodeReductionForAllocaICmp(const CodeMetrics &Metrics, + ICmpInst *ICI) { + unsigned Reduction = 0; + + // Bail if this is comparing against a non-constant; there is nothing we can + // do there. + if (!isa<Constant>(ICI->getOperand(1))) + return Reduction; + + // An icmp pred (alloca, C) becomes true if the predicate is true when + // equal and false otherwise. 
+ bool Result = ICI->isTrueWhenEqual(); + + SmallVector<Instruction *, 4> Worklist; + Worklist.push_back(ICI); + do { + Instruction *U = Worklist.pop_back_val(); + Reduction += InlineConstants::InstrCost; + for (Value::use_iterator UI = U->use_begin(), UE = U->use_end(); + UI != UE; ++UI) { + Instruction *I = dyn_cast<Instruction>(*UI); + if (!I || I->mayHaveSideEffects()) continue; + if (I->getNumOperands() == 1) + Worklist.push_back(I); + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) { + // If BO produces the same value as U, then the other operand is + // irrelevant and we can put it into the Worklist to continue + // deleting dead instructions. If BO produces the same value as the + // other operand, we can delete BO but that's it. + if (Result == true) { + if (BO->getOpcode() == Instruction::Or) + Worklist.push_back(I); + if (BO->getOpcode() == Instruction::And) + Reduction += InlineConstants::InstrCost; + } else { + if (BO->getOpcode() == Instruction::Or || + BO->getOpcode() == Instruction::Xor) + Reduction += InlineConstants::InstrCost; + if (BO->getOpcode() == Instruction::And) + Worklist.push_back(I); + } + } + if (BranchInst *BI = dyn_cast<BranchInst>(I)) { + BasicBlock *BB = BI->getSuccessor(Result ? 0 : 1); + if (BB->getSinglePredecessor()) + Reduction + += InlineConstants::InstrCost * Metrics.NumBBInsts.lookup(BB); } } - } + } while (!Worklist.empty()); + return Reduction; } -// CountCodeReductionForAlloca - Figure out an approximation of how much smaller -// the function will be if it is inlined into a context where an argument -// becomes an alloca. -// -unsigned CodeMetrics::CountCodeReductionForAlloca(Value *V) { +/// \brief Compute the reduction possible for a given instruction if we are able +/// to SROA an alloca. +/// +/// The reduction for this instruction is added to the SROAReduction output +/// parameter. Returns false if this instruction is expected to defeat SROA in +/// general. +static bool countCodeReductionForSROAInst(Instruction *I, + SmallVectorImpl<Value *> &Worklist, + unsigned &SROAReduction) { + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + if (!LI->isSimple()) + return false; + SROAReduction += InlineConstants::InstrCost; + return true; + } + + if (StoreInst *SI = dyn_cast<StoreInst>(I)) { + if (!SI->isSimple()) + return false; + SROAReduction += InlineConstants::InstrCost; + return true; + } + + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) { + // If the GEP has variable indices, we won't be able to do much with it. + if (!GEP->hasAllConstantIndices()) + return false; + // A non-zero GEP will likely become a mask operation after SROA. + if (GEP->hasAllZeroIndices()) + SROAReduction += InlineConstants::InstrCost; + Worklist.push_back(GEP); + return true; + } + + if (BitCastInst *BCI = dyn_cast<BitCastInst>(I)) { + // Track pointer through bitcasts. + Worklist.push_back(BCI); + SROAReduction += InlineConstants::InstrCost; + return true; + } + + // We just look for non-constant operands to ICmp instructions as those will + // defeat SROA. The actual reduction for these happens even without SROA. + if (ICmpInst *ICI = dyn_cast<ICmpInst>(I)) + return isa<Constant>(ICI->getOperand(1)); + + if (SelectInst *SI = dyn_cast<SelectInst>(I)) { + // SROA can handle a select of alloca iff all uses of the alloca are + // loads, and dereferenceable. We assume it's dereferenceable since + // we're told the input is an alloca. 
+ for (Value::use_iterator UI = SI->use_begin(), UE = SI->use_end(); + UI != UE; ++UI) { + LoadInst *LI = dyn_cast<LoadInst>(*UI); + if (LI == 0 || !LI->isSimple()) + return false; + } + // We don't know whether we'll be deleting the rest of the chain of + // instructions from the SelectInst on, because we don't know whether + // the other side of the select is also an alloca or not. + return true; + } + + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + default: + return false; + case Intrinsic::memset: + case Intrinsic::memcpy: + case Intrinsic::memmove: + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + // SROA can usually chew through these intrinsics. + SROAReduction += InlineConstants::InstrCost; + return true; + } + } + + // If there is some other strange instruction, we're not going to be + // able to do much if we inline this. + return false; +} + +unsigned InlineCostAnalyzer::FunctionInfo::countCodeReductionForAlloca( + const CodeMetrics &Metrics, Value *V) { if (!V->getType()->isPointerTy()) return 0; // Not a pointer unsigned Reduction = 0; + unsigned SROAReduction = 0; + bool CanSROAAlloca = true; - // Looking at ICmpInsts will never abort the analysis and return zero, and - // analyzing them is expensive, so save them for last so that we don't do - // extra work that we end up throwing out. - SmallVector<ICmpInst *, 4> ICmpInsts; + SmallVector<Value *, 4> Worklist; + Worklist.push_back(V); + do { + Value *V = Worklist.pop_back_val(); + for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); + UI != E; ++UI){ + Instruction *I = cast<Instruction>(*UI); + + if (ICmpInst *ICI = dyn_cast<ICmpInst>(I)) + Reduction += countCodeReductionForAllocaICmp(Metrics, ICI); + + if (CanSROAAlloca) + CanSROAAlloca = countCodeReductionForSROAInst(I, Worklist, + SROAReduction); + } + } while (!Worklist.empty()); + + return Reduction + (CanSROAAlloca ? SROAReduction : 0); +} +void InlineCostAnalyzer::FunctionInfo::countCodeReductionForPointerPair( + const CodeMetrics &Metrics, DenseMap<Value *, unsigned> &PointerArgs, + Value *V, unsigned ArgIdx) { SmallVector<Value *, 4> Worklist; Worklist.push_back(V); do { @@ -234,126 +249,57 @@ unsigned CodeMetrics::CountCodeReductionForAlloca(Value *V) { for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; ++UI){ Instruction *I = cast<Instruction>(*UI); - if (LoadInst *LI = dyn_cast<LoadInst>(I)) { - if (!LI->isSimple()) - return 0; - Reduction += InlineConstants::InstrCost; - } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) { - if (!SI->isSimple()) - return 0; - Reduction += InlineConstants::InstrCost; - } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) { + + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) { // If the GEP has variable indices, we won't be able to do much with it. if (!GEP->hasAllConstantIndices()) - return 0; - // A non-zero GEP will likely become a mask operation after SROA. - if (GEP->hasAllZeroIndices()) - Reduction += InlineConstants::InstrCost; + continue; + // Unless the GEP is in-bounds, some comparisons will be non-constant. + // Fortunately, the real-world cases where this occurs uses in-bounds + // GEPs, and so we restrict the optimization to them here. + if (!GEP->isInBounds()) + continue; + + // Constant indices just change the constant offset. Add the resulting + // value both to our worklist for this argument, and to the set of + // viable paired values with future arguments. 
+ PointerArgs[GEP] = ArgIdx; Worklist.push_back(GEP); - } else if (BitCastInst *BCI = dyn_cast<BitCastInst>(I)) { - // Track pointer through bitcasts. - Worklist.push_back(BCI); - Reduction += InlineConstants::InstrCost; - } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) { - // SROA can handle a select of alloca iff all uses of the alloca are - // loads, and dereferenceable. We assume it's dereferenceable since - // we're told the input is an alloca. - for (Value::use_iterator UI = SI->use_begin(), UE = SI->use_end(); - UI != UE; ++UI) { - LoadInst *LI = dyn_cast<LoadInst>(*UI); - if (LI == 0 || !LI->isSimple()) return 0; - } - // We don't know whether we'll be deleting the rest of the chain of - // instructions from the SelectInst on, because we don't know whether - // the other side of the select is also an alloca or not. continue; - } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { - switch (II->getIntrinsicID()) { - default: - return 0; - case Intrinsic::memset: - case Intrinsic::memcpy: - case Intrinsic::memmove: - case Intrinsic::lifetime_start: - case Intrinsic::lifetime_end: - // SROA can usually chew through these intrinsics. - Reduction += InlineConstants::InstrCost; - break; - } - } else if (ICmpInst *ICI = dyn_cast<ICmpInst>(I)) { - if (!isa<Constant>(ICI->getOperand(1))) - return 0; - ICmpInsts.push_back(ICI); - } else { - // If there is some other strange instruction, we're not going to be - // able to do much if we inline this. - return 0; } - } - } while (!Worklist.empty()); - - while (!ICmpInsts.empty()) { - ICmpInst *ICI = ICmpInsts.pop_back_val(); - // An icmp pred (alloca, C) becomes true if the predicate is true when - // equal and false otherwise. - bool Result = ICI->isTrueWhenEqual(); - - SmallVector<Instruction *, 4> Worklist; - Worklist.push_back(ICI); - do { - Instruction *U = Worklist.pop_back_val(); - Reduction += InlineConstants::InstrCost; - for (Value::use_iterator UI = U->use_begin(), UE = U->use_end(); - UI != UE; ++UI) { - Instruction *I = dyn_cast<Instruction>(*UI); - if (!I || I->mayHaveSideEffects()) continue; - if (I->getNumOperands() == 1) - Worklist.push_back(I); - if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) { - // If BO produces the same value as U, then the other operand is - // irrelevant and we can put it into the Worklist to continue - // deleting dead instructions. If BO produces the same value as the - // other operand, we can delete BO but that's it. - if (Result == true) { - if (BO->getOpcode() == Instruction::Or) - Worklist.push_back(I); - if (BO->getOpcode() == Instruction::And) - Reduction += InlineConstants::InstrCost; - } else { - if (BO->getOpcode() == Instruction::Or || - BO->getOpcode() == Instruction::Xor) - Reduction += InlineConstants::InstrCost; - if (BO->getOpcode() == Instruction::And) - Worklist.push_back(I); - } - } - if (BranchInst *BI = dyn_cast<BranchInst>(I)) { - BasicBlock *BB = BI->getSuccessor(Result ? 0 : 1); - if (BB->getSinglePredecessor()) - Reduction += InlineConstants::InstrCost * NumBBInsts[BB]; - } + // Track pointer through casts. Even when the result is not a pointer, it + // remains a constant relative to constants derived from other constant + // pointers. + if (CastInst *CI = dyn_cast<CastInst>(I)) { + PointerArgs[CI] = ArgIdx; + Worklist.push_back(CI); + continue; } - } while (!Worklist.empty()); - } - return Reduction; -} + // There are two instructions which produce a strict constant value when + // applied to two related pointer values. Ignore everything else. 
+ if (!isa<ICmpInst>(I) && I->getOpcode() != Instruction::Sub) + continue; + assert(I->getNumOperands() == 2); + + // Ensure that the two operands are in our set of potentially paired + // pointers (or are derived from them). + Value *OtherArg = I->getOperand(0); + if (OtherArg == V) + OtherArg = I->getOperand(1); + DenseMap<Value *, unsigned>::const_iterator ArgIt + = PointerArgs.find(OtherArg); + if (ArgIt == PointerArgs.end()) + continue; + std::pair<unsigned, unsigned> ArgPair(ArgIt->second, ArgIdx); + if (ArgPair.first > ArgPair.second) + std::swap(ArgPair.first, ArgPair.second); -/// analyzeFunction - Fill in the current structure with information gleaned -/// from the specified function. -void CodeMetrics::analyzeFunction(Function *F, const TargetData *TD) { - // If this function contains a call that "returns twice" (e.g., setjmp or - // _setjmp) and it isn't marked with "returns twice" itself, never inline it. - // This is a hack because we depend on the user marking their local variables - // as volatile if they are live across a setjmp call, and they probably - // won't do this in callers. - exposesReturnsTwice = F->callsFunctionThatReturnsTwice() && - !F->hasFnAttr(Attribute::ReturnsTwice); - - // Look at the size of the callee. - for (Function::const_iterator BB = F->begin(), E = F->end(); BB != E; ++BB) - analyzeBasicBlock(&*BB, TD); + PointerArgPairWeights[ArgPair] + += countCodeReductionForConstant(Metrics, I); + } + } while (!Worklist.empty()); } /// analyzeFunction - Fill in the current structure with information gleaned @@ -368,12 +314,25 @@ void InlineCostAnalyzer::FunctionInfo::analyzeFunction(Function *F, if (Metrics.NumRets==1) --Metrics.NumInsts; - // Check out all of the arguments to the function, figuring out how much - // code can be eliminated if one of the arguments is a constant. ArgumentWeights.reserve(F->arg_size()); - for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I) - ArgumentWeights.push_back(ArgInfo(Metrics.CountCodeReductionForConstant(I), - Metrics.CountCodeReductionForAlloca(I))); + DenseMap<Value *, unsigned> PointerArgs; + unsigned ArgIdx = 0; + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; + ++I, ++ArgIdx) { + // Count how much code can be eliminated if one of the arguments is + // a constant or an alloca. + ArgumentWeights.push_back(ArgInfo(countCodeReductionForConstant(Metrics, I), + countCodeReductionForAlloca(Metrics, I))); + + // If the argument is a pointer, also check for pairs of pointers where + // knowing a fixed offset between them allows simplification. This pattern + // arises mostly due to STL algorithm patterns where pointers are used as + // random access iterators. + if (!I->getType()->isPointerTy()) + continue; + PointerArgs[I] = ArgIdx; + countCodeReductionForPointerPair(Metrics, PointerArgs, I, ArgIdx); + } } /// NeverInline - returns true if the function should never be inlined into @@ -382,43 +341,6 @@ bool InlineCostAnalyzer::FunctionInfo::NeverInline() { return (Metrics.exposesReturnsTwice || Metrics.isRecursive || Metrics.containsIndirectBr); } -// getSpecializationBonus - The heuristic used to determine the per-call -// performance boost for using a specialization of Callee with argument -// specializedArgNo replaced by a constant. 
-int InlineCostAnalyzer::getSpecializationBonus(Function *Callee, - SmallVectorImpl<unsigned> &SpecializedArgNos) -{ - if (Callee->mayBeOverridden()) - return 0; - - int Bonus = 0; - // If this function uses the coldcc calling convention, prefer not to - // specialize it. - if (Callee->getCallingConv() == CallingConv::Cold) - Bonus -= InlineConstants::ColdccPenalty; - - // Get information about the callee. - FunctionInfo *CalleeFI = &CachedFunctionInfo[Callee]; - - // If we haven't calculated this information yet, do so now. - if (CalleeFI->Metrics.NumBlocks == 0) - CalleeFI->analyzeFunction(Callee, TD); - - unsigned ArgNo = 0; - unsigned i = 0; - for (Function::arg_iterator I = Callee->arg_begin(), E = Callee->arg_end(); - I != E; ++I, ++ArgNo) - if (ArgNo == SpecializedArgNos[i]) { - ++i; - Bonus += CountBonusForConstant(I); - } - - // Calls usually take a long time, so they make the specialization gain - // smaller. - Bonus -= CalleeFI->Metrics.NumCalls * InlineConstants::CallPenalty; - - return Bonus; -} // ConstantFunctionBonus - Figure out how much of a bonus we can get for // possibly devirtualizing a function. We'll subtract the size of the function @@ -522,6 +444,15 @@ int InlineCostAnalyzer::getInlineSize(CallSite CS, Function *Callee) { InlineCost -= CalleeFI->ArgumentWeights[ArgNo].ConstantWeight; } + const DenseMap<std::pair<unsigned, unsigned>, unsigned> &ArgPairWeights + = CalleeFI->PointerArgPairWeights; + for (DenseMap<std::pair<unsigned, unsigned>, unsigned>::const_iterator I + = ArgPairWeights.begin(), E = ArgPairWeights.end(); + I != E; ++I) + if (CS.getArgument(I->first.first)->stripInBoundsConstantOffsets() == + CS.getArgument(I->first.second)->stripInBoundsConstantOffsets()) + InlineCost -= I->second; + // Each argument passed in has a cost at both the caller and the callee // sides. Measurements show that each argument costs about the same as an // instruction. @@ -589,22 +520,18 @@ int InlineCostAnalyzer::getInlineBonuses(CallSite CS, Function *Callee) { // getInlineCost - The heuristic used to determine if we should inline the // function call or not. // -InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS, - SmallPtrSet<const Function*, 16> &NeverInline) { - return getInlineCost(CS, CS.getCalledFunction(), NeverInline); +InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS) { + return getInlineCost(CS, CS.getCalledFunction()); } -InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS, - Function *Callee, - SmallPtrSet<const Function*, 16> &NeverInline) { +InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS, Function *Callee) { Instruction *TheCall = CS.getInstruction(); Function *Caller = TheCall->getParent()->getParent(); // Don't inline functions which can be redefined at link-time to mean // something else. Don't inline functions marked noinline or call sites // marked noinline. - if (Callee->mayBeOverridden() || - Callee->hasFnAttr(Attribute::NoInline) || NeverInline.count(Callee) || + if (Callee->mayBeOverridden() || Callee->hasFnAttr(Attribute::NoInline) || CS.isNoInline()) return llvm::InlineCost::getNever(); @@ -655,38 +582,6 @@ InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS, return llvm::InlineCost::get(InlineCost); } -// getSpecializationCost - The heuristic used to determine the code-size -// impact of creating a specialized version of Callee with argument -// SpecializedArgNo replaced by a constant. 
-InlineCost InlineCostAnalyzer::getSpecializationCost(Function *Callee, - SmallVectorImpl<unsigned> &SpecializedArgNos) -{ - // Don't specialize functions which can be redefined at link-time to mean - // something else. - if (Callee->mayBeOverridden()) - return llvm::InlineCost::getNever(); - - // Get information about the callee. - FunctionInfo *CalleeFI = &CachedFunctionInfo[Callee]; - - // If we haven't calculated this information yet, do so now. - if (CalleeFI->Metrics.NumBlocks == 0) - CalleeFI->analyzeFunction(Callee, TD); - - int Cost = 0; - - // Look at the original size of the callee. Each instruction counts as 5. - Cost += CalleeFI->Metrics.NumInsts * InlineConstants::InstrCost; - - // Offset that with the amount of code that can be constant-folded - // away with the given arguments replaced by constants. - for (SmallVectorImpl<unsigned>::iterator an = SpecializedArgNos.begin(), - ae = SpecializedArgNos.end(); an != ae; ++an) - Cost -= CalleeFI->ArgumentWeights[*an].ConstantWeight; - - return llvm::InlineCost::get(Cost); -} - // getInlineFudgeFactor - Return a > 1.0 factor if the inliner should use a // higher threshold to determine if the function call should be inlined. float InlineCostAnalyzer::getInlineFudgeFactor(CallSite CS) { diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp index 370ab96..72e33d1 100644 --- a/lib/Analysis/InstructionSimplify.cpp +++ b/lib/Analysis/InstructionSimplify.cpp @@ -18,6 +18,7 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "instsimplify" +#include "llvm/GlobalAlias.h" #include "llvm/Operator.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -26,6 +27,7 @@ #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Support/ConstantRange.h" +#include "llvm/Support/GetElementPtrTypeIterator.h" #include "llvm/Support/PatternMatch.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Target/TargetData.h" @@ -38,21 +40,23 @@ STATISTIC(NumExpand, "Number of expansions"); STATISTIC(NumFactor , "Number of factorizations"); STATISTIC(NumReassoc, "Number of reassociations"); -static Value *SimplifyAndInst(Value *, Value *, const TargetData *, - const TargetLibraryInfo *, const DominatorTree *, - unsigned); -static Value *SimplifyBinOp(unsigned, Value *, Value *, const TargetData *, - const TargetLibraryInfo *, const DominatorTree *, +struct Query { + const TargetData *TD; + const TargetLibraryInfo *TLI; + const DominatorTree *DT; + + Query(const TargetData *td, const TargetLibraryInfo *tli, + const DominatorTree *dt) : TD(td), TLI(tli), DT(dt) {}; +}; + +static Value *SimplifyAndInst(Value *, Value *, const Query &, unsigned); +static Value *SimplifyBinOp(unsigned, Value *, Value *, const Query &, unsigned); -static Value *SimplifyCmpInst(unsigned, Value *, Value *, const TargetData *, - const TargetLibraryInfo *, const DominatorTree *, - unsigned); -static Value *SimplifyOrInst(Value *, Value *, const TargetData *, - const TargetLibraryInfo *, const DominatorTree *, - unsigned); -static Value *SimplifyXorInst(Value *, Value *, const TargetData *, - const TargetLibraryInfo *, const DominatorTree *, +static Value *SimplifyCmpInst(unsigned, Value *, Value *, const Query &, unsigned); +static Value *SimplifyOrInst(Value *, Value *, const Query &, unsigned); +static Value *SimplifyXorInst(Value *, Value *, const Query &, unsigned); +static Value *SimplifyTruncInst(Value *, Type *, const Query &, 
unsigned); /// getFalse - For a boolean type, or a vector of boolean type, return false, or /// a vector with every element false, as appropriate for the type. @@ -91,10 +95,20 @@ static bool ValueDominatesPHI(Value *V, PHINode *P, const DominatorTree *DT) { // Arguments and constants dominate all instructions. return true; + // If we are processing instructions (and/or basic blocks) that have not been + // fully added to a function, the parent nodes may still be null. Simply + // return the conservative answer in these cases. + if (!I->getParent() || !P->getParent() || !I->getParent()->getParent()) + return false; + // If we have a DominatorTree then do a precise test. - if (DT) - return !DT->isReachableFromEntry(P->getParent()) || - !DT->isReachableFromEntry(I->getParent()) || DT->dominates(I, P); + if (DT) { + if (!DT->isReachableFromEntry(P->getParent())) + return true; + if (!DT->isReachableFromEntry(I->getParent())) + return false; + return DT->dominates(I, P); + } // Otherwise, if the instruction is in the entry block, and is not an invoke, // then it obviously dominates all phi nodes. @@ -111,8 +125,7 @@ static bool ValueDominatesPHI(Value *V, PHINode *P, const DominatorTree *DT) { /// Also performs the transform "(A op' B) op C" -> "(A op C) op' (B op C)". /// Returns the simplified value, or null if no simplification was performed. static Value *ExpandBinOp(unsigned Opcode, Value *LHS, Value *RHS, - unsigned OpcToExpand, const TargetData *TD, - const TargetLibraryInfo *TLI, const DominatorTree *DT, + unsigned OpcToExpand, const Query &Q, unsigned MaxRecurse) { Instruction::BinaryOps OpcodeToExpand = (Instruction::BinaryOps)OpcToExpand; // Recursion is always used, so bail out at once if we already hit the limit. @@ -125,8 +138,8 @@ static Value *ExpandBinOp(unsigned Opcode, Value *LHS, Value *RHS, // It does! Try turning it into "(A op C) op' (B op C)". Value *A = Op0->getOperand(0), *B = Op0->getOperand(1), *C = RHS; // Do "A op C" and "B op C" both simplify? - if (Value *L = SimplifyBinOp(Opcode, A, C, TD, TLI, DT, MaxRecurse)) - if (Value *R = SimplifyBinOp(Opcode, B, C, TD, TLI, DT, MaxRecurse)) { + if (Value *L = SimplifyBinOp(Opcode, A, C, Q, MaxRecurse)) + if (Value *R = SimplifyBinOp(Opcode, B, C, Q, MaxRecurse)) { // They do! Return "L op' R" if it simplifies or is already available. // If "L op' R" equals "A op' B" then "L op' R" is just the LHS. if ((L == A && R == B) || (Instruction::isCommutative(OpcodeToExpand) @@ -135,8 +148,7 @@ static Value *ExpandBinOp(unsigned Opcode, Value *LHS, Value *RHS, return LHS; } // Otherwise return "L op' R" if it simplifies. - if (Value *V = SimplifyBinOp(OpcodeToExpand, L, R, TD, TLI, DT, - MaxRecurse)) { + if (Value *V = SimplifyBinOp(OpcodeToExpand, L, R, Q, MaxRecurse)) { ++NumExpand; return V; } @@ -149,8 +161,8 @@ static Value *ExpandBinOp(unsigned Opcode, Value *LHS, Value *RHS, // It does! Try turning it into "(A op B) op' (A op C)". Value *A = LHS, *B = Op1->getOperand(0), *C = Op1->getOperand(1); // Do "A op B" and "A op C" both simplify? - if (Value *L = SimplifyBinOp(Opcode, A, B, TD, TLI, DT, MaxRecurse)) - if (Value *R = SimplifyBinOp(Opcode, A, C, TD, TLI, DT, MaxRecurse)) { + if (Value *L = SimplifyBinOp(Opcode, A, B, Q, MaxRecurse)) + if (Value *R = SimplifyBinOp(Opcode, A, C, Q, MaxRecurse)) { // They do! Return "L op' R" if it simplifies or is already available. // If "L op' R" equals "B op' C" then "L op' R" is just the RHS. 
if ((L == B && R == C) || (Instruction::isCommutative(OpcodeToExpand) @@ -159,8 +171,7 @@ static Value *ExpandBinOp(unsigned Opcode, Value *LHS, Value *RHS, return RHS; } // Otherwise return "L op' R" if it simplifies. - if (Value *V = SimplifyBinOp(OpcodeToExpand, L, R, TD, TLI, DT, - MaxRecurse)) { + if (Value *V = SimplifyBinOp(OpcodeToExpand, L, R, Q, MaxRecurse)) { ++NumExpand; return V; } @@ -175,9 +186,7 @@ static Value *ExpandBinOp(unsigned Opcode, Value *LHS, Value *RHS, /// OpCodeToExtract is Mul then this tries to turn "(A*B)+(A*C)" into "A*(B+C)". /// Returns the simplified value, or null if no simplification was performed. static Value *FactorizeBinOp(unsigned Opcode, Value *LHS, Value *RHS, - unsigned OpcToExtract, const TargetData *TD, - const TargetLibraryInfo *TLI, - const DominatorTree *DT, + unsigned OpcToExtract, const Query &Q, unsigned MaxRecurse) { Instruction::BinaryOps OpcodeToExtract = (Instruction::BinaryOps)OpcToExtract; // Recursion is always used, so bail out at once if we already hit the limit. @@ -202,7 +211,7 @@ static Value *FactorizeBinOp(unsigned Opcode, Value *LHS, Value *RHS, Value *DD = A == C ? D : C; // Form "A op' (B op DD)" if it simplifies completely. // Does "B op DD" simplify? - if (Value *V = SimplifyBinOp(Opcode, B, DD, TD, TLI, DT, MaxRecurse)) { + if (Value *V = SimplifyBinOp(Opcode, B, DD, Q, MaxRecurse)) { // It does! Return "A op' V" if it simplifies or is already available. // If V equals B then "A op' V" is just the LHS. If V equals DD then // "A op' V" is just the RHS. @@ -211,8 +220,7 @@ static Value *FactorizeBinOp(unsigned Opcode, Value *LHS, Value *RHS, return V == B ? LHS : RHS; } // Otherwise return "A op' V" if it simplifies. - if (Value *W = SimplifyBinOp(OpcodeToExtract, A, V, TD, TLI, DT, - MaxRecurse)) { + if (Value *W = SimplifyBinOp(OpcodeToExtract, A, V, Q, MaxRecurse)) { ++NumFactor; return W; } @@ -226,7 +234,7 @@ static Value *FactorizeBinOp(unsigned Opcode, Value *LHS, Value *RHS, Value *CC = B == D ? C : D; // Form "(A op CC) op' B" if it simplifies completely.. // Does "A op CC" simplify? - if (Value *V = SimplifyBinOp(Opcode, A, CC, TD, TLI, DT, MaxRecurse)) { + if (Value *V = SimplifyBinOp(Opcode, A, CC, Q, MaxRecurse)) { // It does! Return "V op' B" if it simplifies or is already available. // If V equals A then "V op' B" is just the LHS. If V equals CC then // "V op' B" is just the RHS. @@ -235,8 +243,7 @@ static Value *FactorizeBinOp(unsigned Opcode, Value *LHS, Value *RHS, return V == A ? LHS : RHS; } // Otherwise return "V op' B" if it simplifies. - if (Value *W = SimplifyBinOp(OpcodeToExtract, V, B, TD, TLI, DT, - MaxRecurse)) { + if (Value *W = SimplifyBinOp(OpcodeToExtract, V, B, Q, MaxRecurse)) { ++NumFactor; return W; } @@ -249,10 +256,7 @@ static Value *FactorizeBinOp(unsigned Opcode, Value *LHS, Value *RHS, /// SimplifyAssociativeBinOp - Generic simplifications for associative binary /// operations. Returns the simpler value, or null if none was found. static Value *SimplifyAssociativeBinOp(unsigned Opc, Value *LHS, Value *RHS, - const TargetData *TD, - const TargetLibraryInfo *TLI, - const DominatorTree *DT, - unsigned MaxRecurse) { + const Query &Q, unsigned MaxRecurse) { Instruction::BinaryOps Opcode = (Instruction::BinaryOps)Opc; assert(Instruction::isAssociative(Opcode) && "Not an associative operation!"); @@ -270,12 +274,12 @@ static Value *SimplifyAssociativeBinOp(unsigned Opc, Value *LHS, Value *RHS, Value *C = RHS; // Does "B op C" simplify? 
- if (Value *V = SimplifyBinOp(Opcode, B, C, TD, TLI, DT, MaxRecurse)) { + if (Value *V = SimplifyBinOp(Opcode, B, C, Q, MaxRecurse)) { // It does! Return "A op V" if it simplifies or is already available. // If V equals B then "A op V" is just the LHS. if (V == B) return LHS; // Otherwise return "A op V" if it simplifies. - if (Value *W = SimplifyBinOp(Opcode, A, V, TD, TLI, DT, MaxRecurse)) { + if (Value *W = SimplifyBinOp(Opcode, A, V, Q, MaxRecurse)) { ++NumReassoc; return W; } @@ -289,12 +293,12 @@ static Value *SimplifyAssociativeBinOp(unsigned Opc, Value *LHS, Value *RHS, Value *C = Op1->getOperand(1); // Does "A op B" simplify? - if (Value *V = SimplifyBinOp(Opcode, A, B, TD, TLI, DT, MaxRecurse)) { + if (Value *V = SimplifyBinOp(Opcode, A, B, Q, MaxRecurse)) { // It does! Return "V op C" if it simplifies or is already available. // If V equals B then "V op C" is just the RHS. if (V == B) return RHS; // Otherwise return "V op C" if it simplifies. - if (Value *W = SimplifyBinOp(Opcode, V, C, TD, TLI, DT, MaxRecurse)) { + if (Value *W = SimplifyBinOp(Opcode, V, C, Q, MaxRecurse)) { ++NumReassoc; return W; } @@ -312,12 +316,12 @@ static Value *SimplifyAssociativeBinOp(unsigned Opc, Value *LHS, Value *RHS, Value *C = RHS; // Does "C op A" simplify? - if (Value *V = SimplifyBinOp(Opcode, C, A, TD, TLI, DT, MaxRecurse)) { + if (Value *V = SimplifyBinOp(Opcode, C, A, Q, MaxRecurse)) { // It does! Return "V op B" if it simplifies or is already available. // If V equals A then "V op B" is just the LHS. if (V == A) return LHS; // Otherwise return "V op B" if it simplifies. - if (Value *W = SimplifyBinOp(Opcode, V, B, TD, TLI, DT, MaxRecurse)) { + if (Value *W = SimplifyBinOp(Opcode, V, B, Q, MaxRecurse)) { ++NumReassoc; return W; } @@ -331,12 +335,12 @@ static Value *SimplifyAssociativeBinOp(unsigned Opc, Value *LHS, Value *RHS, Value *C = Op1->getOperand(1); // Does "C op A" simplify? - if (Value *V = SimplifyBinOp(Opcode, C, A, TD, TLI, DT, MaxRecurse)) { + if (Value *V = SimplifyBinOp(Opcode, C, A, Q, MaxRecurse)) { // It does! Return "B op V" if it simplifies or is already available. // If V equals C then "B op V" is just the RHS. if (V == C) return RHS; // Otherwise return "B op V" if it simplifies. - if (Value *W = SimplifyBinOp(Opcode, B, V, TD, TLI, DT, MaxRecurse)) { + if (Value *W = SimplifyBinOp(Opcode, B, V, Q, MaxRecurse)) { ++NumReassoc; return W; } @@ -351,10 +355,7 @@ static Value *SimplifyAssociativeBinOp(unsigned Opc, Value *LHS, Value *RHS, /// evaluating it on both branches of the select results in the same value. /// Returns the common value if so, otherwise returns null. static Value *ThreadBinOpOverSelect(unsigned Opcode, Value *LHS, Value *RHS, - const TargetData *TD, - const TargetLibraryInfo *TLI, - const DominatorTree *DT, - unsigned MaxRecurse) { + const Query &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. 
if (!MaxRecurse--) return 0; @@ -371,11 +372,11 @@ static Value *ThreadBinOpOverSelect(unsigned Opcode, Value *LHS, Value *RHS, Value *TV; Value *FV; if (SI == LHS) { - TV = SimplifyBinOp(Opcode, SI->getTrueValue(), RHS, TD, TLI, DT, MaxRecurse); - FV = SimplifyBinOp(Opcode, SI->getFalseValue(), RHS, TD, TLI, DT, MaxRecurse); + TV = SimplifyBinOp(Opcode, SI->getTrueValue(), RHS, Q, MaxRecurse); + FV = SimplifyBinOp(Opcode, SI->getFalseValue(), RHS, Q, MaxRecurse); } else { - TV = SimplifyBinOp(Opcode, LHS, SI->getTrueValue(), TD, TLI, DT, MaxRecurse); - FV = SimplifyBinOp(Opcode, LHS, SI->getFalseValue(), TD, TLI, DT, MaxRecurse); + TV = SimplifyBinOp(Opcode, LHS, SI->getTrueValue(), Q, MaxRecurse); + FV = SimplifyBinOp(Opcode, LHS, SI->getFalseValue(), Q, MaxRecurse); } // If they simplified to the same value, then return the common value. @@ -426,9 +427,7 @@ static Value *ThreadBinOpOverSelect(unsigned Opcode, Value *LHS, Value *RHS, /// result in the same value. Returns the common value if so, otherwise returns /// null. static Value *ThreadCmpOverSelect(CmpInst::Predicate Pred, Value *LHS, - Value *RHS, const TargetData *TD, - const TargetLibraryInfo *TLI, - const DominatorTree *DT, + Value *RHS, const Query &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) @@ -447,7 +446,7 @@ static Value *ThreadCmpOverSelect(CmpInst::Predicate Pred, Value *LHS, // Now that we have "cmp select(Cond, TV, FV), RHS", analyse it. // Does "cmp TV, RHS" simplify? - Value *TCmp = SimplifyCmpInst(Pred, TV, RHS, TD, TLI, DT, MaxRecurse); + Value *TCmp = SimplifyCmpInst(Pred, TV, RHS, Q, MaxRecurse); if (TCmp == Cond) { // It not only simplified, it simplified to the select condition. Replace // it with 'true'. @@ -461,7 +460,7 @@ static Value *ThreadCmpOverSelect(CmpInst::Predicate Pred, Value *LHS, } // Does "cmp FV, RHS" simplify? - Value *FCmp = SimplifyCmpInst(Pred, FV, RHS, TD, TLI, DT, MaxRecurse); + Value *FCmp = SimplifyCmpInst(Pred, FV, RHS, Q, MaxRecurse); if (FCmp == Cond) { // It not only simplified, it simplified to the select condition. Replace // it with 'false'. @@ -487,19 +486,19 @@ static Value *ThreadCmpOverSelect(CmpInst::Predicate Pred, Value *LHS, // is equal to "Cond && TCmp". This also catches the case when the false // value simplified to false and the true value to true, returning "Cond". if (match(FCmp, m_Zero())) - if (Value *V = SimplifyAndInst(Cond, TCmp, TD, TLI, DT, MaxRecurse)) + if (Value *V = SimplifyAndInst(Cond, TCmp, Q, MaxRecurse)) return V; // If the true value simplified to true, then the result of the compare // is equal to "Cond || FCmp". if (match(TCmp, m_One())) - if (Value *V = SimplifyOrInst(Cond, FCmp, TD, TLI, DT, MaxRecurse)) + if (Value *V = SimplifyOrInst(Cond, FCmp, Q, MaxRecurse)) return V; // Finally, if the false value simplified to true and the true value to // false, then the result of the compare is equal to "!Cond". if (match(FCmp, m_One()) && match(TCmp, m_Zero())) if (Value *V = SimplifyXorInst(Cond, Constant::getAllOnesValue(Cond->getType()), - TD, TLI, DT, MaxRecurse)) + Q, MaxRecurse)) return V; return 0; @@ -510,10 +509,7 @@ static Value *ThreadCmpOverSelect(CmpInst::Predicate Pred, Value *LHS, /// it on the incoming phi values yields the same result for every value. If so /// returns the common value, otherwise returns null. 
static Value *ThreadBinOpOverPHI(unsigned Opcode, Value *LHS, Value *RHS, - const TargetData *TD, - const TargetLibraryInfo *TLI, - const DominatorTree *DT, - unsigned MaxRecurse) { + const Query &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) return 0; @@ -522,13 +518,13 @@ static Value *ThreadBinOpOverPHI(unsigned Opcode, Value *LHS, Value *RHS, if (isa<PHINode>(LHS)) { PI = cast<PHINode>(LHS); // Bail out if RHS and the phi may be mutually interdependent due to a loop. - if (!ValueDominatesPHI(RHS, PI, DT)) + if (!ValueDominatesPHI(RHS, PI, Q.DT)) return 0; } else { assert(isa<PHINode>(RHS) && "No PHI instruction operand!"); PI = cast<PHINode>(RHS); // Bail out if LHS and the phi may be mutually interdependent due to a loop. - if (!ValueDominatesPHI(LHS, PI, DT)) + if (!ValueDominatesPHI(LHS, PI, Q.DT)) return 0; } @@ -539,8 +535,8 @@ static Value *ThreadBinOpOverPHI(unsigned Opcode, Value *LHS, Value *RHS, // If the incoming value is the phi node itself, it can safely be skipped. if (Incoming == PI) continue; Value *V = PI == LHS ? - SimplifyBinOp(Opcode, Incoming, RHS, TD, TLI, DT, MaxRecurse) : - SimplifyBinOp(Opcode, LHS, Incoming, TD, TLI, DT, MaxRecurse); + SimplifyBinOp(Opcode, Incoming, RHS, Q, MaxRecurse) : + SimplifyBinOp(Opcode, LHS, Incoming, Q, MaxRecurse); // If the operation failed to simplify, or simplified to a different value // to previously, then give up. if (!V || (CommonValue && V != CommonValue)) @@ -556,10 +552,7 @@ static Value *ThreadBinOpOverPHI(unsigned Opcode, Value *LHS, Value *RHS, /// incoming phi values yields the same result every time. If so returns the /// common result, otherwise returns null. static Value *ThreadCmpOverPHI(CmpInst::Predicate Pred, Value *LHS, Value *RHS, - const TargetData *TD, - const TargetLibraryInfo *TLI, - const DominatorTree *DT, - unsigned MaxRecurse) { + const Query &Q, unsigned MaxRecurse) { // Recursion is always used, so bail out at once if we already hit the limit. if (!MaxRecurse--) return 0; @@ -573,7 +566,7 @@ static Value *ThreadCmpOverPHI(CmpInst::Predicate Pred, Value *LHS, Value *RHS, PHINode *PI = cast<PHINode>(LHS); // Bail out if RHS and the phi may be mutually interdependent due to a loop. - if (!ValueDominatesPHI(RHS, PI, DT)) + if (!ValueDominatesPHI(RHS, PI, Q.DT)) return 0; // Evaluate the BinOp on the incoming phi values. @@ -582,7 +575,7 @@ static Value *ThreadCmpOverPHI(CmpInst::Predicate Pred, Value *LHS, Value *RHS, Value *Incoming = PI->getIncomingValue(i); // If the incoming value is the phi node itself, it can safely be skipped. if (Incoming == PI) continue; - Value *V = SimplifyCmpInst(Pred, Incoming, RHS, TD, TLI, DT, MaxRecurse); + Value *V = SimplifyCmpInst(Pred, Incoming, RHS, Q, MaxRecurse); // If the operation failed to simplify, or simplified to a different value // to previously, then give up. if (!V || (CommonValue && V != CommonValue)) @@ -596,15 +589,12 @@ static Value *ThreadCmpOverPHI(CmpInst::Predicate Pred, Value *LHS, Value *RHS, /// SimplifyAddInst - Given operands for an Add, see if we can /// fold the result. If not, this returns null. 
static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, - const TargetData *TD, - const TargetLibraryInfo *TLI, - const DominatorTree *DT, - unsigned MaxRecurse) { + const Query &Q, unsigned MaxRecurse) { if (Constant *CLHS = dyn_cast<Constant>(Op0)) { if (Constant *CRHS = dyn_cast<Constant>(Op1)) { Constant *Ops[] = { CLHS, CRHS }; - return ConstantFoldInstOperands(Instruction::Add, CLHS->getType(), - Ops, TD, TLI); + return ConstantFoldInstOperands(Instruction::Add, CLHS->getType(), Ops, + Q.TD, Q.TLI); } // Canonicalize the constant to the RHS. @@ -634,17 +624,17 @@ static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, /// i1 add -> xor. if (MaxRecurse && Op0->getType()->isIntegerTy(1)) - if (Value *V = SimplifyXorInst(Op0, Op1, TD, TLI, DT, MaxRecurse-1)) + if (Value *V = SimplifyXorInst(Op0, Op1, Q, MaxRecurse-1)) return V; // Try some generic simplifications for associative operations. - if (Value *V = SimplifyAssociativeBinOp(Instruction::Add, Op0, Op1, TD, TLI, DT, + if (Value *V = SimplifyAssociativeBinOp(Instruction::Add, Op0, Op1, Q, MaxRecurse)) return V; // Mul distributes over Add. Try some generic simplifications based on this. if (Value *V = FactorizeBinOp(Instruction::Add, Op0, Op1, Instruction::Mul, - TD, TLI, DT, MaxRecurse)) + Q, MaxRecurse)) return V; // Threading Add over selects and phi nodes is pointless, so don't bother. @@ -662,21 +652,114 @@ static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, Value *llvm::SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, const TargetData *TD, const TargetLibraryInfo *TLI, const DominatorTree *DT) { - return ::SimplifyAddInst(Op0, Op1, isNSW, isNUW, TD, TLI, DT, RecursionLimit); + return ::SimplifyAddInst(Op0, Op1, isNSW, isNUW, Query (TD, TLI, DT), + RecursionLimit); +} + +/// \brief Accumulate the constant integer offset a GEP represents. +/// +/// Given a getelementptr instruction/constantexpr, accumulate the constant +/// offset from the base pointer into the provided APInt 'Offset'. Returns true +/// if the GEP has all-constant indices. Returns false if any non-constant +/// index is encountered leaving the 'Offset' in an undefined state. The +/// 'Offset' APInt must be the bitwidth of the target's pointer size. +static bool accumulateGEPOffset(const TargetData &TD, GEPOperator *GEP, + APInt &Offset) { + unsigned IntPtrWidth = TD.getPointerSizeInBits(); + assert(IntPtrWidth == Offset.getBitWidth()); + + gep_type_iterator GTI = gep_type_begin(GEP); + for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end(); I != E; + ++I, ++GTI) { + ConstantInt *OpC = dyn_cast<ConstantInt>(*I); + if (!OpC) return false; + if (OpC->isZero()) continue; + + // Handle a struct index, which adds its field offset to the pointer. + if (StructType *STy = dyn_cast<StructType>(*GTI)) { + unsigned ElementIdx = OpC->getZExtValue(); + const StructLayout *SL = TD.getStructLayout(STy); + Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx)); + continue; + } + + APInt TypeSize(IntPtrWidth, TD.getTypeAllocSize(GTI.getIndexedType())); + Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize; + } + return true; +} + +/// \brief Compute the base pointer and cumulative constant offsets for V. +/// +/// This strips all constant offsets off of V, leaving it the base pointer, and +/// accumulates the total constant offset applied in the returned constant. 
It +/// returns 0 if V is not a pointer, and returns the constant '0' if there are +/// no constant offsets applied. +static Constant *stripAndComputeConstantOffsets(const TargetData &TD, + Value *&V) { + if (!V->getType()->isPointerTy()) + return 0; + + unsigned IntPtrWidth = TD.getPointerSizeInBits(); + APInt Offset = APInt::getNullValue(IntPtrWidth); + + // Even though we don't look through PHI nodes, we could be called on an + // instruction in an unreachable block, which may be on a cycle. + SmallPtrSet<Value *, 4> Visited; + Visited.insert(V); + do { + if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) { + if (!accumulateGEPOffset(TD, GEP, Offset)) + break; + V = GEP->getPointerOperand(); + } else if (Operator::getOpcode(V) == Instruction::BitCast) { + V = cast<Operator>(V)->getOperand(0); + } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) { + if (GA->mayBeOverridden()) + break; + V = GA->getAliasee(); + } else { + break; + } + assert(V->getType()->isPointerTy() && "Unexpected operand type!"); + } while (Visited.insert(V)); + + Type *IntPtrTy = TD.getIntPtrType(V->getContext()); + return ConstantInt::get(IntPtrTy, Offset); +} + +/// \brief Compute the constant difference between two pointer values. +/// If the difference is not a constant, returns zero. +static Constant *computePointerDifference(const TargetData &TD, + Value *LHS, Value *RHS) { + Constant *LHSOffset = stripAndComputeConstantOffsets(TD, LHS); + if (!LHSOffset) + return 0; + Constant *RHSOffset = stripAndComputeConstantOffsets(TD, RHS); + if (!RHSOffset) + return 0; + + // If LHS and RHS are not related via constant offsets to the same base + // value, there is nothing we can do here. + if (LHS != RHS) + return 0; + + // Otherwise, the difference of LHS - RHS can be computed as: + // LHS - RHS + // = (LHSOffset + Base) - (RHSOffset + Base) + // = LHSOffset - RHSOffset + return ConstantExpr::getSub(LHSOffset, RHSOffset); } /// SimplifySubInst - Given operands for a Sub, see if we can /// fold the result. If not, this returns null. static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, - const TargetData *TD, - const TargetLibraryInfo *TLI, - const DominatorTree *DT, - unsigned MaxRecurse) { + const Query &Q, unsigned MaxRecurse) { if (Constant *CLHS = dyn_cast<Constant>(Op0)) if (Constant *CRHS = dyn_cast<Constant>(Op1)) { Constant *Ops[] = { CLHS, CRHS }; return ConstantFoldInstOperands(Instruction::Sub, CLHS->getType(), - Ops, TD, TLI); + Ops, Q.TD, Q.TLI); } // X - undef -> undef @@ -704,19 +787,17 @@ static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, Value *Y = 0, *Z = Op1; if (MaxRecurse && match(Op0, m_Add(m_Value(X), m_Value(Y)))) { // (X + Y) - Z // See if "V === Y - Z" simplifies. - if (Value *V = SimplifyBinOp(Instruction::Sub, Y, Z, TD, TLI, DT, MaxRecurse-1)) + if (Value *V = SimplifyBinOp(Instruction::Sub, Y, Z, Q, MaxRecurse-1)) // It does! Now see if "X + V" simplifies. - if (Value *W = SimplifyBinOp(Instruction::Add, X, V, TD, TLI, DT, - MaxRecurse-1)) { + if (Value *W = SimplifyBinOp(Instruction::Add, X, V, Q, MaxRecurse-1)) { // It does, we successfully reassociated! ++NumReassoc; return W; } // See if "V === X - Z" simplifies. - if (Value *V = SimplifyBinOp(Instruction::Sub, X, Z, TD, TLI, DT, MaxRecurse-1)) + if (Value *V = SimplifyBinOp(Instruction::Sub, X, Z, Q, MaxRecurse-1)) // It does! Now see if "Y + V" simplifies. 
- if (Value *W = SimplifyBinOp(Instruction::Add, Y, V, TD, TLI, DT, - MaxRecurse-1)) { + if (Value *W = SimplifyBinOp(Instruction::Add, Y, V, Q, MaxRecurse-1)) { // It does, we successfully reassociated! ++NumReassoc; return W; @@ -728,19 +809,17 @@ static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, X = Op0; if (MaxRecurse && match(Op1, m_Add(m_Value(Y), m_Value(Z)))) { // X - (Y + Z) // See if "V === X - Y" simplifies. - if (Value *V = SimplifyBinOp(Instruction::Sub, X, Y, TD, TLI, DT, MaxRecurse-1)) + if (Value *V = SimplifyBinOp(Instruction::Sub, X, Y, Q, MaxRecurse-1)) // It does! Now see if "V - Z" simplifies. - if (Value *W = SimplifyBinOp(Instruction::Sub, V, Z, TD, TLI, DT, - MaxRecurse-1)) { + if (Value *W = SimplifyBinOp(Instruction::Sub, V, Z, Q, MaxRecurse-1)) { // It does, we successfully reassociated! ++NumReassoc; return W; } // See if "V === X - Z" simplifies. - if (Value *V = SimplifyBinOp(Instruction::Sub, X, Z, TD, TLI, DT, MaxRecurse-1)) + if (Value *V = SimplifyBinOp(Instruction::Sub, X, Z, Q, MaxRecurse-1)) // It does! Now see if "V - Y" simplifies. - if (Value *W = SimplifyBinOp(Instruction::Sub, V, Y, TD, TLI, DT, - MaxRecurse-1)) { + if (Value *W = SimplifyBinOp(Instruction::Sub, V, Y, Q, MaxRecurse-1)) { // It does, we successfully reassociated! ++NumReassoc; return W; @@ -752,23 +831,39 @@ static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, Z = Op0; if (MaxRecurse && match(Op1, m_Sub(m_Value(X), m_Value(Y)))) // Z - (X - Y) // See if "V === Z - X" simplifies. - if (Value *V = SimplifyBinOp(Instruction::Sub, Z, X, TD, TLI, DT, MaxRecurse-1)) + if (Value *V = SimplifyBinOp(Instruction::Sub, Z, X, Q, MaxRecurse-1)) // It does! Now see if "V + Y" simplifies. - if (Value *W = SimplifyBinOp(Instruction::Add, V, Y, TD, TLI, DT, - MaxRecurse-1)) { + if (Value *W = SimplifyBinOp(Instruction::Add, V, Y, Q, MaxRecurse-1)) { // It does, we successfully reassociated! ++NumReassoc; return W; } + // trunc(X) - trunc(Y) -> trunc(X - Y) if everything simplifies. + if (MaxRecurse && match(Op0, m_Trunc(m_Value(X))) && + match(Op1, m_Trunc(m_Value(Y)))) + if (X->getType() == Y->getType()) + // See if "V === X - Y" simplifies. + if (Value *V = SimplifyBinOp(Instruction::Sub, X, Y, Q, MaxRecurse-1)) + // It does! Now see if "trunc V" simplifies. + if (Value *W = SimplifyTruncInst(V, Op0->getType(), Q, MaxRecurse-1)) + // It does, return the simplified "trunc V". + return W; + + // Variations on GEP(base, I, ...) - GEP(base, i, ...) -> GEP(null, I-i, ...). + if (Q.TD && match(Op0, m_PtrToInt(m_Value(X))) && + match(Op1, m_PtrToInt(m_Value(Y)))) + if (Constant *Result = computePointerDifference(*Q.TD, X, Y)) + return ConstantExpr::getIntegerCast(Result, Op0->getType(), true); + // Mul distributes over Sub. Try some generic simplifications based on this. if (Value *V = FactorizeBinOp(Instruction::Sub, Op0, Op1, Instruction::Mul, - TD, TLI, DT, MaxRecurse)) + Q, MaxRecurse)) return V; // i1 sub -> xor. if (MaxRecurse && Op0->getType()->isIntegerTy(1)) - if (Value *V = SimplifyXorInst(Op0, Op1, TD, TLI, DT, MaxRecurse-1)) + if (Value *V = SimplifyXorInst(Op0, Op1, Q, MaxRecurse-1)) return V; // Threading Sub over selects and phi nodes is pointless, so don't bother. 
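// The SimplifySubInst hunk above adds two folds: "trunc(X) - trunc(Y) ->
// trunc(X - Y)" when everything simplifies, and folding a sub of two
// ptrtoints whose operands differ only by constant GEP offsets.  A worked
// illustration of the second fold (hypothetical IR, not from the patch):
//
//   %a  = getelementptr inbounds i8* %p, i64 12
//   %ia = ptrtoint i8* %a to i64
//   %ip = ptrtoint i8* %p to i64
//   %d  = sub i64 %ia, %ip
//
// stripAndComputeConstantOffsets walks both operands back to the common base
// %p, accumulating offsets 12 and 0, so computePointerDifference returns the
// constant 12 and %d simplifies to i64 12.  A hedged caller-side helper (not
// part of the patch; IA and IP stand for the two ptrtoint values above):
static Value *foldPointerDiff(Value *IA, Value *IP, const TargetData *TD,
                              const TargetLibraryInfo *TLI,
                              const DominatorTree *DT) {
  // Returns the constant byte difference (i64 12 here) or null if the two
  // operands are not constant offsets from one common base pointer.
  return llvm::SimplifySubInst(IA, IP, /*isNSW*/ false, /*isNUW*/ false,
                               TD, TLI, DT);
}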
@@ -784,22 +879,21 @@ static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, } Value *llvm::SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, - const TargetData *TD, - const TargetLibraryInfo *TLI, + const TargetData *TD, const TargetLibraryInfo *TLI, const DominatorTree *DT) { - return ::SimplifySubInst(Op0, Op1, isNSW, isNUW, TD, TLI, DT, RecursionLimit); + return ::SimplifySubInst(Op0, Op1, isNSW, isNUW, Query (TD, TLI, DT), + RecursionLimit); } /// SimplifyMulInst - Given operands for a Mul, see if we can /// fold the result. If not, this returns null. -static Value *SimplifyMulInst(Value *Op0, Value *Op1, const TargetData *TD, - const TargetLibraryInfo *TLI, - const DominatorTree *DT, unsigned MaxRecurse) { +static Value *SimplifyMulInst(Value *Op0, Value *Op1, const Query &Q, + unsigned MaxRecurse) { if (Constant *CLHS = dyn_cast<Constant>(Op0)) { if (Constant *CRHS = dyn_cast<Constant>(Op1)) { Constant *Ops[] = { CLHS, CRHS }; return ConstantFoldInstOperands(Instruction::Mul, CLHS->getType(), - Ops, TD, TLI); + Ops, Q.TD, Q.TLI); } // Canonicalize the constant to the RHS. @@ -826,30 +920,30 @@ static Value *SimplifyMulInst(Value *Op0, Value *Op1, const TargetData *TD, // i1 mul -> and. if (MaxRecurse && Op0->getType()->isIntegerTy(1)) - if (Value *V = SimplifyAndInst(Op0, Op1, TD, TLI, DT, MaxRecurse-1)) + if (Value *V = SimplifyAndInst(Op0, Op1, Q, MaxRecurse-1)) return V; // Try some generic simplifications for associative operations. - if (Value *V = SimplifyAssociativeBinOp(Instruction::Mul, Op0, Op1, TD, TLI, DT, + if (Value *V = SimplifyAssociativeBinOp(Instruction::Mul, Op0, Op1, Q, MaxRecurse)) return V; // Mul distributes over Add. Try some generic simplifications based on this. if (Value *V = ExpandBinOp(Instruction::Mul, Op0, Op1, Instruction::Add, - TD, TLI, DT, MaxRecurse)) + Q, MaxRecurse)) return V; // If the operation is with the result of a select instruction, check whether // operating on either branch of the select always yields the same value. if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1)) - if (Value *V = ThreadBinOpOverSelect(Instruction::Mul, Op0, Op1, TD, TLI, DT, + if (Value *V = ThreadBinOpOverSelect(Instruction::Mul, Op0, Op1, Q, MaxRecurse)) return V; // If the operation is with the result of a phi instruction, check whether // operating on all incoming values of the phi always yields the same value. if (isa<PHINode>(Op0) || isa<PHINode>(Op1)) - if (Value *V = ThreadBinOpOverPHI(Instruction::Mul, Op0, Op1, TD, TLI, DT, + if (Value *V = ThreadBinOpOverPHI(Instruction::Mul, Op0, Op1, Q, MaxRecurse)) return V; @@ -859,18 +953,17 @@ static Value *SimplifyMulInst(Value *Op0, Value *Op1, const TargetData *TD, Value *llvm::SimplifyMulInst(Value *Op0, Value *Op1, const TargetData *TD, const TargetLibraryInfo *TLI, const DominatorTree *DT) { - return ::SimplifyMulInst(Op0, Op1, TD, TLI, DT, RecursionLimit); + return ::SimplifyMulInst(Op0, Op1, Query (TD, TLI, DT), RecursionLimit); } /// SimplifyDiv - Given operands for an SDiv or UDiv, see if we can /// fold the result. If not, this returns null. 
static Value *SimplifyDiv(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, - const TargetData *TD, const TargetLibraryInfo *TLI, - const DominatorTree *DT, unsigned MaxRecurse) { + const Query &Q, unsigned MaxRecurse) { if (Constant *C0 = dyn_cast<Constant>(Op0)) { if (Constant *C1 = dyn_cast<Constant>(Op1)) { Constant *Ops[] = { C0, C1 }; - return ConstantFoldInstOperands(Opcode, C0->getType(), Ops, TD, TLI); + return ConstantFoldInstOperands(Opcode, C0->getType(), Ops, Q.TD, Q.TLI); } } @@ -923,15 +1016,13 @@ static Value *SimplifyDiv(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, // If the operation is with the result of a select instruction, check whether // operating on either branch of the select always yields the same value. if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1)) - if (Value *V = ThreadBinOpOverSelect(Opcode, Op0, Op1, TD, TLI, DT, - MaxRecurse)) + if (Value *V = ThreadBinOpOverSelect(Opcode, Op0, Op1, Q, MaxRecurse)) return V; // If the operation is with the result of a phi instruction, check whether // operating on all incoming values of the phi always yields the same value. if (isa<PHINode>(Op0) || isa<PHINode>(Op1)) - if (Value *V = ThreadBinOpOverPHI(Opcode, Op0, Op1, TD, TLI, DT, - MaxRecurse)) + if (Value *V = ThreadBinOpOverPHI(Opcode, Op0, Op1, Q, MaxRecurse)) return V; return 0; @@ -939,11 +1030,9 @@ static Value *SimplifyDiv(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, /// SimplifySDivInst - Given operands for an SDiv, see if we can /// fold the result. If not, this returns null. -static Value *SimplifySDivInst(Value *Op0, Value *Op1, const TargetData *TD, - const TargetLibraryInfo *TLI, - const DominatorTree *DT, unsigned MaxRecurse) { - if (Value *V = SimplifyDiv(Instruction::SDiv, Op0, Op1, TD, TLI, DT, - MaxRecurse)) +static Value *SimplifySDivInst(Value *Op0, Value *Op1, const Query &Q, + unsigned MaxRecurse) { + if (Value *V = SimplifyDiv(Instruction::SDiv, Op0, Op1, Q, MaxRecurse)) return V; return 0; @@ -952,16 +1041,14 @@ static Value *SimplifySDivInst(Value *Op0, Value *Op1, const TargetData *TD, Value *llvm::SimplifySDivInst(Value *Op0, Value *Op1, const TargetData *TD, const TargetLibraryInfo *TLI, const DominatorTree *DT) { - return ::SimplifySDivInst(Op0, Op1, TD, TLI, DT, RecursionLimit); + return ::SimplifySDivInst(Op0, Op1, Query (TD, TLI, DT), RecursionLimit); } /// SimplifyUDivInst - Given operands for a UDiv, see if we can /// fold the result. If not, this returns null. 
-static Value *SimplifyUDivInst(Value *Op0, Value *Op1, const TargetData *TD, - const TargetLibraryInfo *TLI, - const DominatorTree *DT, unsigned MaxRecurse) { - if (Value *V = SimplifyDiv(Instruction::UDiv, Op0, Op1, TD, TLI, DT, - MaxRecurse)) +static Value *SimplifyUDivInst(Value *Op0, Value *Op1, const Query &Q, + unsigned MaxRecurse) { + if (Value *V = SimplifyDiv(Instruction::UDiv, Op0, Op1, Q, MaxRecurse)) return V; return 0; @@ -970,12 +1057,11 @@ static Value *SimplifyUDivInst(Value *Op0, Value *Op1, const TargetData *TD, Value *llvm::SimplifyUDivInst(Value *Op0, Value *Op1, const TargetData *TD, const TargetLibraryInfo *TLI, const DominatorTree *DT) { - return ::SimplifyUDivInst(Op0, Op1, TD, TLI, DT, RecursionLimit); + return ::SimplifyUDivInst(Op0, Op1, Query (TD, TLI, DT), RecursionLimit); } -static Value *SimplifyFDivInst(Value *Op0, Value *Op1, const TargetData *, - const TargetLibraryInfo *, - const DominatorTree *, unsigned) { +static Value *SimplifyFDivInst(Value *Op0, Value *Op1, const Query &Q, + unsigned) { // undef / X -> undef (the undef could be a snan). if (match(Op0, m_Undef())) return Op0; @@ -990,18 +1076,17 @@ static Value *SimplifyFDivInst(Value *Op0, Value *Op1, const TargetData *, Value *llvm::SimplifyFDivInst(Value *Op0, Value *Op1, const TargetData *TD, const TargetLibraryInfo *TLI, const DominatorTree *DT) { - return ::SimplifyFDivInst(Op0, Op1, TD, TLI, DT, RecursionLimit); + return ::SimplifyFDivInst(Op0, Op1, Query (TD, TLI, DT), RecursionLimit); } /// SimplifyRem - Given operands for an SRem or URem, see if we can /// fold the result. If not, this returns null. static Value *SimplifyRem(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, - const TargetData *TD, const TargetLibraryInfo *TLI, - const DominatorTree *DT, unsigned MaxRecurse) { + const Query &Q, unsigned MaxRecurse) { if (Constant *C0 = dyn_cast<Constant>(Op0)) { if (Constant *C1 = dyn_cast<Constant>(Op1)) { Constant *Ops[] = { C0, C1 }; - return ConstantFoldInstOperands(Opcode, C0->getType(), Ops, TD, TLI); + return ConstantFoldInstOperands(Opcode, C0->getType(), Ops, Q.TD, Q.TLI); } } @@ -1036,13 +1121,13 @@ static Value *SimplifyRem(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, // If the operation is with the result of a select instruction, check whether // operating on either branch of the select always yields the same value. if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1)) - if (Value *V = ThreadBinOpOverSelect(Opcode, Op0, Op1, TD, TLI, DT, MaxRecurse)) + if (Value *V = ThreadBinOpOverSelect(Opcode, Op0, Op1, Q, MaxRecurse)) return V; // If the operation is with the result of a phi instruction, check whether // operating on all incoming values of the phi always yields the same value. if (isa<PHINode>(Op0) || isa<PHINode>(Op1)) - if (Value *V = ThreadBinOpOverPHI(Opcode, Op0, Op1, TD, TLI, DT, MaxRecurse)) + if (Value *V = ThreadBinOpOverPHI(Opcode, Op0, Op1, Q, MaxRecurse)) return V; return 0; @@ -1050,11 +1135,9 @@ static Value *SimplifyRem(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1, /// SimplifySRemInst - Given operands for an SRem, see if we can /// fold the result. If not, this returns null. 
-static Value *SimplifySRemInst(Value *Op0, Value *Op1, const TargetData *TD, - const TargetLibraryInfo *TLI, - const DominatorTree *DT, - unsigned MaxRecurse) { - if (Value *V = SimplifyRem(Instruction::SRem, Op0, Op1, TD, TLI, DT, MaxRecurse)) +static Value *SimplifySRemInst(Value *Op0, Value *Op1, const Query &Q, + unsigned MaxRecurse) { + if (Value *V = SimplifyRem(Instruction::SRem, Op0, Op1, Q, MaxRecurse)) return V; return 0; @@ -1063,16 +1146,14 @@ static Value *SimplifySRemInst(Value *Op0, Value *Op1, const TargetData *TD, Value *llvm::SimplifySRemInst(Value *Op0, Value *Op1, const TargetData *TD, const TargetLibraryInfo *TLI, const DominatorTree *DT) { - return ::SimplifySRemInst(Op0, Op1, TD, TLI, DT, RecursionLimit); + return ::SimplifySRemInst(Op0, Op1, Query (TD, TLI, DT), RecursionLimit); } /// SimplifyURemInst - Given operands for a URem, see if we can /// fold the result. If not, this returns null. -static Value *SimplifyURemInst(Value *Op0, Value *Op1, const TargetData *TD, - const TargetLibraryInfo *TLI, - const DominatorTree *DT, +static Value *SimplifyURemInst(Value *Op0, Value *Op1, const Query &Q, unsigned MaxRecurse) { - if (Value *V = SimplifyRem(Instruction::URem, Op0, Op1, TD, TLI, DT, MaxRecurse)) + if (Value *V = SimplifyRem(Instruction::URem, Op0, Op1, Q, MaxRecurse)) return V; return 0; @@ -1081,12 +1162,10 @@ static Value *SimplifyURemInst(Value *Op0, Value *Op1, const TargetData *TD, Value *llvm::SimplifyURemInst(Value *Op0, Value *Op1, const TargetData *TD, const TargetLibraryInfo *TLI, const DominatorTree *DT) { - return ::SimplifyURemInst(Op0, Op1, TD, TLI, DT, RecursionLimit); + return ::SimplifyURemInst(Op0, Op1, Query (TD, TLI, DT), RecursionLimit); } -static Value *SimplifyFRemInst(Value *Op0, Value *Op1, const TargetData *, - const TargetLibraryInfo *, - const DominatorTree *, +static Value *SimplifyFRemInst(Value *Op0, Value *Op1, const Query &, unsigned) { // undef % X -> undef (the undef could be a snan). if (match(Op0, m_Undef())) @@ -1102,18 +1181,17 @@ static Value *SimplifyFRemInst(Value *Op0, Value *Op1, const TargetData *, Value *llvm::SimplifyFRemInst(Value *Op0, Value *Op1, const TargetData *TD, const TargetLibraryInfo *TLI, const DominatorTree *DT) { - return ::SimplifyFRemInst(Op0, Op1, TD, TLI, DT, RecursionLimit); + return ::SimplifyFRemInst(Op0, Op1, Query (TD, TLI, DT), RecursionLimit); } /// SimplifyShift - Given operands for an Shl, LShr or AShr, see if we can /// fold the result. If not, this returns null. static Value *SimplifyShift(unsigned Opcode, Value *Op0, Value *Op1, - const TargetData *TD, const TargetLibraryInfo *TLI, - const DominatorTree *DT, unsigned MaxRecurse) { + const Query &Q, unsigned MaxRecurse) { if (Constant *C0 = dyn_cast<Constant>(Op0)) { if (Constant *C1 = dyn_cast<Constant>(Op1)) { Constant *Ops[] = { C0, C1 }; - return ConstantFoldInstOperands(Opcode, C0->getType(), Ops, TD, TLI); + return ConstantFoldInstOperands(Opcode, C0->getType(), Ops, Q.TD, Q.TLI); } } @@ -1138,13 +1216,13 @@ static Value *SimplifyShift(unsigned Opcode, Value *Op0, Value *Op1, // If the operation is with the result of a select instruction, check whether // operating on either branch of the select always yields the same value. 
if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1)) - if (Value *V = ThreadBinOpOverSelect(Opcode, Op0, Op1, TD, TLI, DT, MaxRecurse)) + if (Value *V = ThreadBinOpOverSelect(Opcode, Op0, Op1, Q, MaxRecurse)) return V; // If the operation is with the result of a phi instruction, check whether // operating on all incoming values of the phi always yields the same value. if (isa<PHINode>(Op0) || isa<PHINode>(Op1)) - if (Value *V = ThreadBinOpOverPHI(Opcode, Op0, Op1, TD, TLI, DT, MaxRecurse)) + if (Value *V = ThreadBinOpOverPHI(Opcode, Op0, Op1, Q, MaxRecurse)) return V; return 0; @@ -1153,10 +1231,8 @@ static Value *SimplifyShift(unsigned Opcode, Value *Op0, Value *Op1, /// SimplifyShlInst - Given operands for an Shl, see if we can /// fold the result. If not, this returns null. static Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, - const TargetData *TD, - const TargetLibraryInfo *TLI, - const DominatorTree *DT, unsigned MaxRecurse) { - if (Value *V = SimplifyShift(Instruction::Shl, Op0, Op1, TD, TLI, DT, MaxRecurse)) + const Query &Q, unsigned MaxRecurse) { + if (Value *V = SimplifyShift(Instruction::Shl, Op0, Op1, Q, MaxRecurse)) return V; // undef << X -> 0 @@ -1173,17 +1249,15 @@ static Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, Value *llvm::SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, const TargetData *TD, const TargetLibraryInfo *TLI, const DominatorTree *DT) { - return ::SimplifyShlInst(Op0, Op1, isNSW, isNUW, TD, TLI, DT, RecursionLimit); + return ::SimplifyShlInst(Op0, Op1, isNSW, isNUW, Query (TD, TLI, DT), + RecursionLimit); } /// SimplifyLShrInst - Given operands for an LShr, see if we can /// fold the result. If not, this returns null. static Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact, - const TargetData *TD, - const TargetLibraryInfo *TLI, - const DominatorTree *DT, - unsigned MaxRecurse) { - if (Value *V = SimplifyShift(Instruction::LShr, Op0, Op1, TD, TLI, DT, MaxRecurse)) + const Query &Q, unsigned MaxRecurse) { + if (Value *V = SimplifyShift(Instruction::LShr, Op0, Op1, Q, MaxRecurse)) return V; // undef >>l X -> 0 @@ -1203,17 +1277,15 @@ Value *llvm::SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact, const TargetData *TD, const TargetLibraryInfo *TLI, const DominatorTree *DT) { - return ::SimplifyLShrInst(Op0, Op1, isExact, TD, TLI, DT, RecursionLimit); + return ::SimplifyLShrInst(Op0, Op1, isExact, Query (TD, TLI, DT), + RecursionLimit); } /// SimplifyAShrInst - Given operands for an AShr, see if we can /// fold the result. If not, this returns null. static Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact, - const TargetData *TD, - const TargetLibraryInfo *TLI, - const DominatorTree *DT, - unsigned MaxRecurse) { - if (Value *V = SimplifyShift(Instruction::AShr, Op0, Op1, TD, TLI, DT, MaxRecurse)) + const Query &Q, unsigned MaxRecurse) { + if (Value *V = SimplifyShift(Instruction::AShr, Op0, Op1, Q, MaxRecurse)) return V; // all ones >>a X -> all ones @@ -1237,20 +1309,19 @@ Value *llvm::SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact, const TargetData *TD, const TargetLibraryInfo *TLI, const DominatorTree *DT) { - return ::SimplifyAShrInst(Op0, Op1, isExact, TD, TLI, DT, RecursionLimit); + return ::SimplifyAShrInst(Op0, Op1, isExact, Query (TD, TLI, DT), + RecursionLimit); } /// SimplifyAndInst - Given operands for an And, see if we can /// fold the result. If not, this returns null. 
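// Among the identities the shift hunks above carry over is
// "all ones >>a X -> all ones".  A quick 8-bit check (illustrative only,
// assuming an in-range shift amount):
//
//   -1        = 0b11111111
//   -1 ashr 3 = 0b11111111   // arithmetic shift copies the sign bit back in
//
// so the result does not depend on the shift amount and SimplifyAShrInst can
// return the all-ones constant without inspecting Op1.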
-static Value *SimplifyAndInst(Value *Op0, Value *Op1, const TargetData *TD, - const TargetLibraryInfo *TLI, - const DominatorTree *DT, +static Value *SimplifyAndInst(Value *Op0, Value *Op1, const Query &Q, unsigned MaxRecurse) { if (Constant *CLHS = dyn_cast<Constant>(Op0)) { if (Constant *CRHS = dyn_cast<Constant>(Op1)) { Constant *Ops[] = { CLHS, CRHS }; return ConstantFoldInstOperands(Instruction::And, CLHS->getType(), - Ops, TD, TLI); + Ops, Q.TD, Q.TLI); } // Canonicalize the constant to the RHS. @@ -1292,43 +1363,43 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const TargetData *TD, // A & (-A) = A if A is a power of two or zero. if (match(Op0, m_Neg(m_Specific(Op1))) || match(Op1, m_Neg(m_Specific(Op0)))) { - if (isPowerOfTwo(Op0, TD, /*OrZero*/true)) + if (isPowerOfTwo(Op0, Q.TD, /*OrZero*/true)) return Op0; - if (isPowerOfTwo(Op1, TD, /*OrZero*/true)) + if (isPowerOfTwo(Op1, Q.TD, /*OrZero*/true)) return Op1; } // Try some generic simplifications for associative operations. - if (Value *V = SimplifyAssociativeBinOp(Instruction::And, Op0, Op1, TD, TLI, - DT, MaxRecurse)) + if (Value *V = SimplifyAssociativeBinOp(Instruction::And, Op0, Op1, Q, + MaxRecurse)) return V; // And distributes over Or. Try some generic simplifications based on this. if (Value *V = ExpandBinOp(Instruction::And, Op0, Op1, Instruction::Or, - TD, TLI, DT, MaxRecurse)) + Q, MaxRecurse)) return V; // And distributes over Xor. Try some generic simplifications based on this. if (Value *V = ExpandBinOp(Instruction::And, Op0, Op1, Instruction::Xor, - TD, TLI, DT, MaxRecurse)) + Q, MaxRecurse)) return V; // Or distributes over And. Try some generic simplifications based on this. if (Value *V = FactorizeBinOp(Instruction::And, Op0, Op1, Instruction::Or, - TD, TLI, DT, MaxRecurse)) + Q, MaxRecurse)) return V; // If the operation is with the result of a select instruction, check whether // operating on either branch of the select always yields the same value. if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1)) - if (Value *V = ThreadBinOpOverSelect(Instruction::And, Op0, Op1, TD, TLI, - DT, MaxRecurse)) + if (Value *V = ThreadBinOpOverSelect(Instruction::And, Op0, Op1, Q, + MaxRecurse)) return V; // If the operation is with the result of a phi instruction, check whether // operating on all incoming values of the phi always yields the same value. if (isa<PHINode>(Op0) || isa<PHINode>(Op1)) - if (Value *V = ThreadBinOpOverPHI(Instruction::And, Op0, Op1, TD, TLI, DT, + if (Value *V = ThreadBinOpOverPHI(Instruction::And, Op0, Op1, Q, MaxRecurse)) return V; @@ -1338,19 +1409,18 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const TargetData *TD, Value *llvm::SimplifyAndInst(Value *Op0, Value *Op1, const TargetData *TD, const TargetLibraryInfo *TLI, const DominatorTree *DT) { - return ::SimplifyAndInst(Op0, Op1, TD, TLI, DT, RecursionLimit); + return ::SimplifyAndInst(Op0, Op1, Query (TD, TLI, DT), RecursionLimit); } /// SimplifyOrInst - Given operands for an Or, see if we can /// fold the result. If not, this returns null. 
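// One fold preserved through the SimplifyAndInst rewrite above is
// "A & (-A) = A if A is a power of two or zero", now proved via
// isPowerOfTwo(Op0, Q.TD, /*OrZero*/true).  A quick arithmetic check of why
// it holds (illustrative only):
//
//   A  =  8 = 0b00001000
//   -A = -8 = 0b11111000   // two's complement
//   A & -A  = 0b00001000 = A
//
// Negating a power of two keeps its single set bit and sets every bit above
// it, so the AND returns A; for A == 0 both sides are zero.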
-static Value *SimplifyOrInst(Value *Op0, Value *Op1, const TargetData *TD, - const TargetLibraryInfo *TLI, - const DominatorTree *DT, unsigned MaxRecurse) { +static Value *SimplifyOrInst(Value *Op0, Value *Op1, const Query &Q, + unsigned MaxRecurse) { if (Constant *CLHS = dyn_cast<Constant>(Op0)) { if (Constant *CRHS = dyn_cast<Constant>(Op1)) { Constant *Ops[] = { CLHS, CRHS }; return ConstantFoldInstOperands(Instruction::Or, CLHS->getType(), - Ops, TD, TLI); + Ops, Q.TD, Q.TLI); } // Canonicalize the constant to the RHS. @@ -1400,32 +1470,31 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const TargetData *TD, return Constant::getAllOnesValue(Op0->getType()); // Try some generic simplifications for associative operations. - if (Value *V = SimplifyAssociativeBinOp(Instruction::Or, Op0, Op1, TD, TLI, - DT, MaxRecurse)) + if (Value *V = SimplifyAssociativeBinOp(Instruction::Or, Op0, Op1, Q, + MaxRecurse)) return V; // Or distributes over And. Try some generic simplifications based on this. - if (Value *V = ExpandBinOp(Instruction::Or, Op0, Op1, Instruction::And, TD, - TLI, DT, MaxRecurse)) + if (Value *V = ExpandBinOp(Instruction::Or, Op0, Op1, Instruction::And, Q, + MaxRecurse)) return V; // And distributes over Or. Try some generic simplifications based on this. if (Value *V = FactorizeBinOp(Instruction::Or, Op0, Op1, Instruction::And, - TD, TLI, DT, MaxRecurse)) + Q, MaxRecurse)) return V; // If the operation is with the result of a select instruction, check whether // operating on either branch of the select always yields the same value. if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1)) - if (Value *V = ThreadBinOpOverSelect(Instruction::Or, Op0, Op1, TD, TLI, DT, + if (Value *V = ThreadBinOpOverSelect(Instruction::Or, Op0, Op1, Q, MaxRecurse)) return V; // If the operation is with the result of a phi instruction, check whether // operating on all incoming values of the phi always yields the same value. if (isa<PHINode>(Op0) || isa<PHINode>(Op1)) - if (Value *V = ThreadBinOpOverPHI(Instruction::Or, Op0, Op1, TD, TLI, DT, - MaxRecurse)) + if (Value *V = ThreadBinOpOverPHI(Instruction::Or, Op0, Op1, Q, MaxRecurse)) return V; return 0; @@ -1434,19 +1503,18 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const TargetData *TD, Value *llvm::SimplifyOrInst(Value *Op0, Value *Op1, const TargetData *TD, const TargetLibraryInfo *TLI, const DominatorTree *DT) { - return ::SimplifyOrInst(Op0, Op1, TD, TLI, DT, RecursionLimit); + return ::SimplifyOrInst(Op0, Op1, Query (TD, TLI, DT), RecursionLimit); } /// SimplifyXorInst - Given operands for a Xor, see if we can /// fold the result. If not, this returns null. -static Value *SimplifyXorInst(Value *Op0, Value *Op1, const TargetData *TD, - const TargetLibraryInfo *TLI, - const DominatorTree *DT, unsigned MaxRecurse) { +static Value *SimplifyXorInst(Value *Op0, Value *Op1, const Query &Q, + unsigned MaxRecurse) { if (Constant *CLHS = dyn_cast<Constant>(Op0)) { if (Constant *CRHS = dyn_cast<Constant>(Op1)) { Constant *Ops[] = { CLHS, CRHS }; return ConstantFoldInstOperands(Instruction::Xor, CLHS->getType(), - Ops, TD, TLI); + Ops, Q.TD, Q.TLI); } // Canonicalize the constant to the RHS. @@ -1471,13 +1539,13 @@ static Value *SimplifyXorInst(Value *Op0, Value *Op1, const TargetData *TD, return Constant::getAllOnesValue(Op0->getType()); // Try some generic simplifications for associative operations. 
- if (Value *V = SimplifyAssociativeBinOp(Instruction::Xor, Op0, Op1, TD, TLI, - DT, MaxRecurse)) + if (Value *V = SimplifyAssociativeBinOp(Instruction::Xor, Op0, Op1, Q, + MaxRecurse)) return V; // And distributes over Xor. Try some generic simplifications based on this. if (Value *V = FactorizeBinOp(Instruction::Xor, Op0, Op1, Instruction::And, - TD, TLI, DT, MaxRecurse)) + Q, MaxRecurse)) return V; // Threading Xor over selects and phi nodes is pointless, so don't bother. @@ -1495,7 +1563,7 @@ static Value *SimplifyXorInst(Value *Op0, Value *Op1, const TargetData *TD, Value *llvm::SimplifyXorInst(Value *Op0, Value *Op1, const TargetData *TD, const TargetLibraryInfo *TLI, const DominatorTree *DT) { - return ::SimplifyXorInst(Op0, Op1, TD, TLI, DT, RecursionLimit); + return ::SimplifyXorInst(Op0, Op1, Query (TD, TLI, DT), RecursionLimit); } static Type *GetCompareTy(Value *Op) { @@ -1522,42 +1590,17 @@ static Value *ExtractEquivalentCondition(Value *V, CmpInst::Predicate Pred, return 0; } -/// stripPointerAdjustments - This is like Value::stripPointerCasts, but also -/// removes inbounds gep operations, regardless of their indices. -static Value *stripPointerAdjustmentsImpl(Value *V, - SmallPtrSet<GEPOperator*, 8> &VisitedGEPs) { - GEPOperator *GEP = dyn_cast<GEPOperator>(V); - if (GEP == 0 || !GEP->isInBounds()) - return V; - - // If we've already seen this GEP, we will end up infinitely looping. This - // can happen in unreachable code. - if (!VisitedGEPs.insert(GEP)) - return V; - - return stripPointerAdjustmentsImpl(GEP->getOperand(0)->stripPointerCasts(), - VisitedGEPs); -} - -static Value *stripPointerAdjustments(Value *V) { - SmallPtrSet<GEPOperator*, 8> VisitedGEPs; - return stripPointerAdjustmentsImpl(V, VisitedGEPs); -} - /// SimplifyICmpInst - Given operands for an ICmpInst, see if we can /// fold the result. If not, this returns null. static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, - const TargetData *TD, - const TargetLibraryInfo *TLI, - const DominatorTree *DT, - unsigned MaxRecurse) { + const Query &Q, unsigned MaxRecurse) { CmpInst::Predicate Pred = (CmpInst::Predicate)Predicate; assert(CmpInst::isIntPredicate(Pred) && "Not an integer compare!"); if (Constant *CLHS = dyn_cast<Constant>(LHS)) { if (Constant *CRHS = dyn_cast<Constant>(RHS)) - return ConstantFoldCompareInstOperands(Pred, CLHS, CRHS, TD, TLI); + return ConstantFoldCompareInstOperands(Pred, CLHS, CRHS, Q.TD, Q.TLI); // If we have a constant, make sure it is on the RHS. std::swap(LHS, RHS); @@ -1625,29 +1668,39 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, // Be more aggressive about stripping pointer adjustments when checking a // comparison of an alloca address to another object. We can rip off all // inbounds GEP operations, even if they are variable. - LHSPtr = stripPointerAdjustments(LHSPtr); + LHSPtr = LHSPtr->stripInBoundsOffsets(); if (llvm::isIdentifiedObject(LHSPtr)) { - RHSPtr = stripPointerAdjustments(RHSPtr); + RHSPtr = RHSPtr->stripInBoundsOffsets(); if (llvm::isKnownNonNull(LHSPtr) || llvm::isKnownNonNull(RHSPtr)) { // If both sides are different identified objects, they aren't equal // unless they're null. 
- if (LHSPtr != RHSPtr && llvm::isIdentifiedObject(RHSPtr)) - return ConstantInt::get(ITy, CmpInst::isFalseWhenEqual(Pred)); + if (LHSPtr != RHSPtr && llvm::isIdentifiedObject(RHSPtr) && + Pred == CmpInst::ICMP_EQ) + return ConstantInt::get(ITy, false); // A local identified object (alloca or noalias call) can't equal any // incoming argument, unless they're both null. - if (isa<Instruction>(LHSPtr) && isa<Argument>(RHSPtr)) - return ConstantInt::get(ITy, CmpInst::isFalseWhenEqual(Pred)); + if (isa<Instruction>(LHSPtr) && isa<Argument>(RHSPtr) && + Pred == CmpInst::ICMP_EQ) + return ConstantInt::get(ITy, false); } // Assume that the constant null is on the right. - if (llvm::isKnownNonNull(LHSPtr) && isa<ConstantPointerNull>(RHSPtr)) - return ConstantInt::get(ITy, CmpInst::isFalseWhenEqual(Pred)); + if (llvm::isKnownNonNull(LHSPtr) && isa<ConstantPointerNull>(RHSPtr)) { + if (Pred == CmpInst::ICMP_EQ) + return ConstantInt::get(ITy, false); + else if (Pred == CmpInst::ICMP_NE) + return ConstantInt::get(ITy, true); + } } else if (isa<Argument>(LHSPtr)) { - RHSPtr = stripPointerAdjustments(RHSPtr); + RHSPtr = RHSPtr->stripInBoundsOffsets(); // An alloca can't be equal to an argument. - if (isa<AllocaInst>(RHSPtr)) - return ConstantInt::get(ITy, CmpInst::isFalseWhenEqual(Pred)); + if (isa<AllocaInst>(RHSPtr)) { + if (Pred == CmpInst::ICMP_EQ) + return ConstantInt::get(ITy, false); + else if (Pred == CmpInst::ICMP_NE) + return ConstantInt::get(ITy, true); + } } // If we are comparing with zero then try hard since this is a common case. @@ -1661,40 +1714,40 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, return getTrue(ITy); case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_ULE: - if (isKnownNonZero(LHS, TD)) + if (isKnownNonZero(LHS, Q.TD)) return getFalse(ITy); break; case ICmpInst::ICMP_NE: case ICmpInst::ICMP_UGT: - if (isKnownNonZero(LHS, TD)) + if (isKnownNonZero(LHS, Q.TD)) return getTrue(ITy); break; case ICmpInst::ICMP_SLT: - ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, TD); + ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.TD); if (LHSKnownNegative) return getTrue(ITy); if (LHSKnownNonNegative) return getFalse(ITy); break; case ICmpInst::ICMP_SLE: - ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, TD); + ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.TD); if (LHSKnownNegative) return getTrue(ITy); - if (LHSKnownNonNegative && isKnownNonZero(LHS, TD)) + if (LHSKnownNonNegative && isKnownNonZero(LHS, Q.TD)) return getFalse(ITy); break; case ICmpInst::ICMP_SGE: - ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, TD); + ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.TD); if (LHSKnownNegative) return getFalse(ITy); if (LHSKnownNonNegative) return getTrue(ITy); break; case ICmpInst::ICMP_SGT: - ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, TD); + ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.TD); if (LHSKnownNegative) return getFalse(ITy); - if (LHSKnownNonNegative && isKnownNonZero(LHS, TD)) + if (LHSKnownNonNegative && isKnownNonZero(LHS, Q.TD)) return getTrue(ITy); break; } @@ -1777,19 +1830,19 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, // Turn icmp (ptrtoint x), (ptrtoint/constant) into a compare of the input // if the integer type is the same size as the pointer type. 
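// The pointer-comparison hunks above deliberately narrow several folds from
// "whatever CmpInst::isFalseWhenEqual(Pred) says" to explicit ICMP_EQ /
// ICMP_NE handling.  The motivation, as read from the hunks (illustrative
// example, not text from the patch): knowing that two pointers are distinct
// identified objects answers equality questions only, e.g.
//
//   %a = alloca i32
//   %c = icmp ult i32* %a, %arg
//
// may not be folded just because %a and %arg are different objects, whereas
// the equality forms are still safe:
//
//   icmp eq i32* %a, %arg   ; -> false
//   icmp ne i32* %a, %arg   ; -> true (where the hunks handle NE)
//
// The old isFalseWhenEqual formulation would have handed the relational
// forms a definite answer as well, which distinctness alone does not justify.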
- if (MaxRecurse && TD && isa<PtrToIntInst>(LI) && - TD->getPointerSizeInBits() == DstTy->getPrimitiveSizeInBits()) { + if (MaxRecurse && Q.TD && isa<PtrToIntInst>(LI) && + Q.TD->getPointerSizeInBits() == DstTy->getPrimitiveSizeInBits()) { if (Constant *RHSC = dyn_cast<Constant>(RHS)) { // Transfer the cast to the constant. if (Value *V = SimplifyICmpInst(Pred, SrcOp, ConstantExpr::getIntToPtr(RHSC, SrcTy), - TD, TLI, DT, MaxRecurse-1)) + Q, MaxRecurse-1)) return V; } else if (PtrToIntInst *RI = dyn_cast<PtrToIntInst>(RHS)) { if (RI->getOperand(0)->getType() == SrcTy) // Compare without the cast. if (Value *V = SimplifyICmpInst(Pred, SrcOp, RI->getOperand(0), - TD, TLI, DT, MaxRecurse-1)) + Q, MaxRecurse-1)) return V; } } @@ -1801,7 +1854,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, if (MaxRecurse && SrcTy == RI->getOperand(0)->getType()) // Compare X and Y. Note that signed predicates become unsigned. if (Value *V = SimplifyICmpInst(ICmpInst::getUnsignedPredicate(Pred), - SrcOp, RI->getOperand(0), TD, TLI, DT, + SrcOp, RI->getOperand(0), Q, MaxRecurse-1)) return V; } @@ -1817,7 +1870,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, // also a case of comparing two zero-extended values. if (RExt == CI && MaxRecurse) if (Value *V = SimplifyICmpInst(ICmpInst::getUnsignedPredicate(Pred), - SrcOp, Trunc, TD, TLI, DT, MaxRecurse-1)) + SrcOp, Trunc, Q, MaxRecurse-1)) return V; // Otherwise the upper bits of LHS are zero while RHS has a non-zero bit @@ -1861,7 +1914,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, if (MaxRecurse && SrcTy == RI->getOperand(0)->getType()) // Compare X and Y. Note that the predicate does not change. if (Value *V = SimplifyICmpInst(Pred, SrcOp, RI->getOperand(0), - TD, TLI, DT, MaxRecurse-1)) + Q, MaxRecurse-1)) return V; } // Turn icmp (sext X), Cst into a compare of X and Cst if Cst is extended @@ -1875,8 +1928,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, // If the re-extended constant didn't change then this is effectively // also a case of comparing two sign-extended values. if (RExt == CI && MaxRecurse) - if (Value *V = SimplifyICmpInst(Pred, SrcOp, Trunc, TD, TLI, DT, - MaxRecurse-1)) + if (Value *V = SimplifyICmpInst(Pred, SrcOp, Trunc, Q, MaxRecurse-1)) return V; // Otherwise the upper bits of LHS are all equal, while RHS has varying @@ -1910,7 +1962,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, if (MaxRecurse) if (Value *V = SimplifyICmpInst(ICmpInst::ICMP_SLT, SrcOp, Constant::getNullValue(SrcTy), - TD, TLI, DT, MaxRecurse-1)) + Q, MaxRecurse-1)) return V; break; case ICmpInst::ICMP_ULT: @@ -1919,7 +1971,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, if (MaxRecurse) if (Value *V = SimplifyICmpInst(ICmpInst::ICMP_SGE, SrcOp, Constant::getNullValue(SrcTy), - TD, TLI, DT, MaxRecurse-1)) + Q, MaxRecurse-1)) return V; break; } @@ -1953,14 +2005,14 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, if ((A == RHS || B == RHS) && NoLHSWrapProblem) if (Value *V = SimplifyICmpInst(Pred, A == RHS ? B : A, Constant::getNullValue(RHS->getType()), - TD, TLI, DT, MaxRecurse-1)) + Q, MaxRecurse-1)) return V; // icmp X, (X+Y) -> icmp 0, Y for equalities or if there is no overflow. if ((C == LHS || D == LHS) && NoRHSWrapProblem) if (Value *V = SimplifyICmpInst(Pred, Constant::getNullValue(LHS->getType()), - C == LHS ? 
D : C, TD, TLI, DT, MaxRecurse-1)) + C == LHS ? D : C, Q, MaxRecurse-1)) return V; // icmp (X+Y), (X+Z) -> icmp Y,Z for equalities or if there is no overflow. @@ -1969,7 +2021,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, // Determine Y and Z in the form icmp (X+Y), (X+Z). Value *Y = (A == C || A == D) ? B : A; Value *Z = (C == A || C == B) ? D : C; - if (Value *V = SimplifyICmpInst(Pred, Y, Z, TD, TLI, DT, MaxRecurse-1)) + if (Value *V = SimplifyICmpInst(Pred, Y, Z, Q, MaxRecurse-1)) return V; } } @@ -1981,7 +2033,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, break; case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_SGE: - ComputeSignBit(LHS, KnownNonNegative, KnownNegative, TD); + ComputeSignBit(LHS, KnownNonNegative, KnownNegative, Q.TD); if (!KnownNonNegative) break; // fall-through @@ -1991,7 +2043,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, return getFalse(ITy); case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: - ComputeSignBit(LHS, KnownNonNegative, KnownNegative, TD); + ComputeSignBit(LHS, KnownNonNegative, KnownNegative, Q.TD); if (!KnownNonNegative) break; // fall-through @@ -2008,7 +2060,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, break; case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_SGE: - ComputeSignBit(RHS, KnownNonNegative, KnownNegative, TD); + ComputeSignBit(RHS, KnownNonNegative, KnownNegative, Q.TD); if (!KnownNonNegative) break; // fall-through @@ -2018,7 +2070,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, return getTrue(ITy); case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: - ComputeSignBit(RHS, KnownNonNegative, KnownNegative, TD); + ComputeSignBit(RHS, KnownNonNegative, KnownNegative, Q.TD); if (!KnownNonNegative) break; // fall-through @@ -2052,7 +2104,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, if (!LBO->isExact() || !RBO->isExact()) break; if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0), - RBO->getOperand(0), TD, TLI, DT, MaxRecurse-1)) + RBO->getOperand(0), Q, MaxRecurse-1)) return V; break; case Instruction::Shl: { @@ -2063,7 +2115,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, if (!NSW && ICmpInst::isSigned(Pred)) break; if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0), - RBO->getOperand(0), TD, TLI, DT, MaxRecurse-1)) + RBO->getOperand(0), Q, MaxRecurse-1)) return V; break; } @@ -2117,7 +2169,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, return V; // Otherwise, see if "A EqP B" simplifies. if (MaxRecurse) - if (Value *V = SimplifyICmpInst(EqP, A, B, TD, TLI, DT, MaxRecurse-1)) + if (Value *V = SimplifyICmpInst(EqP, A, B, Q, MaxRecurse-1)) return V; break; case CmpInst::ICMP_NE: @@ -2131,7 +2183,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, return V; // Otherwise, see if "A InvEqP B" simplifies. if (MaxRecurse) - if (Value *V = SimplifyICmpInst(InvEqP, A, B, TD, TLI, DT, MaxRecurse-1)) + if (Value *V = SimplifyICmpInst(InvEqP, A, B, Q, MaxRecurse-1)) return V; break; } @@ -2187,7 +2239,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, return V; // Otherwise, see if "A EqP B" simplifies. 
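// Among the compare folds threaded through this hunk is
// "icmp (X+Y), (X+Z) -> icmp Y, Z", valid for equality predicates or when
// neither add can wrap (the NoLHSWrapProblem / NoRHSWrapProblem guards
// above).  A small numeric sanity check (illustrative only):
//
//   X = 100, Y = 3, Z = 7, no overflow:
//   icmp slt (X+Y), (X+Z)  =  icmp slt 103, 107  =  true
//   icmp slt Y, Z          =  icmp slt 3, 7      =  true
//
// With a wrapping add the shared X could flip the ordering, which is why the
// recursion into SimplifyICmpInst(Pred, Y, Z, Q, MaxRecurse-1) is gated on
// those no-wrap facts for non-equality predicates.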
if (MaxRecurse) - if (Value *V = SimplifyICmpInst(EqP, A, B, TD, TLI, DT, MaxRecurse-1)) + if (Value *V = SimplifyICmpInst(EqP, A, B, Q, MaxRecurse-1)) return V; break; case CmpInst::ICMP_NE: @@ -2201,7 +2253,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, return V; // Otherwise, see if "A InvEqP B" simplifies. if (MaxRecurse) - if (Value *V = SimplifyICmpInst(InvEqP, A, B, TD, TLI, DT, MaxRecurse-1)) + if (Value *V = SimplifyICmpInst(InvEqP, A, B, Q, MaxRecurse-1)) return V; break; } @@ -2283,13 +2335,13 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, // If the comparison is with the result of a select instruction, check whether // comparing with either branch of the select always yields the same value. if (isa<SelectInst>(LHS) || isa<SelectInst>(RHS)) - if (Value *V = ThreadCmpOverSelect(Pred, LHS, RHS, TD, TLI, DT, MaxRecurse)) + if (Value *V = ThreadCmpOverSelect(Pred, LHS, RHS, Q, MaxRecurse)) return V; // If the comparison is with the result of a phi instruction, check whether // doing the compare with each incoming phi value yields a common result. if (isa<PHINode>(LHS) || isa<PHINode>(RHS)) - if (Value *V = ThreadCmpOverPHI(Pred, LHS, RHS, TD, TLI, DT, MaxRecurse)) + if (Value *V = ThreadCmpOverPHI(Pred, LHS, RHS, Q, MaxRecurse)) return V; return 0; @@ -2299,22 +2351,20 @@ Value *llvm::SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, const TargetData *TD, const TargetLibraryInfo *TLI, const DominatorTree *DT) { - return ::SimplifyICmpInst(Predicate, LHS, RHS, TD, TLI, DT, RecursionLimit); + return ::SimplifyICmpInst(Predicate, LHS, RHS, Query (TD, TLI, DT), + RecursionLimit); } /// SimplifyFCmpInst - Given operands for an FCmpInst, see if we can /// fold the result. If not, this returns null. static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, - const TargetData *TD, - const TargetLibraryInfo *TLI, - const DominatorTree *DT, - unsigned MaxRecurse) { + const Query &Q, unsigned MaxRecurse) { CmpInst::Predicate Pred = (CmpInst::Predicate)Predicate; assert(CmpInst::isFPPredicate(Pred) && "Not an FP compare!"); if (Constant *CLHS = dyn_cast<Constant>(LHS)) { if (Constant *CRHS = dyn_cast<Constant>(RHS)) - return ConstantFoldCompareInstOperands(Pred, CLHS, CRHS, TD, TLI); + return ConstantFoldCompareInstOperands(Pred, CLHS, CRHS, Q.TD, Q.TLI); // If we have a constant, make sure it is on the RHS. std::swap(LHS, RHS); @@ -2382,13 +2432,13 @@ static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, // If the comparison is with the result of a select instruction, check whether // comparing with either branch of the select always yields the same value. if (isa<SelectInst>(LHS) || isa<SelectInst>(RHS)) - if (Value *V = ThreadCmpOverSelect(Pred, LHS, RHS, TD, TLI, DT, MaxRecurse)) + if (Value *V = ThreadCmpOverSelect(Pred, LHS, RHS, Q, MaxRecurse)) return V; // If the comparison is with the result of a phi instruction, check whether // doing the compare with each incoming phi value yields a common result. 
if (isa<PHINode>(LHS) || isa<PHINode>(RHS)) - if (Value *V = ThreadCmpOverPHI(Pred, LHS, RHS, TD, TLI, DT, MaxRecurse)) + if (Value *V = ThreadCmpOverPHI(Pred, LHS, RHS, Q, MaxRecurse)) return V; return 0; @@ -2398,13 +2448,15 @@ Value *llvm::SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, const TargetData *TD, const TargetLibraryInfo *TLI, const DominatorTree *DT) { - return ::SimplifyFCmpInst(Predicate, LHS, RHS, TD, TLI, DT, RecursionLimit); + return ::SimplifyFCmpInst(Predicate, LHS, RHS, Query (TD, TLI, DT), + RecursionLimit); } /// SimplifySelectInst - Given operands for a SelectInst, see if we can fold /// the result. If not, this returns null. -Value *llvm::SimplifySelectInst(Value *CondVal, Value *TrueVal, Value *FalseVal, - const TargetData *TD, const DominatorTree *) { +static Value *SimplifySelectInst(Value *CondVal, Value *TrueVal, + Value *FalseVal, const Query &Q, + unsigned MaxRecurse) { // select true, X, Y -> X // select false, X, Y -> Y if (ConstantInt *CB = dyn_cast<ConstantInt>(CondVal)) @@ -2427,10 +2479,17 @@ Value *llvm::SimplifySelectInst(Value *CondVal, Value *TrueVal, Value *FalseVal, return 0; } +Value *llvm::SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, + const TargetData *TD, + const TargetLibraryInfo *TLI, + const DominatorTree *DT) { + return ::SimplifySelectInst(Cond, TrueVal, FalseVal, Query (TD, TLI, DT), + RecursionLimit); +} + /// SimplifyGEPInst - Given operands for an GetElementPtrInst, see if we can /// fold the result. If not, this returns null. -Value *llvm::SimplifyGEPInst(ArrayRef<Value *> Ops, const TargetData *TD, - const DominatorTree *) { +static Value *SimplifyGEPInst(ArrayRef<Value *> Ops, const Query &Q, unsigned) { // The type of the GEP pointer operand. PointerType *PtrTy = dyn_cast<PointerType>(Ops[0]->getType()); // The GEP pointer operand is not a pointer, it's a vector of pointers. @@ -2454,9 +2513,9 @@ Value *llvm::SimplifyGEPInst(ArrayRef<Value *> Ops, const TargetData *TD, if (C->isZero()) return Ops[0]; // getelementptr P, N -> P if P points to a type of zero size. - if (TD) { + if (Q.TD) { Type *Ty = PtrTy->getElementType(); - if (Ty->isSized() && TD->getTypeAllocSize(Ty) == 0) + if (Ty->isSized() && Q.TD->getTypeAllocSize(Ty) == 0) return Ops[0]; } } @@ -2469,12 +2528,17 @@ Value *llvm::SimplifyGEPInst(ArrayRef<Value *> Ops, const TargetData *TD, return ConstantExpr::getGetElementPtr(cast<Constant>(Ops[0]), Ops.slice(1)); } +Value *llvm::SimplifyGEPInst(ArrayRef<Value *> Ops, const TargetData *TD, + const TargetLibraryInfo *TLI, + const DominatorTree *DT) { + return ::SimplifyGEPInst(Ops, Query (TD, TLI, DT), RecursionLimit); +} + /// SimplifyInsertValueInst - Given operands for an InsertValueInst, see if we /// can fold the result. If not, this returns null. 
-Value *llvm::SimplifyInsertValueInst(Value *Agg, Value *Val, - ArrayRef<unsigned> Idxs, - const TargetData *, - const DominatorTree *) { +static Value *SimplifyInsertValueInst(Value *Agg, Value *Val, + ArrayRef<unsigned> Idxs, const Query &Q, + unsigned) { if (Constant *CAgg = dyn_cast<Constant>(Agg)) if (Constant *CVal = dyn_cast<Constant>(Val)) return ConstantFoldInsertValueInstruction(CAgg, CVal, Idxs); @@ -2499,8 +2563,17 @@ Value *llvm::SimplifyInsertValueInst(Value *Agg, Value *Val, return 0; } +Value *llvm::SimplifyInsertValueInst(Value *Agg, Value *Val, + ArrayRef<unsigned> Idxs, + const TargetData *TD, + const TargetLibraryInfo *TLI, + const DominatorTree *DT) { + return ::SimplifyInsertValueInst(Agg, Val, Idxs, Query (TD, TLI, DT), + RecursionLimit); +} + /// SimplifyPHINode - See if we can fold the given phi. If not, returns null. -static Value *SimplifyPHINode(PHINode *PN, const DominatorTree *DT) { +static Value *SimplifyPHINode(PHINode *PN, const Query &Q) { // If all of the PHI's incoming values are the same then replace the PHI node // with the common value. Value *CommonValue = 0; @@ -2528,81 +2601,77 @@ static Value *SimplifyPHINode(PHINode *PN, const DominatorTree *DT) { // instruction, we cannot return X as the result of the PHI node unless it // dominates the PHI block. if (HasUndefInput) - return ValueDominatesPHI(CommonValue, PN, DT) ? CommonValue : 0; + return ValueDominatesPHI(CommonValue, PN, Q.DT) ? CommonValue : 0; return CommonValue; } +static Value *SimplifyTruncInst(Value *Op, Type *Ty, const Query &Q, unsigned) { + if (Constant *C = dyn_cast<Constant>(Op)) + return ConstantFoldInstOperands(Instruction::Trunc, Ty, C, Q.TD, Q.TLI); + + return 0; +} + +Value *llvm::SimplifyTruncInst(Value *Op, Type *Ty, const TargetData *TD, + const TargetLibraryInfo *TLI, + const DominatorTree *DT) { + return ::SimplifyTruncInst(Op, Ty, Query (TD, TLI, DT), RecursionLimit); +} + //=== Helper functions for higher up the class hierarchy. /// SimplifyBinOp - Given operands for a BinaryOperator, see if we can /// fold the result. If not, this returns null. 
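// SimplifyTruncInst, introduced just above, only constant-folds for now, but
// it is what finishes the "trunc(X) - trunc(Y) -> trunc(X - Y)" fold added to
// SimplifySubInst earlier in this patch.  A hedged caller-side sketch using
// the public wrapper whose signature appears in this hunk (the helper below
// is hypothetical, not part of the patch, and assumes the usual headers for
// TruncInst are already included):
static bool replaceIfTruncFolds(TruncInst *TI, const TargetData *TD,
                                const TargetLibraryInfo *TLI,
                                const DominatorTree *DT) {
  if (Value *V = llvm::SimplifyTruncInst(TI->getOperand(0), TI->getType(),
                                         TD, TLI, DT)) {
    TI->replaceAllUsesWith(V);  // TI is now dead; callers may erase it.
    return true;
  }
  return false;
}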
static Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, - const TargetData *TD, - const TargetLibraryInfo *TLI, - const DominatorTree *DT, - unsigned MaxRecurse) { + const Query &Q, unsigned MaxRecurse) { switch (Opcode) { case Instruction::Add: return SimplifyAddInst(LHS, RHS, /*isNSW*/false, /*isNUW*/false, - TD, TLI, DT, MaxRecurse); + Q, MaxRecurse); case Instruction::Sub: return SimplifySubInst(LHS, RHS, /*isNSW*/false, /*isNUW*/false, - TD, TLI, DT, MaxRecurse); - case Instruction::Mul: return SimplifyMulInst (LHS, RHS, TD, TLI, DT, - MaxRecurse); - case Instruction::SDiv: return SimplifySDivInst(LHS, RHS, TD, TLI, DT, - MaxRecurse); - case Instruction::UDiv: return SimplifyUDivInst(LHS, RHS, TD, TLI, DT, - MaxRecurse); - case Instruction::FDiv: return SimplifyFDivInst(LHS, RHS, TD, TLI, DT, - MaxRecurse); - case Instruction::SRem: return SimplifySRemInst(LHS, RHS, TD, TLI, DT, - MaxRecurse); - case Instruction::URem: return SimplifyURemInst(LHS, RHS, TD, TLI, DT, - MaxRecurse); - case Instruction::FRem: return SimplifyFRemInst(LHS, RHS, TD, TLI, DT, - MaxRecurse); + Q, MaxRecurse); + case Instruction::Mul: return SimplifyMulInst (LHS, RHS, Q, MaxRecurse); + case Instruction::SDiv: return SimplifySDivInst(LHS, RHS, Q, MaxRecurse); + case Instruction::UDiv: return SimplifyUDivInst(LHS, RHS, Q, MaxRecurse); + case Instruction::FDiv: return SimplifyFDivInst(LHS, RHS, Q, MaxRecurse); + case Instruction::SRem: return SimplifySRemInst(LHS, RHS, Q, MaxRecurse); + case Instruction::URem: return SimplifyURemInst(LHS, RHS, Q, MaxRecurse); + case Instruction::FRem: return SimplifyFRemInst(LHS, RHS, Q, MaxRecurse); case Instruction::Shl: return SimplifyShlInst(LHS, RHS, /*isNSW*/false, /*isNUW*/false, - TD, TLI, DT, MaxRecurse); + Q, MaxRecurse); case Instruction::LShr: - return SimplifyLShrInst(LHS, RHS, /*isExact*/false, TD, TLI, DT, - MaxRecurse); + return SimplifyLShrInst(LHS, RHS, /*isExact*/false, Q, MaxRecurse); case Instruction::AShr: - return SimplifyAShrInst(LHS, RHS, /*isExact*/false, TD, TLI, DT, - MaxRecurse); - case Instruction::And: return SimplifyAndInst(LHS, RHS, TD, TLI, DT, - MaxRecurse); - case Instruction::Or: return SimplifyOrInst (LHS, RHS, TD, TLI, DT, - MaxRecurse); - case Instruction::Xor: return SimplifyXorInst(LHS, RHS, TD, TLI, DT, - MaxRecurse); + return SimplifyAShrInst(LHS, RHS, /*isExact*/false, Q, MaxRecurse); + case Instruction::And: return SimplifyAndInst(LHS, RHS, Q, MaxRecurse); + case Instruction::Or: return SimplifyOrInst (LHS, RHS, Q, MaxRecurse); + case Instruction::Xor: return SimplifyXorInst(LHS, RHS, Q, MaxRecurse); default: if (Constant *CLHS = dyn_cast<Constant>(LHS)) if (Constant *CRHS = dyn_cast<Constant>(RHS)) { Constant *COps[] = {CLHS, CRHS}; - return ConstantFoldInstOperands(Opcode, LHS->getType(), COps, TD, TLI); + return ConstantFoldInstOperands(Opcode, LHS->getType(), COps, Q.TD, + Q.TLI); } // If the operation is associative, try some generic simplifications. if (Instruction::isAssociative(Opcode)) - if (Value *V = SimplifyAssociativeBinOp(Opcode, LHS, RHS, TD, TLI, DT, - MaxRecurse)) + if (Value *V = SimplifyAssociativeBinOp(Opcode, LHS, RHS, Q, MaxRecurse)) return V; - // If the operation is with the result of a select instruction, check whether + // If the operation is with the result of a select instruction check whether // operating on either branch of the select always yields the same value. 
if (isa<SelectInst>(LHS) || isa<SelectInst>(RHS)) - if (Value *V = ThreadBinOpOverSelect(Opcode, LHS, RHS, TD, TLI, DT, - MaxRecurse)) + if (Value *V = ThreadBinOpOverSelect(Opcode, LHS, RHS, Q, MaxRecurse)) return V; // If the operation is with the result of a phi instruction, check whether // operating on all incoming values of the phi always yields the same value. if (isa<PHINode>(LHS) || isa<PHINode>(RHS)) - if (Value *V = ThreadBinOpOverPHI(Opcode, LHS, RHS, TD, TLI, DT, - MaxRecurse)) + if (Value *V = ThreadBinOpOverPHI(Opcode, LHS, RHS, Q, MaxRecurse)) return V; return 0; @@ -2612,28 +2681,26 @@ static Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, Value *llvm::SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const TargetData *TD, const TargetLibraryInfo *TLI, const DominatorTree *DT) { - return ::SimplifyBinOp(Opcode, LHS, RHS, TD, TLI, DT, RecursionLimit); + return ::SimplifyBinOp(Opcode, LHS, RHS, Query (TD, TLI, DT), RecursionLimit); } /// SimplifyCmpInst - Given operands for a CmpInst, see if we can /// fold the result. static Value *SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, - const TargetData *TD, - const TargetLibraryInfo *TLI, - const DominatorTree *DT, - unsigned MaxRecurse) { + const Query &Q, unsigned MaxRecurse) { if (CmpInst::isIntPredicate((CmpInst::Predicate)Predicate)) - return SimplifyICmpInst(Predicate, LHS, RHS, TD, TLI, DT, MaxRecurse); - return SimplifyFCmpInst(Predicate, LHS, RHS, TD, TLI, DT, MaxRecurse); + return SimplifyICmpInst(Predicate, LHS, RHS, Q, MaxRecurse); + return SimplifyFCmpInst(Predicate, LHS, RHS, Q, MaxRecurse); } Value *llvm::SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, const TargetData *TD, const TargetLibraryInfo *TLI, const DominatorTree *DT) { - return ::SimplifyCmpInst(Predicate, LHS, RHS, TD, TLI, DT, RecursionLimit); + return ::SimplifyCmpInst(Predicate, LHS, RHS, Query (TD, TLI, DT), + RecursionLimit); } -static Value *SimplifyCallInst(CallInst *CI) { +static Value *SimplifyCallInst(CallInst *CI, const Query &) { // call undef -> undef if (isa<UndefValue>(CI->getCalledValue())) return UndefValue::get(CI->getType()); @@ -2720,25 +2787,28 @@ Value *llvm::SimplifyInstruction(Instruction *I, const TargetData *TD, break; case Instruction::Select: Result = SimplifySelectInst(I->getOperand(0), I->getOperand(1), - I->getOperand(2), TD, DT); + I->getOperand(2), TD, TLI, DT); break; case Instruction::GetElementPtr: { SmallVector<Value*, 8> Ops(I->op_begin(), I->op_end()); - Result = SimplifyGEPInst(Ops, TD, DT); + Result = SimplifyGEPInst(Ops, TD, TLI, DT); break; } case Instruction::InsertValue: { InsertValueInst *IV = cast<InsertValueInst>(I); Result = SimplifyInsertValueInst(IV->getAggregateOperand(), IV->getInsertedValueOperand(), - IV->getIndices(), TD, DT); + IV->getIndices(), TD, TLI, DT); break; } case Instruction::PHI: - Result = SimplifyPHINode(cast<PHINode>(I), DT); + Result = SimplifyPHINode(cast<PHINode>(I), Query (TD, TLI, DT)); break; case Instruction::Call: - Result = SimplifyCallInst(cast<CallInst>(I)); + Result = SimplifyCallInst(cast<CallInst>(I), Query (TD, TLI, DT)); + break; + case Instruction::Trunc: + Result = SimplifyTruncInst(I->getOperand(0), I->getType(), TD, TLI, DT); break; } diff --git a/lib/Analysis/LazyValueInfo.cpp b/lib/Analysis/LazyValueInfo.cpp index 279d6a9..5ca2746 100644 --- a/lib/Analysis/LazyValueInfo.cpp +++ b/lib/Analysis/LazyValueInfo.cpp @@ -866,10 +866,11 @@ bool LazyValueInfoCache::getEdgeValue(Value *Val, BasicBlock *BBFrom, // BBFrom to BBTo. 
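
Note: the switch-related hunks here and below (LazyValueInfo, SparsePropagation, BitcodeWriter) all move from index-based case access to an iterator that exposes each case's value and successor together. A rough stand-in sketch of that shape, mirroring the edge-counting loop that follows — SwitchCases is illustrative, not the real llvm::SwitchInst API:

#include <cstdio>
#include <utility>
#include <vector>

struct Case { int Value; int SuccessorBlock; };

class SwitchCases {
  std::vector<Case> Cases;
public:
  explicit SwitchCases(std::vector<Case> C) : Cases(std::move(C)) {}
  using CaseIt = std::vector<Case>::const_iterator;
  CaseIt case_begin() const { return Cases.begin(); }
  CaseIt case_end() const { return Cases.end(); }
};

int main() {
  SwitchCases SI({{0, 2}, {1, 3}, {7, 2}});
  int TargetBlock = 2, NumEdges = 0;
  for (SwitchCases::CaseIt I = SI.case_begin(), E = SI.case_end(); I != E; ++I) {
    if (I->SuccessorBlock != TargetBlock)
      continue;              // only count edges into the block we care about
    ++NumEdges;
  }
  std::printf("edges into block %d: %d\n", TargetBlock, NumEdges);  // prints 2
  return 0;
}
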
unsigned NumEdges = 0; ConstantInt *EdgeVal = 0; - for (unsigned i = 0, e = SI->getNumCases(); i != e; ++i) { - if (SI->getCaseSuccessor(i) != BBTo) continue; + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); + i != e; ++i) { + if (i.getCaseSuccessor() != BBTo) continue; if (NumEdges++) break; - EdgeVal = SI->getCaseValue(i); + EdgeVal = i.getCaseValue(); } assert(EdgeVal && "Missing successor?"); if (NumEdges == 1) { diff --git a/lib/Analysis/Loads.cpp b/lib/Analysis/Loads.cpp index 0e6bcbf..873a275 100644 --- a/lib/Analysis/Loads.cpp +++ b/lib/Analysis/Loads.cpp @@ -17,6 +17,7 @@ #include "llvm/GlobalAlias.h" #include "llvm/GlobalVariable.h" #include "llvm/IntrinsicInst.h" +#include "llvm/LLVMContext.h" #include "llvm/Operator.h" using namespace llvm; @@ -160,10 +161,15 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, Instruction *ScanFrom, /// MaxInstsToScan specifies the maximum instructions to scan in the block. If /// it is set to 0, it will scan the whole block. You can also optionally /// specify an alias analysis implementation, which makes this more precise. +/// +/// If TBAATag is non-null and a load or store is found, the TBAA tag from the +/// load or store is recorded there. If there is no TBAA tag or if no access +/// is found, it is left unmodified. Value *llvm::FindAvailableLoadedValue(Value *Ptr, BasicBlock *ScanBB, BasicBlock::iterator &ScanFrom, unsigned MaxInstsToScan, - AliasAnalysis *AA) { + AliasAnalysis *AA, + MDNode **TBAATag) { if (MaxInstsToScan == 0) MaxInstsToScan = ~0U; // If we're using alias analysis to disambiguate get the size of *Ptr. @@ -191,15 +197,19 @@ Value *llvm::FindAvailableLoadedValue(Value *Ptr, BasicBlock *ScanBB, // (This is true even if the load is volatile or atomic, although // those cases are unlikely.) if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) - if (AreEquivalentAddressValues(LI->getOperand(0), Ptr)) + if (AreEquivalentAddressValues(LI->getOperand(0), Ptr)) { + if (TBAATag) *TBAATag = LI->getMetadata(LLVMContext::MD_tbaa); return LI; + } if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { // If this is a store through Ptr, the value is available! // (This is true even if the store is volatile or atomic, although // those cases are unlikely.) - if (AreEquivalentAddressValues(SI->getOperand(1), Ptr)) + if (AreEquivalentAddressValues(SI->getOperand(1), Ptr)) { + if (TBAATag) *TBAATag = SI->getMetadata(LLVMContext::MD_tbaa); return SI->getOperand(0); + } // If Ptr is an alloca and this is a store to a different alloca, ignore // the store. This is a trivial form of alias analysis that is important diff --git a/lib/Analysis/PHITransAddr.cpp b/lib/Analysis/PHITransAddr.cpp index ca06300..38cb1c9 100644 --- a/lib/Analysis/PHITransAddr.cpp +++ b/lib/Analysis/PHITransAddr.cpp @@ -227,7 +227,7 @@ Value *PHITransAddr::PHITranslateSubExpr(Value *V, BasicBlock *CurBB, return GEP; // Simplify the GEP to handle 'gep x, 0' -> x etc. 
- if (Value *V = SimplifyGEPInst(GEPOps, TD, DT)) { + if (Value *V = SimplifyGEPInst(GEPOps, TD, TLI, DT)) { for (unsigned i = 0, e = GEPOps.size(); i != e; ++i) RemoveInstInputs(GEPOps[i], InstInputs); diff --git a/lib/Analysis/SparsePropagation.cpp b/lib/Analysis/SparsePropagation.cpp index 0c7d05f..c819666 100644 --- a/lib/Analysis/SparsePropagation.cpp +++ b/lib/Analysis/SparsePropagation.cpp @@ -194,9 +194,8 @@ void SparseSolver::getFeasibleSuccessors(TerminatorInst &TI, Succs.assign(TI.getNumSuccessors(), true); return; } - - unsigned CCase = SI.findCaseValue(cast<ConstantInt>(C)); - Succs[SI.resolveSuccessorIndex(CCase)] = true; + SwitchInst::CaseIt Case = SI.findCaseValue(cast<ConstantInt>(C)); + Succs[Case.getSuccessorIndex()] = true; } diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp index b5811f2..01e00ca 100644 --- a/lib/Analysis/ValueTracking.cpp +++ b/lib/Analysis/ValueTracking.cpp @@ -41,6 +41,160 @@ static unsigned getBitWidth(Type *Ty, const TargetData *TD) { return TD ? TD->getPointerSizeInBits() : 0; } +static void ComputeMaskedBitsAddSub(bool Add, Value *Op0, Value *Op1, bool NSW, + const APInt &Mask, + APInt &KnownZero, APInt &KnownOne, + APInt &KnownZero2, APInt &KnownOne2, + const TargetData *TD, unsigned Depth) { + if (!Add) { + if (ConstantInt *CLHS = dyn_cast<ConstantInt>(Op0)) { + // We know that the top bits of C-X are clear if X contains less bits + // than C (i.e. no wrap-around can happen). For example, 20-X is + // positive if we can prove that X is >= 0 and < 16. + if (!CLHS->getValue().isNegative()) { + unsigned BitWidth = Mask.getBitWidth(); + unsigned NLZ = (CLHS->getValue()+1).countLeadingZeros(); + // NLZ can't be BitWidth with no sign bit + APInt MaskV = APInt::getHighBitsSet(BitWidth, NLZ+1); + llvm::ComputeMaskedBits(Op1, MaskV, KnownZero2, KnownOne2, TD, Depth+1); + + // If all of the MaskV bits are known to be zero, then we know the + // output top bits are zero, because we now know that the output is + // from [0-C]. + if ((KnownZero2 & MaskV) == MaskV) { + unsigned NLZ2 = CLHS->getValue().countLeadingZeros(); + // Top bits known zero. + KnownZero = APInt::getHighBitsSet(BitWidth, NLZ2) & Mask; + } + } + } + } + + unsigned BitWidth = Mask.getBitWidth(); + + // If one of the operands has trailing zeros, then the bits that the + // other operand has in those bit positions will be preserved in the + // result. For an add, this works with either operand. For a subtract, + // this only works if the known zeros are in the right operand. + APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0); + APInt Mask2 = APInt::getLowBitsSet(BitWidth, + BitWidth - Mask.countLeadingZeros()); + llvm::ComputeMaskedBits(Op0, Mask2, LHSKnownZero, LHSKnownOne, TD, Depth+1); + assert((LHSKnownZero & LHSKnownOne) == 0 && + "Bits known to be one AND zero?"); + unsigned LHSKnownZeroOut = LHSKnownZero.countTrailingOnes(); + + llvm::ComputeMaskedBits(Op1, Mask2, KnownZero2, KnownOne2, TD, Depth+1); + assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); + unsigned RHSKnownZeroOut = KnownZero2.countTrailingOnes(); + + // Determine which operand has more trailing zeros, and use that + // many bits from the other operand. 
+ if (LHSKnownZeroOut > RHSKnownZeroOut) { + if (Add) { + APInt Mask = APInt::getLowBitsSet(BitWidth, LHSKnownZeroOut); + KnownZero |= KnownZero2 & Mask; + KnownOne |= KnownOne2 & Mask; + } else { + // If the known zeros are in the left operand for a subtract, + // fall back to the minimum known zeros in both operands. + KnownZero |= APInt::getLowBitsSet(BitWidth, + std::min(LHSKnownZeroOut, + RHSKnownZeroOut)); + } + } else if (RHSKnownZeroOut >= LHSKnownZeroOut) { + APInt Mask = APInt::getLowBitsSet(BitWidth, RHSKnownZeroOut); + KnownZero |= LHSKnownZero & Mask; + KnownOne |= LHSKnownOne & Mask; + } + + // Are we still trying to solve for the sign bit? + if (Mask.isNegative() && !KnownZero.isNegative() && !KnownOne.isNegative()) { + if (NSW) { + if (Add) { + // Adding two positive numbers can't wrap into negative + if (LHSKnownZero.isNegative() && KnownZero2.isNegative()) + KnownZero |= APInt::getSignBit(BitWidth); + // and adding two negative numbers can't wrap into positive. + else if (LHSKnownOne.isNegative() && KnownOne2.isNegative()) + KnownOne |= APInt::getSignBit(BitWidth); + } else { + // Subtracting a negative number from a positive one can't wrap + if (LHSKnownZero.isNegative() && KnownOne2.isNegative()) + KnownZero |= APInt::getSignBit(BitWidth); + // neither can subtracting a positive number from a negative one. + else if (LHSKnownOne.isNegative() && KnownZero2.isNegative()) + KnownOne |= APInt::getSignBit(BitWidth); + } + } + } +} + +static void ComputeMaskedBitsMul(Value *Op0, Value *Op1, bool NSW, + const APInt &Mask, + APInt &KnownZero, APInt &KnownOne, + APInt &KnownZero2, APInt &KnownOne2, + const TargetData *TD, unsigned Depth) { + unsigned BitWidth = Mask.getBitWidth(); + APInt Mask2 = APInt::getAllOnesValue(BitWidth); + ComputeMaskedBits(Op1, Mask2, KnownZero, KnownOne, TD, Depth+1); + ComputeMaskedBits(Op0, Mask2, KnownZero2, KnownOne2, TD, Depth+1); + assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); + assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); + + bool isKnownNegative = false; + bool isKnownNonNegative = false; + // If the multiplication is known not to overflow, compute the sign bit. + if (Mask.isNegative() && NSW) { + if (Op0 == Op1) { + // The product of a number with itself is non-negative. + isKnownNonNegative = true; + } else { + bool isKnownNonNegativeOp1 = KnownZero.isNegative(); + bool isKnownNonNegativeOp0 = KnownZero2.isNegative(); + bool isKnownNegativeOp1 = KnownOne.isNegative(); + bool isKnownNegativeOp0 = KnownOne2.isNegative(); + // The product of two numbers with the same sign is non-negative. + isKnownNonNegative = (isKnownNegativeOp1 && isKnownNegativeOp0) || + (isKnownNonNegativeOp1 && isKnownNonNegativeOp0); + // The product of a negative number and a non-negative number is either + // negative or zero. + if (!isKnownNonNegative) + isKnownNegative = (isKnownNegativeOp1 && isKnownNonNegativeOp0 && + isKnownNonZero(Op0, TD, Depth)) || + (isKnownNegativeOp0 && isKnownNonNegativeOp1 && + isKnownNonZero(Op1, TD, Depth)); + } + } + + // If low bits are zero in either operand, output low known-0 bits. + // Also compute a conserative estimate for high known-0 bits. + // More trickiness is possible, but this is sufficient for the + // interesting case of alignment computation. 
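
The comment above leans on a simple arithmetic fact: a product has at least as many trailing zero bits as its two factors combined (capped at the bit width), which is exactly what matters for alignment. A small plain-C++ illustration of that fact, not the APInt-based code from the patch:

#include <cstdint>
#include <cstdio>

static unsigned countTrailingZeros(uint32_t V) {
  if (V == 0) return 32;
  unsigned N = 0;
  while ((V & 1) == 0) { V >>= 1; ++N; }
  return N;
}

int main() {
  uint32_t A = 12;     // 0b1100, 2 trailing zeros
  uint32_t B = 40;     // 0b101000, 3 trailing zeros
  uint32_t P = A * B;  // 480 = 0b111100000, at least 2 + 3 trailing zeros
  std::printf("ctz(A)=%u ctz(B)=%u ctz(A*B)=%u\n",
              countTrailingZeros(A), countTrailingZeros(B),
              countTrailingZeros(P));  // 2 3 5
  return 0;
}
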
+ KnownOne.clearAllBits(); + unsigned TrailZ = KnownZero.countTrailingOnes() + + KnownZero2.countTrailingOnes(); + unsigned LeadZ = std::max(KnownZero.countLeadingOnes() + + KnownZero2.countLeadingOnes(), + BitWidth) - BitWidth; + + TrailZ = std::min(TrailZ, BitWidth); + LeadZ = std::min(LeadZ, BitWidth); + KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ) | + APInt::getHighBitsSet(BitWidth, LeadZ); + KnownZero &= Mask; + + // Only make use of no-wrap flags if we failed to compute the sign bit + // directly. This matters if the multiplication always overflows, in + // which case we prefer to follow the result of the direct computation, + // though as the program is invoking undefined behaviour we can choose + // whatever we like here. + if (isKnownNonNegative && !KnownOne.isNegative()) + KnownZero.setBit(BitWidth - 1); + else if (isKnownNegative && !KnownZero.isNegative()) + KnownOne.setBit(BitWidth - 1); +} + /// ComputeMaskedBits - Determine which of the bits specified in Mask are /// known to be either zero or one and return them in the KnownZero/KnownOne /// bit sets. This code only analyzes bits in Mask, in order to short-circuit @@ -106,16 +260,18 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask, // The address of an aligned GlobalValue has trailing zeros. if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) { unsigned Align = GV->getAlignment(); - if (Align == 0 && TD && GV->getType()->getElementType()->isSized()) { + if (Align == 0 && TD) { if (GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) { Type *ObjectType = GVar->getType()->getElementType(); - // If the object is defined in the current Module, we'll be giving - // it the preferred alignment. Otherwise, we have to assume that it - // may only have the minimum ABI alignment. - if (!GVar->isDeclaration() && !GVar->isWeakForLinker()) - Align = TD->getPreferredAlignment(GVar); - else - Align = TD->getABITypeAlignment(ObjectType); + if (ObjectType->isSized()) { + // If the object is defined in the current Module, we'll be giving + // it the preferred alignment. Otherwise, we have to assume that it + // may only have the minimum ABI alignment. + if (!GVar->isDeclaration() && !GVar->isWeakForLinker()) + Align = TD->getPreferredAlignment(GVar); + else + Align = TD->getABITypeAlignment(ObjectType); + } } } if (Align > 0) @@ -203,68 +359,11 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask, return; } case Instruction::Mul: { - APInt Mask2 = APInt::getAllOnesValue(BitWidth); - ComputeMaskedBits(I->getOperand(1), Mask2, KnownZero, KnownOne, TD,Depth+1); - ComputeMaskedBits(I->getOperand(0), Mask2, KnownZero2, KnownOne2, TD, - Depth+1); - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); - assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); - - bool isKnownNegative = false; - bool isKnownNonNegative = false; - // If the multiplication is known not to overflow, compute the sign bit. - if (Mask.isNegative() && - cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap()) { - Value *Op1 = I->getOperand(1), *Op2 = I->getOperand(0); - if (Op1 == Op2) { - // The product of a number with itself is non-negative. - isKnownNonNegative = true; - } else { - bool isKnownNonNegative1 = KnownZero.isNegative(); - bool isKnownNonNegative2 = KnownZero2.isNegative(); - bool isKnownNegative1 = KnownOne.isNegative(); - bool isKnownNegative2 = KnownOne2.isNegative(); - // The product of two numbers with the same sign is non-negative. 
- isKnownNonNegative = (isKnownNegative1 && isKnownNegative2) || - (isKnownNonNegative1 && isKnownNonNegative2); - // The product of a negative number and a non-negative number is either - // negative or zero. - if (!isKnownNonNegative) - isKnownNegative = (isKnownNegative1 && isKnownNonNegative2 && - isKnownNonZero(Op2, TD, Depth)) || - (isKnownNegative2 && isKnownNonNegative1 && - isKnownNonZero(Op1, TD, Depth)); - } - } - - // If low bits are zero in either operand, output low known-0 bits. - // Also compute a conserative estimate for high known-0 bits. - // More trickiness is possible, but this is sufficient for the - // interesting case of alignment computation. - KnownOne.clearAllBits(); - unsigned TrailZ = KnownZero.countTrailingOnes() + - KnownZero2.countTrailingOnes(); - unsigned LeadZ = std::max(KnownZero.countLeadingOnes() + - KnownZero2.countLeadingOnes(), - BitWidth) - BitWidth; - - TrailZ = std::min(TrailZ, BitWidth); - LeadZ = std::min(LeadZ, BitWidth); - KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ) | - APInt::getHighBitsSet(BitWidth, LeadZ); - KnownZero &= Mask; - - // Only make use of no-wrap flags if we failed to compute the sign bit - // directly. This matters if the multiplication always overflows, in - // which case we prefer to follow the result of the direct computation, - // though as the program is invoking undefined behaviour we can choose - // whatever we like here. - if (isKnownNonNegative && !KnownOne.isNegative()) - KnownZero.setBit(BitWidth - 1); - else if (isKnownNegative && !KnownZero.isNegative()) - KnownOne.setBit(BitWidth - 1); - - return; + bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap(); + ComputeMaskedBitsMul(I->getOperand(0), I->getOperand(1), NSW, + Mask, KnownZero, KnownOne, KnownZero2, KnownOne2, + TD, Depth); + break; } case Instruction::UDiv: { // For the purposes of computing leading zeros we can conservatively @@ -422,91 +521,18 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask, } break; case Instruction::Sub: { - if (ConstantInt *CLHS = dyn_cast<ConstantInt>(I->getOperand(0))) { - // We know that the top bits of C-X are clear if X contains less bits - // than C (i.e. no wrap-around can happen). For example, 20-X is - // positive if we can prove that X is >= 0 and < 16. - if (!CLHS->getValue().isNegative()) { - unsigned NLZ = (CLHS->getValue()+1).countLeadingZeros(); - // NLZ can't be BitWidth with no sign bit - APInt MaskV = APInt::getHighBitsSet(BitWidth, NLZ+1); - ComputeMaskedBits(I->getOperand(1), MaskV, KnownZero2, KnownOne2, - TD, Depth+1); - - // If all of the MaskV bits are known to be zero, then we know the - // output top bits are zero, because we now know that the output is - // from [0-C]. - if ((KnownZero2 & MaskV) == MaskV) { - unsigned NLZ2 = CLHS->getValue().countLeadingZeros(); - // Top bits known zero. - KnownZero = APInt::getHighBitsSet(BitWidth, NLZ2) & Mask; - } - } - } + bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap(); + ComputeMaskedBitsAddSub(false, I->getOperand(0), I->getOperand(1), NSW, + Mask, KnownZero, KnownOne, KnownZero2, KnownOne2, + TD, Depth); + break; } - // fall through case Instruction::Add: { - // If one of the operands has trailing zeros, then the bits that the - // other operand has in those bit positions will be preserved in the - // result. For an add, this works with either operand. For a subtract, - // this only works if the known zeros are in the right operand. 
- APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0); - APInt Mask2 = APInt::getLowBitsSet(BitWidth, - BitWidth - Mask.countLeadingZeros()); - ComputeMaskedBits(I->getOperand(0), Mask2, LHSKnownZero, LHSKnownOne, TD, - Depth+1); - assert((LHSKnownZero & LHSKnownOne) == 0 && - "Bits known to be one AND zero?"); - unsigned LHSKnownZeroOut = LHSKnownZero.countTrailingOnes(); - - ComputeMaskedBits(I->getOperand(1), Mask2, KnownZero2, KnownOne2, TD, - Depth+1); - assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); - unsigned RHSKnownZeroOut = KnownZero2.countTrailingOnes(); - - // Determine which operand has more trailing zeros, and use that - // many bits from the other operand. - if (LHSKnownZeroOut > RHSKnownZeroOut) { - if (I->getOpcode() == Instruction::Add) { - APInt Mask = APInt::getLowBitsSet(BitWidth, LHSKnownZeroOut); - KnownZero |= KnownZero2 & Mask; - KnownOne |= KnownOne2 & Mask; - } else { - // If the known zeros are in the left operand for a subtract, - // fall back to the minimum known zeros in both operands. - KnownZero |= APInt::getLowBitsSet(BitWidth, - std::min(LHSKnownZeroOut, - RHSKnownZeroOut)); - } - } else if (RHSKnownZeroOut >= LHSKnownZeroOut) { - APInt Mask = APInt::getLowBitsSet(BitWidth, RHSKnownZeroOut); - KnownZero |= LHSKnownZero & Mask; - KnownOne |= LHSKnownOne & Mask; - } - - // Are we still trying to solve for the sign bit? - if (Mask.isNegative() && !KnownZero.isNegative() && !KnownOne.isNegative()){ - OverflowingBinaryOperator *OBO = cast<OverflowingBinaryOperator>(I); - if (OBO->hasNoSignedWrap()) { - if (I->getOpcode() == Instruction::Add) { - // Adding two positive numbers can't wrap into negative - if (LHSKnownZero.isNegative() && KnownZero2.isNegative()) - KnownZero |= APInt::getSignBit(BitWidth); - // and adding two negative numbers can't wrap into positive. - else if (LHSKnownOne.isNegative() && KnownOne2.isNegative()) - KnownOne |= APInt::getSignBit(BitWidth); - } else { - // Subtracting a negative number from a positive one can't wrap - if (LHSKnownZero.isNegative() && KnownOne2.isNegative()) - KnownZero |= APInt::getSignBit(BitWidth); - // neither can subtracting a positive number from a negative one. - else if (LHSKnownOne.isNegative() && KnownZero2.isNegative()) - KnownOne |= APInt::getSignBit(BitWidth); - } - } - } - - return; + bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap(); + ComputeMaskedBitsAddSub(true, I->getOperand(0), I->getOperand(1), NSW, + Mask, KnownZero, KnownOne, KnownZero2, KnownOne2, + TD, Depth); + break; } case Instruction::SRem: if (ConstantInt *Rem = dyn_cast<ConstantInt>(I->getOperand(1))) { @@ -691,8 +717,8 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask, if (P->hasConstantValue() == P) break; - KnownZero = APInt::getAllOnesValue(BitWidth); - KnownOne = APInt::getAllOnesValue(BitWidth); + KnownZero = Mask; + KnownOne = Mask; for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i) { // Skip direct self references. if (P->getIncomingValue(i) == P) continue; @@ -723,21 +749,51 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask, // If this call is undefined for 0, the result will be less than 2^n. 
if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext())) LowBits -= 1; - KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits); + KnownZero = Mask & APInt::getHighBitsSet(BitWidth, BitWidth - LowBits); break; } case Intrinsic::ctpop: { unsigned LowBits = Log2_32(BitWidth)+1; - KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits); + KnownZero = Mask & APInt::getHighBitsSet(BitWidth, BitWidth - LowBits); break; } case Intrinsic::x86_sse42_crc32_64_8: case Intrinsic::x86_sse42_crc32_64_64: - KnownZero = APInt::getHighBitsSet(64, 32); + KnownZero = Mask & APInt::getHighBitsSet(64, 32); break; } } break; + case Instruction::ExtractValue: + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I->getOperand(0))) { + ExtractValueInst *EVI = cast<ExtractValueInst>(I); + if (EVI->getNumIndices() != 1) break; + if (EVI->getIndices()[0] == 0) { + switch (II->getIntrinsicID()) { + default: break; + case Intrinsic::uadd_with_overflow: + case Intrinsic::sadd_with_overflow: + ComputeMaskedBitsAddSub(true, II->getArgOperand(0), + II->getArgOperand(1), false, Mask, + KnownZero, KnownOne, KnownZero2, KnownOne2, + TD, Depth); + break; + case Intrinsic::usub_with_overflow: + case Intrinsic::ssub_with_overflow: + ComputeMaskedBitsAddSub(false, II->getArgOperand(0), + II->getArgOperand(1), false, Mask, + KnownZero, KnownOne, KnownZero2, KnownOne2, + TD, Depth); + break; + case Intrinsic::umul_with_overflow: + case Intrinsic::smul_with_overflow: + ComputeMaskedBitsMul(II->getArgOperand(0), II->getArgOperand(1), + false, Mask, KnownZero, KnownOne, + KnownZero2, KnownOne2, TD, Depth); + break; + } + } + } } } diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index 9376990..b25d2e9 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -1142,9 +1142,10 @@ static void WriteInstruction(const Instruction &I, unsigned InstID, Vals.push_back(VE.getTypeID(SI.getCondition()->getType())); Vals.push_back(VE.getValueID(SI.getCondition())); Vals.push_back(VE.getValueID(SI.getDefaultDest())); - for (unsigned i = 0, e = SI.getNumCases(); i != e; ++i) { - Vals.push_back(VE.getValueID(SI.getCaseValue(i))); - Vals.push_back(VE.getValueID(SI.getCaseSuccessor(i))); + for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end(); + i != e; ++i) { + Vals.push_back(VE.getValueID(i.getCaseValue())); + Vals.push_back(VE.getValueID(i.getCaseSuccessor())); } } break; diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index d5926f9..dd3fb3b 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1864,13 +1864,12 @@ static void EmitGlobalConstantLargeInt(const ConstantInt *CI, static void EmitGlobalConstantImpl(const Constant *CV, unsigned AddrSpace, AsmPrinter &AP) { - if (isa<ConstantAggregateZero>(CV) || isa<UndefValue>(CV)) { - uint64_t Size = AP.TM.getTargetData()->getTypeAllocSize(CV->getType()); + const TargetData *TD = AP.TM.getTargetData(); + uint64_t Size = TD->getTypeAllocSize(CV->getType()); + if (isa<ConstantAggregateZero>(CV) || isa<UndefValue>(CV)) return AP.OutStreamer.EmitZeros(Size, AddrSpace); - } if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) { - unsigned Size = AP.TM.getTargetData()->getTypeAllocSize(CV->getType()); switch (Size) { case 1: case 2: @@ -1891,7 +1890,6 @@ static void EmitGlobalConstantImpl(const Constant *CV, unsigned AddrSpace, return EmitGlobalConstantFP(CFP, AddrSpace, AP); if (isa<ConstantPointerNull>(CV)) { 
- unsigned Size = AP.TM.getTargetData()->getTypeAllocSize(CV->getType()); AP.OutStreamer.EmitIntValue(0, Size, AddrSpace); return; } @@ -1905,20 +1903,28 @@ static void EmitGlobalConstantImpl(const Constant *CV, unsigned AddrSpace, if (const ConstantStruct *CVS = dyn_cast<ConstantStruct>(CV)) return EmitGlobalConstantStruct(CVS, AddrSpace, AP); - // Look through bitcasts, which might not be able to be MCExpr'ized (e.g. of - // vectors). - if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV)) + if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV)) { + // Look through bitcasts, which might not be able to be MCExpr'ized (e.g. of + // vectors). if (CE->getOpcode() == Instruction::BitCast) return EmitGlobalConstantImpl(CE->getOperand(0), AddrSpace, AP); + + if (Size > 8) { + // If the constant expression's size is greater than 64-bits, then we have + // to emit the value in chunks. Try to constant fold the value and emit it + // that way. + Constant *New = ConstantFoldConstantExpression(CE, TD); + if (New && New != CE) + return EmitGlobalConstantImpl(New, AddrSpace, AP); + } + } if (const ConstantVector *V = dyn_cast<ConstantVector>(CV)) return EmitGlobalConstantVector(V, AddrSpace, AP); // Otherwise, it must be a ConstantExpr. Lower it to an MCExpr, then emit it // thread the streamer with EmitValue. - AP.OutStreamer.EmitValue(LowerConstant(CV, AP), - AP.TM.getTargetData()->getTypeAllocSize(CV->getType()), - AddrSpace); + AP.OutStreamer.EmitValue(LowerConstant(CV, AP), Size, AddrSpace); } /// EmitGlobalConstant - Print a general LLVM constant to the .s file. @@ -2102,27 +2108,22 @@ void AsmPrinter::EmitBasicBlockStart(const MachineBasicBlock *MBB) const { OutStreamer.EmitLabel(Syms[i]); } + // Print some verbose block comments. + if (isVerbose()) { + if (const BasicBlock *BB = MBB->getBasicBlock()) + if (BB->hasName()) + OutStreamer.AddComment("%" + BB->getName()); + EmitBasicBlockLoopComments(*MBB, LI, *this); + } + // Print the main label for the block. if (MBB->pred_empty() || isBlockOnlyReachableByFallthrough(MBB)) { if (isVerbose() && OutStreamer.hasRawTextSupport()) { - if (const BasicBlock *BB = MBB->getBasicBlock()) - if (BB->hasName()) - OutStreamer.AddComment("%" + BB->getName()); - - EmitBasicBlockLoopComments(*MBB, LI, *this); - // NOTE: Want this comment at start of line, don't emit with AddComment. OutStreamer.EmitRawText(Twine(MAI->getCommentString()) + " BB#" + Twine(MBB->getNumber()) + ":"); } } else { - if (isVerbose()) { - if (const BasicBlock *BB = MBB->getBasicBlock()) - if (BB->hasName()) - OutStreamer.AddComment("%" + BB->getName()); - EmitBasicBlockLoopComments(*MBB, LI, *this); - } - OutStreamer.EmitLabel(MBB->getSymbol()); } } diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index 51c635e..3b383f6 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -156,13 +156,12 @@ void CompileUnit::addSourceLine(DIE *Die, DISubprogram SP) { // Verify subprogram. if (!SP.Verify()) return; - // If the line number is 0, don't add it. - if (SP.getLineNumber() == 0) - return; + // If the line number is 0, don't add it. 
unsigned Line = SP.getLineNumber(); - if (!SP.getContext().Verify()) + if (Line == 0) return; + unsigned FileID = DD->GetOrCreateSourceID(SP.getFilename(), SP.getDirectory()); assert(FileID && "Invalid file id"); @@ -178,7 +177,7 @@ void CompileUnit::addSourceLine(DIE *Die, DIType Ty) { return; unsigned Line = Ty.getLineNumber(); - if (Line == 0 || !Ty.getContext().Verify()) + if (Line == 0) return; unsigned FileID = DD->GetOrCreateSourceID(Ty.getFilename(), Ty.getDirectory()); @@ -870,11 +869,6 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) { if (CTy.isAppleBlockExtension()) addUInt(&Buffer, dwarf::DW_AT_APPLE_block, dwarf::DW_FORM_flag, 1); - unsigned RLang = CTy.getRunTimeLang(); - if (RLang) - addUInt(&Buffer, dwarf::DW_AT_APPLE_runtime_class, - dwarf::DW_FORM_data1, RLang); - DICompositeType ContainingType = CTy.getContainingType(); if (DIDescriptor(ContainingType).isCompositeType()) addDIEEntry(&Buffer, dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4, @@ -922,6 +916,12 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) { // Add source line info if available. if (!CTy.isForwardDecl()) addSourceLine(&Buffer, CTy); + + // No harm in adding the runtime language to the declaration. + unsigned RLang = CTy.getRunTimeLang(); + if (RLang) + addUInt(&Buffer, dwarf::DW_AT_APPLE_runtime_class, + dwarf::DW_FORM_data1, RLang); } } @@ -1006,6 +1006,9 @@ DIE *CompileUnit::getOrCreateSubprogramDIE(DISubprogram SP) { // Add function template parameters. addTemplateParams(*SPDie, SP.getTemplateParams()); + // Unfortunately this code needs to stay here to work around + // a bug in older gdbs that requires the linkage name to resolve + // multiple template functions. StringRef LinkageName = SP.getLinkageName(); if (!LinkageName.empty()) addString(SPDie, dwarf::DW_AT_MIPS_linkage_name, diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index fa62169..388cef4 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -523,20 +523,19 @@ unsigned DwarfDebug::GetOrCreateSourceID(StringRef FileName, DirName = ""; unsigned SrcId = SourceIdMap.size()+1; - std::pair<std::string, std::string> SourceName = - std::make_pair(FileName, DirName); - std::pair<std::pair<std::string, std::string>, unsigned> Entry = - make_pair(SourceName, SrcId); - std::map<std::pair<std::string, std::string>, unsigned>::iterator I; - bool NewlyInserted; - llvm::tie(I, NewlyInserted) = SourceIdMap.insert(Entry); - if (!NewlyInserted) - return I->second; + // We look up the file/dir pair by concatenating them with a zero byte. + SmallString<128> NamePair; + NamePair += DirName; + NamePair += '\0'; // Zero bytes are not allowed in paths. + NamePair += FileName; + + StringMapEntry<unsigned> &Ent = SourceIdMap.GetOrCreateValue(NamePair, SrcId); + if (Ent.getValue() != SrcId) + return Ent.getValue(); // Print out a .file directive to specify files for .loc directives. 
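
Aside: the GetOrCreateSourceID hunk above keys a single string map by directory and filename joined with an embedded NUL — a byte that cannot occur in a path — so distinct (dir, file) pairs can never collide the way plain concatenation could. A standard-library sketch of the same trick; std::string and std::unordered_map stand in for SmallString and StringMap, and the function name is illustrative:

#include <cstdio>
#include <string>
#include <unordered_map>

static unsigned getOrCreateSourceID(std::unordered_map<std::string, unsigned> &Map,
                                    const std::string &Dir,
                                    const std::string &File) {
  std::string Key = Dir;
  Key.push_back('\0');     // separator that cannot appear in either component
  Key += File;
  auto It = Map.emplace(Key, static_cast<unsigned>(Map.size() + 1));
  return It.first->second; // existing id, or the freshly assigned one
}

int main() {
  std::unordered_map<std::string, unsigned> Ids;
  std::printf("%u\n", getOrCreateSourceID(Ids, "lib/CodeGen", "Foo.cpp"));  // 1
  std::printf("%u\n", getOrCreateSourceID(Ids, "lib", "CodeGen/Foo.cpp"));  // 2, distinct pair
  std::printf("%u\n", getOrCreateSourceID(Ids, "lib/CodeGen", "Foo.cpp"));  // 1 again
  return 0;
}
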
- Asm->OutStreamer.EmitDwarfFileDirective(SrcId, Entry.first.second, - Entry.first.first); + Asm->OutStreamer.EmitDwarfFileDirective(SrcId, DirName, FileName); return SrcId; } diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h index 8b802d2..83f30f5 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -26,7 +26,6 @@ #include "llvm/ADT/UniqueVector.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/DebugLoc.h" -#include <map> namespace llvm { @@ -209,9 +208,9 @@ class DwarfDebug { /// std::vector<DIEAbbrev *> Abbreviations; - /// SourceIdMap - Source id map, i.e. pair of source filename and directory - /// mapped to a unique id. - std::map<std::pair<std::string, std::string>, unsigned> SourceIdMap; + /// SourceIdMap - Source id map, i.e. pair of source filename and directory, + /// separated by a zero byte, mapped to a unique id. + StringMap<unsigned> SourceIdMap; /// StringPool - A String->Symbol mapping of strings used by indirect /// references. diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp index 59c92b3..f57f4a8 100644 --- a/lib/CodeGen/BranchFolding.cpp +++ b/lib/CodeGen/BranchFolding.cpp @@ -1019,12 +1019,27 @@ static bool IsBetterFallthrough(MachineBasicBlock *MBB1, return MBB2I->isCall() && !MBB1I->isCall(); } +/// getBranchDebugLoc - Find and return, if any, the DebugLoc of the branch +/// instructions on the block. Always use the DebugLoc of the first +/// branching instruction found unless its absent, in which case use the +/// DebugLoc of the second if present. +static DebugLoc getBranchDebugLoc(MachineBasicBlock &MBB) { + MachineBasicBlock::iterator I = MBB.end(); + if (I == MBB.begin()) + return DebugLoc(); + --I; + while (I->isDebugValue() && I != MBB.begin()) + --I; + if (I->isBranch()) + return I->getDebugLoc(); + return DebugLoc(); +} + /// OptimizeBlock - Analyze and optimize control flow related to the specified /// block. This is never called on the entry block. bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) { bool MadeChange = false; MachineFunction &MF = *MBB->getParent(); - DebugLoc dl; // FIXME: this is nowhere ReoptimizeBlock: MachineFunction::iterator FallThrough = MBB; @@ -1073,6 +1088,7 @@ ReoptimizeBlock: // destination, remove the branch, replacing it with an unconditional one or // a fall-through. if (PriorTBB && PriorTBB == PriorFBB) { + DebugLoc dl = getBranchDebugLoc(PrevBB); TII->RemoveBranch(PrevBB); PriorCond.clear(); if (PriorTBB != MBB) @@ -1130,6 +1146,7 @@ ReoptimizeBlock: // If the prior block branches somewhere else on the condition and here if // the condition is false, remove the uncond second branch. 
if (PriorFBB == MBB) { + DebugLoc dl = getBranchDebugLoc(PrevBB); TII->RemoveBranch(PrevBB); TII->InsertBranch(PrevBB, PriorTBB, 0, PriorCond, dl); MadeChange = true; @@ -1143,6 +1160,7 @@ ReoptimizeBlock: if (PriorTBB == MBB) { SmallVector<MachineOperand, 4> NewPriorCond(PriorCond); if (!TII->ReverseBranchCondition(NewPriorCond)) { + DebugLoc dl = getBranchDebugLoc(PrevBB); TII->RemoveBranch(PrevBB); TII->InsertBranch(PrevBB, PriorFBB, 0, NewPriorCond, dl); MadeChange = true; @@ -1180,6 +1198,7 @@ ReoptimizeBlock: DEBUG(dbgs() << "\nMoving MBB: " << *MBB << "To make fallthrough to: " << *PriorTBB << "\n"); + DebugLoc dl = getBranchDebugLoc(PrevBB); TII->RemoveBranch(PrevBB); TII->InsertBranch(PrevBB, MBB, 0, NewPriorCond, dl); @@ -1209,6 +1228,7 @@ ReoptimizeBlock: if (CurTBB && CurFBB && CurFBB == MBB && CurTBB != MBB) { SmallVector<MachineOperand, 4> NewCond(CurCond); if (!TII->ReverseBranchCondition(NewCond)) { + DebugLoc dl = getBranchDebugLoc(*MBB); TII->RemoveBranch(*MBB); TII->InsertBranch(*MBB, CurFBB, CurTBB, NewCond, dl); MadeChange = true; @@ -1222,6 +1242,7 @@ ReoptimizeBlock: if (CurTBB && CurCond.empty() && CurFBB == 0 && IsBranchOnlyBlock(MBB) && CurTBB != MBB && !MBB->hasAddressTaken()) { + DebugLoc dl = getBranchDebugLoc(*MBB); // This block may contain just an unconditional branch. Because there can // be 'non-branch terminators' in the block, try removing the branch and // then seeing if the block is empty. @@ -1264,8 +1285,9 @@ ReoptimizeBlock: assert(PriorFBB == 0 && "Machine CFG out of date!"); PriorFBB = MBB; } + DebugLoc pdl = getBranchDebugLoc(PrevBB); TII->RemoveBranch(PrevBB); - TII->InsertBranch(PrevBB, PriorTBB, PriorFBB, PriorCond, dl); + TII->InsertBranch(PrevBB, PriorTBB, PriorFBB, PriorCond, pdl); } // Iterate through all the predecessors, revectoring each in-turn. 
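
Note: getBranchDebugLoc, added above, is a walk-backwards-skipping-debug-values scan over a block. The idea is sketched below with plain containers; Inst and its line-number field are stand-ins for MachineInstr and DebugLoc, not the real types:

#include <cstdio>
#include <vector>

struct Inst {
  bool IsDebugValue;
  bool IsBranch;
  int DebugLine;  // stand-in for a DebugLoc
};

static int getBranchDebugLine(const std::vector<Inst> &Block) {
  for (auto I = Block.rbegin(), E = Block.rend(); I != E; ++I) {
    if (I->IsDebugValue)
      continue;                              // debug-only entries carry no branch
    return I->IsBranch ? I->DebugLine : 0;   // 0 plays the role of DebugLoc()
  }
  return 0;                                  // empty (or all-debug) block
}

int main() {
  std::vector<Inst> Block = {
    {false, false, 10},  // ordinary instruction
    {false, true,  42},  // conditional branch with line 42
    {true,  false,  0},  // trailing debug value
  };
  std::printf("branch line: %d\n", getBranchDebugLine(Block));  // 42
  return 0;
}
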
@@ -1289,9 +1311,10 @@ ReoptimizeBlock: bool NewCurUnAnalyzable = TII->AnalyzeBranch(*PMBB, NewCurTBB, NewCurFBB, NewCurCond, true); if (!NewCurUnAnalyzable && NewCurTBB && NewCurTBB == NewCurFBB) { + DebugLoc pdl = getBranchDebugLoc(*PMBB); TII->RemoveBranch(*PMBB); NewCurCond.clear(); - TII->InsertBranch(*PMBB, NewCurTBB, 0, NewCurCond, dl); + TII->InsertBranch(*PMBB, NewCurTBB, 0, NewCurCond, pdl); MadeChange = true; ++NumBranchOpts; PMBB->CorrectExtraCFGEdges(NewCurTBB, 0, false); @@ -1351,7 +1374,7 @@ ReoptimizeBlock: if (CurFallsThru) { MachineBasicBlock *NextBB = llvm::next(MachineFunction::iterator(MBB)); CurCond.clear(); - TII->InsertBranch(*MBB, NextBB, 0, CurCond, dl); + TII->InsertBranch(*MBB, NextBB, 0, CurCond, DebugLoc()); } MBB->moveAfter(PredBB); MadeChange = true; diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt index 0362365..21729cd 100644 --- a/lib/CodeGen/CMakeLists.txt +++ b/lib/CodeGen/CMakeLists.txt @@ -80,7 +80,6 @@ add_llvm_library(LLVMCodeGen RegisterScavenging.cpp RenderMachineFunction.cpp ScheduleDAG.cpp - ScheduleDAGEmit.cpp ScheduleDAGInstrs.cpp ScheduleDAGPrinter.cpp ScoreboardHazardRecognizer.cpp diff --git a/lib/CodeGen/CriticalAntiDepBreaker.cpp b/lib/CodeGen/CriticalAntiDepBreaker.cpp index c684cdc..bad5010 100644 --- a/lib/CodeGen/CriticalAntiDepBreaker.cpp +++ b/lib/CodeGen/CriticalAntiDepBreaker.cpp @@ -35,7 +35,8 @@ CriticalAntiDepBreaker(MachineFunction& MFi, const RegisterClassInfo &RCI) : RegClassInfo(RCI), Classes(TRI->getNumRegs(), static_cast<const TargetRegisterClass *>(0)), KillIndices(TRI->getNumRegs(), 0), - DefIndices(TRI->getNumRegs(), 0) {} + DefIndices(TRI->getNumRegs(), 0), + KeepRegs(TRI->getNumRegs(), false) {} CriticalAntiDepBreaker::~CriticalAntiDepBreaker() { } @@ -52,9 +53,9 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { } // Clear "do not change" set. - KeepRegs.clear(); + KeepRegs.reset(); - bool IsReturnBlock = (!BB->empty() && BB->back().isReturn()); + bool IsReturnBlock = (BBSize != 0 && BB->back().isReturn()); // Determine the live-out physregs for this block. if (IsReturnBlock) { @@ -63,14 +64,14 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { E = MRI.liveout_end(); I != E; ++I) { unsigned Reg = *I; Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); - KillIndices[Reg] = BB->size(); + KillIndices[Reg] = BBSize; DefIndices[Reg] = ~0u; // Repeat, for all aliases. for (const uint16_t *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) { unsigned AliasReg = *Alias; Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1); - KillIndices[AliasReg] = BB->size(); + KillIndices[AliasReg] = BBSize; DefIndices[AliasReg] = ~0u; } } @@ -85,14 +86,14 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { E = (*SI)->livein_end(); I != E; ++I) { unsigned Reg = *I; Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); - KillIndices[Reg] = BB->size(); + KillIndices[Reg] = BBSize; DefIndices[Reg] = ~0u; // Repeat, for all aliases. 
for (const uint16_t *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) { unsigned AliasReg = *Alias; Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1); - KillIndices[AliasReg] = BB->size(); + KillIndices[AliasReg] = BBSize; DefIndices[AliasReg] = ~0u; } } @@ -106,14 +107,14 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { unsigned Reg = *I; if (!IsReturnBlock && !Pristine.test(Reg)) continue; Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); - KillIndices[Reg] = BB->size(); + KillIndices[Reg] = BBSize; DefIndices[Reg] = ~0u; // Repeat, for all aliases. for (const uint16_t *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) { unsigned AliasReg = *Alias; Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1); - KillIndices[AliasReg] = BB->size(); + KillIndices[AliasReg] = BBSize; DefIndices[AliasReg] = ~0u; } } @@ -121,7 +122,7 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { void CriticalAntiDepBreaker::FinishBlock() { RegRefs.clear(); - KeepRegs.clear(); + KeepRegs.reset(); } void CriticalAntiDepBreaker::Observe(MachineInstr *MI, unsigned Count, @@ -233,10 +234,11 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr *MI) { RegRefs.insert(std::make_pair(Reg, &MO)); if (MO.isUse() && Special) { - if (KeepRegs.insert(Reg)) { + if (!KeepRegs.test(Reg)) { + KeepRegs.set(Reg); for (const uint16_t *Subreg = TRI->getSubRegisters(Reg); *Subreg; ++Subreg) - KeepRegs.insert(*Subreg); + KeepRegs.set(*Subreg); } } } @@ -259,7 +261,7 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI, if (MO.clobbersPhysReg(i)) { DefIndices[i] = Count; KillIndices[i] = ~0u; - KeepRegs.erase(i); + KeepRegs.reset(i); Classes[i] = 0; RegRefs.erase(i); } @@ -276,7 +278,7 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI, assert(((KillIndices[Reg] == ~0u) != (DefIndices[Reg] == ~0u)) && "Kill and Def maps aren't consistent for Reg!"); - KeepRegs.erase(Reg); + KeepRegs.reset(Reg); Classes[Reg] = 0; RegRefs.erase(Reg); // Repeat, for all subregs. @@ -285,7 +287,7 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI, unsigned SubregReg = *Subreg; DefIndices[SubregReg] = Count; KillIndices[SubregReg] = ~0u; - KeepRegs.erase(SubregReg); + KeepRegs.reset(SubregReg); Classes[SubregReg] = 0; RegRefs.erase(SubregReg); } @@ -551,7 +553,7 @@ BreakAntiDependencies(const std::vector<SUnit>& SUnits, if (!RegClassInfo.isAllocatable(AntiDepReg)) // Don't break anti-dependencies on non-allocatable registers. AntiDepReg = 0; - else if (KeepRegs.count(AntiDepReg)) + else if (KeepRegs.test(AntiDepReg)) // Don't break anti-dependencies if an use down below requires // this exact register. AntiDepReg = 0; diff --git a/lib/CodeGen/CriticalAntiDepBreaker.h b/lib/CodeGen/CriticalAntiDepBreaker.h index 0710780..7746259 100644 --- a/lib/CodeGen/CriticalAntiDepBreaker.h +++ b/lib/CodeGen/CriticalAntiDepBreaker.h @@ -24,7 +24,6 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/ADT/BitVector.h" -#include "llvm/ADT/SmallSet.h" #include <map> namespace llvm { @@ -66,7 +65,7 @@ class TargetRegisterInfo; /// KeepRegs - A set of registers which are live and cannot be changed to /// break anti-dependencies. 
- SmallSet<unsigned, 4> KeepRegs; + BitVector KeepRegs; public: CriticalAntiDepBreaker(MachineFunction& MFi, const RegisterClassInfo&); diff --git a/lib/CodeGen/DFAPacketizer.cpp b/lib/CodeGen/DFAPacketizer.cpp index f0cf290..5ff641c 100644 --- a/lib/CodeGen/DFAPacketizer.cpp +++ b/lib/CodeGen/DFAPacketizer.cpp @@ -23,10 +23,10 @@ // //===----------------------------------------------------------------------===// -#include "ScheduleDAGInstrs.h" #include "llvm/CodeGen/DFAPacketizer.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBundle.h" +#include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/MC/MCInstrItineraries.h" using namespace llvm; @@ -103,15 +103,12 @@ void DFAPacketizer::reserveResources(llvm::MachineInstr *MI) { namespace { // DefaultVLIWScheduler - This class extends ScheduleDAGInstrs and overrides // Schedule method to build the dependence graph. -// -// ScheduleDAGInstrs has LLVM_LIBRARY_VISIBILITY so we have to reference it as -// an opaque pointer in VLIWPacketizerList. class DefaultVLIWScheduler : public ScheduleDAGInstrs { public: DefaultVLIWScheduler(MachineFunction &MF, MachineLoopInfo &MLI, MachineDominatorTree &MDT, bool IsPostRA); // Schedule - Actual scheduling work. - void Schedule(); + void schedule(); }; } // end anonymous namespace @@ -121,9 +118,9 @@ DefaultVLIWScheduler::DefaultVLIWScheduler( ScheduleDAGInstrs(MF, MLI, MDT, IsPostRA) { } -void DefaultVLIWScheduler::Schedule() { +void DefaultVLIWScheduler::schedule() { // Build the scheduling graph. - BuildSchedGraph(0); + buildSchedGraph(0); } // VLIWPacketizerList Ctor @@ -137,7 +134,7 @@ VLIWPacketizerList::VLIWPacketizerList( // VLIWPacketizerList Dtor VLIWPacketizerList::~VLIWPacketizerList() { - delete (DefaultVLIWScheduler *)SchedulerImpl; + delete SchedulerImpl; delete ResourceTracker; } @@ -184,18 +181,15 @@ void VLIWPacketizerList::endPacket(MachineBasicBlock *MBB, void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB, MachineBasicBlock::iterator BeginItr, MachineBasicBlock::iterator EndItr) { - DefaultVLIWScheduler *Scheduler = (DefaultVLIWScheduler *)SchedulerImpl; - Scheduler->Run(MBB, BeginItr, EndItr, MBB->size()); + assert(MBB->end() == EndItr && "Bad EndIndex"); - // Remember scheduling units. - SUnits = Scheduler->SUnits; + SchedulerImpl->enterRegion(MBB, BeginItr, EndItr, MBB->size()); - // Generate MI -> SU map. - std::map <MachineInstr*, SUnit*> MIToSUnit; - for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { - SUnit *SU = &SUnits[i]; - MIToSUnit[SU->getInstr()] = SU; - } + // Build the DAG without reordering instructions. + SchedulerImpl->schedule(); + + // Remember scheduling units. + SUnits = SchedulerImpl->SUnits; // The main packetizer loop. for (; BeginItr != EndItr; ++BeginItr) { @@ -211,7 +205,7 @@ void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB, continue; } - SUnit *SUI = MIToSUnit[MI]; + SUnit *SUI = SchedulerImpl->getSUnit(MI); assert(SUI && "Missing SUnit Info!"); // Ask DFA if machine resource is available for MI. @@ -221,7 +215,7 @@ void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB, for (std::vector<MachineInstr*>::iterator VI = CurrentPacketMIs.begin(), VE = CurrentPacketMIs.end(); VI != VE; ++VI) { MachineInstr *MJ = *VI; - SUnit *SUJ = MIToSUnit[MJ]; + SUnit *SUJ = SchedulerImpl->getSUnit(MJ); assert(SUJ && "Missing SUnit Info!"); // Is it legal to packetize SUI and SUJ together. 
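
Aside: the PacketizeMIs rewrite above drops the locally built MI-to-SUnit map and instead drives the scheduler through an enter-region / build / query / exit-region sequence. A rough sketch of that lifecycle with a stand-in scheduler class; none of these names are the real ScheduleDAGInstrs interface:

#include <cstdio>
#include <unordered_map>
#include <vector>

struct Node { int Instr; };

class Scheduler {
  std::unordered_map<int, Node *> InstrToNode;
public:
  std::vector<Node> Nodes;
  void enterRegion(const std::vector<int> &Instrs) {
    Nodes.clear();
    InstrToNode.clear();
    Nodes.reserve(Instrs.size());            // keep node pointers stable
    for (int I : Instrs)
      Nodes.push_back(Node{I});
    for (Node &N : Nodes)
      InstrToNode[N.Instr] = &N;             // the scheduler owns the lookup now
  }
  void buildGraph() { /* dependence edges would be added here */ }
  Node *getNode(int Instr) const {
    auto It = InstrToNode.find(Instr);
    return It == InstrToNode.end() ? nullptr : It->second;
  }
  void exitRegion() { /* per-region state would be torn down here */ }
};

int main() {
  Scheduler S;
  std::vector<int> Block = {1, 2, 3};
  S.enterRegion(Block);
  S.buildGraph();
  for (int I : Block)
    std::printf("instr %d -> node %p\n", I, (void *)S.getNode(I));
  S.exitRegion();
  return 0;
}
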
@@ -245,4 +239,6 @@ void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB, // End any packet left behind. endPacket(MBB, EndItr); + + SchedulerImpl->exitRegion(); } diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp index 17633e2..97e6547 100644 --- a/lib/CodeGen/LLVMTargetMachine.cpp +++ b/lib/CodeGen/LLVMTargetMachine.cpp @@ -90,7 +90,7 @@ static void addPassesToHandleExceptions(TargetMachine *TM, // removed from the parent invoke(s). This could happen when a landing // pad is shared by multiple invokes and is also a target of a normal // edge from elsewhere. - PM.add(createSjLjEHPass(TM->getTargetLowering())); + PM.add(createSjLjEHPreparePass(TM->getTargetLowering())); // FALLTHROUGH case ExceptionHandling::DwarfCFI: case ExceptionHandling::ARM: diff --git a/lib/CodeGen/LatencyPriorityQueue.cpp b/lib/CodeGen/LatencyPriorityQueue.cpp index 0578229..deab05a 100644 --- a/lib/CodeGen/LatencyPriorityQueue.cpp +++ b/lib/CodeGen/LatencyPriorityQueue.cpp @@ -84,11 +84,11 @@ void LatencyPriorityQueue::push(SUnit *SU) { } -// ScheduledNode - As nodes are scheduled, we look to see if there are any +// scheduledNode - As nodes are scheduled, we look to see if there are any // successor nodes that have a single unscheduled predecessor. If so, that // single predecessor has a higher priority, since scheduling it will make // the node available. -void LatencyPriorityQueue::ScheduledNode(SUnit *SU) { +void LatencyPriorityQueue::scheduledNode(SUnit *SU) { for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); I != E; ++I) { AdjustPriorityOfUnscheduledPreds(I->getSUnit()); diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp index c35302a..2187833 100644 --- a/lib/CodeGen/LiveDebugVariables.cpp +++ b/lib/CodeGen/LiveDebugVariables.cpp @@ -226,7 +226,7 @@ public: LiveInterval *LI, const VNInfo *VNI, SmallVectorImpl<SlotIndex> *Kills, LiveIntervals &LIS, MachineDominatorTree &MDT, - UserValueScopes &UVS); + UserValueScopes &UVS); /// addDefsFromCopies - The value in LI/LocNo may be copies to other /// registers. Determine if any of the copies are available at the kill @@ -486,7 +486,7 @@ void UserValue::extendDef(SlotIndex Idx, unsigned LocNo, LiveInterval *LI, const VNInfo *VNI, SmallVectorImpl<SlotIndex> *Kills, LiveIntervals &LIS, MachineDominatorTree &MDT, - UserValueScopes &UVS) { + UserValueScopes &UVS) { SmallVector<SlotIndex, 16> Todo; Todo.push_back(Idx); do { @@ -620,7 +620,7 @@ void UserValue::computeIntervals(MachineRegisterInfo &MRI, LiveIntervals &LIS, MachineDominatorTree &MDT, - UserValueScopes &UVS) { + UserValueScopes &UVS) { SmallVector<std::pair<SlotIndex, unsigned>, 16> Defs; // Collect all defs to be extended (Skipping undefs). @@ -841,7 +841,7 @@ bool UserValue::splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs) { bool DidChange = false; // Split locations referring to OldReg. Iterate backwards so splitLocation can - // safely erase unuused locations. + // safely erase unused locations. 
for (unsigned i = locations.size(); i ; --i) { unsigned LocNo = i-1; const MachineOperand *Loc = &locations[LocNo]; diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp index 70ed1c3..3ade660 100644 --- a/lib/CodeGen/LiveIntervalAnalysis.cpp +++ b/lib/CodeGen/LiveIntervalAnalysis.cpp @@ -1049,9 +1049,19 @@ public: bool hasRegMaskOp = false; collectRanges(MI, Entering, Internal, Exiting, hasRegMaskOp, OldIdx); - moveAllEnteringFrom(OldIdx, Entering); - moveAllInternalFrom(OldIdx, Internal); - moveAllExitingFrom(OldIdx, Exiting); + // To keep the LiveRanges valid within an interval, move the ranges closest + // to the destination first. This prevents ranges from overlapping, to that + // APIs like removeRange still work. + if (NewIdx < OldIdx) { + moveAllEnteringFrom(OldIdx, Entering); + moveAllInternalFrom(OldIdx, Internal); + moveAllExitingFrom(OldIdx, Exiting); + } + else { + moveAllExitingFrom(OldIdx, Exiting); + moveAllInternalFrom(OldIdx, Internal); + moveAllEnteringFrom(OldIdx, Entering); + } if (hasRegMaskOp) updateRegMaskSlots(OldIdx); @@ -1319,8 +1329,14 @@ private: void moveEnteringDownFrom(SlotIndex OldIdx, IntRangePair& P) { LiveInterval* LI = P.first; LiveRange* LR = P.second; + // Extend the LiveRange if NewIdx is past the end. if (NewIdx > LR->end) { - moveKillFlags(LI->reg, LR->end, NewIdx); + // Move kill flags if OldIdx was not originally the end + // (otherwise LR->end points to an invalid slot). + if (LR->end.getRegSlot() != OldIdx.getRegSlot()) { + assert(LR->end > OldIdx && "LiveRange does not cover original slot"); + moveKillFlags(LI->reg, LR->end, NewIdx); + } LR->end = NewIdx.getRegSlot(); } } diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp index 9c3d255..48e1e4c 100644 --- a/lib/CodeGen/LiveVariables.cpp +++ b/lib/CodeGen/LiveVariables.cpp @@ -109,6 +109,7 @@ void LiveVariables::MarkVirtRegAliveInBlock(VarInfo& VRInfo, // Mark the variable known alive in this bb VRInfo.AliveBlocks.set(BBNum); + assert(MBB != &MF->front() && "Can't find reaching def for virtreg"); WorkList.insert(WorkList.end(), MBB->pred_rbegin(), MBB->pred_rend()); } diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index 611b045..ca8a8e8 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -238,6 +238,18 @@ StringRef MachineBasicBlock::getName() const { return "(null)"; } +/// Return a hopefully unique identifier for this block. +std::string MachineBasicBlock::getFullName() const { + std::string Name; + if (getParent()) + Name = (getParent()->getFunction()->getName() + ":").str(); + if (getBasicBlock()) + Name += getBasicBlock()->getName(); + else + Name += (Twine("BB") + Twine(getNumber())).str(); + return Name; +} + void MachineBasicBlock::print(raw_ostream &OS, SlotIndexes *Indexes) const { const MachineFunction *MF = getParent(); if (!MF) { diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp index e9f9475..43af1ad 100644 --- a/lib/CodeGen/MachineInstr.cpp +++ b/lib/CodeGen/MachineInstr.cpp @@ -40,6 +40,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/ADT/FoldingSet.h" +#include "llvm/ADT/Hashing.h" using namespace llvm; //===----------------------------------------------------------------------===// @@ -481,7 +482,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const MachineMemOperand &MMO) { /// MCID NULL and no operands. 
MachineInstr::MachineInstr() : MCID(0), Flags(0), AsmPrinterFlags(0), - MemRefs(0), MemRefsEnd(0), + NumMemRefs(0), MemRefs(0), Parent(0) { // Make sure that we get added to a machine basicblock LeakDetector::addGarbageObject(this); @@ -489,10 +490,10 @@ MachineInstr::MachineInstr() void MachineInstr::addImplicitDefUseOperands() { if (MCID->ImplicitDefs) - for (const unsigned *ImpDefs = MCID->ImplicitDefs; *ImpDefs; ++ImpDefs) + for (const uint16_t *ImpDefs = MCID->getImplicitDefs(); *ImpDefs; ++ImpDefs) addOperand(MachineOperand::CreateReg(*ImpDefs, true, true)); if (MCID->ImplicitUses) - for (const unsigned *ImpUses = MCID->ImplicitUses; *ImpUses; ++ImpUses) + for (const uint16_t *ImpUses = MCID->getImplicitUses(); *ImpUses; ++ImpUses) addOperand(MachineOperand::CreateReg(*ImpUses, false, true)); } @@ -501,7 +502,7 @@ void MachineInstr::addImplicitDefUseOperands() { /// the MCInstrDesc. MachineInstr::MachineInstr(const MCInstrDesc &tid, bool NoImp) : MCID(&tid), Flags(0), AsmPrinterFlags(0), - MemRefs(0), MemRefsEnd(0), Parent(0) { + NumMemRefs(0), MemRefs(0), Parent(0) { unsigned NumImplicitOps = 0; if (!NoImp) NumImplicitOps = MCID->getNumImplicitDefs() + MCID->getNumImplicitUses(); @@ -516,7 +517,7 @@ MachineInstr::MachineInstr(const MCInstrDesc &tid, bool NoImp) MachineInstr::MachineInstr(const MCInstrDesc &tid, const DebugLoc dl, bool NoImp) : MCID(&tid), Flags(0), AsmPrinterFlags(0), - MemRefs(0), MemRefsEnd(0), Parent(0), debugLoc(dl) { + NumMemRefs(0), MemRefs(0), Parent(0), debugLoc(dl) { unsigned NumImplicitOps = 0; if (!NoImp) NumImplicitOps = MCID->getNumImplicitDefs() + MCID->getNumImplicitUses(); @@ -532,7 +533,7 @@ MachineInstr::MachineInstr(const MCInstrDesc &tid, const DebugLoc dl, /// basic block. MachineInstr::MachineInstr(MachineBasicBlock *MBB, const MCInstrDesc &tid) : MCID(&tid), Flags(0), AsmPrinterFlags(0), - MemRefs(0), MemRefsEnd(0), Parent(0) { + NumMemRefs(0), MemRefs(0), Parent(0) { assert(MBB && "Cannot use inserting ctor with null basic block!"); unsigned NumImplicitOps = MCID->getNumImplicitDefs() + MCID->getNumImplicitUses(); @@ -548,7 +549,7 @@ MachineInstr::MachineInstr(MachineBasicBlock *MBB, const MCInstrDesc &tid) MachineInstr::MachineInstr(MachineBasicBlock *MBB, const DebugLoc dl, const MCInstrDesc &tid) : MCID(&tid), Flags(0), AsmPrinterFlags(0), - MemRefs(0), MemRefsEnd(0), Parent(0), debugLoc(dl) { + NumMemRefs(0), MemRefs(0), Parent(0), debugLoc(dl) { assert(MBB && "Cannot use inserting ctor with null basic block!"); unsigned NumImplicitOps = MCID->getNumImplicitDefs() + MCID->getNumImplicitUses(); @@ -563,7 +564,7 @@ MachineInstr::MachineInstr(MachineBasicBlock *MBB, const DebugLoc dl, /// MachineInstr::MachineInstr(MachineFunction &MF, const MachineInstr &MI) : MCID(&MI.getDesc()), Flags(0), AsmPrinterFlags(0), - MemRefs(MI.MemRefs), MemRefsEnd(MI.MemRefsEnd), + NumMemRefs(MI.NumMemRefs), MemRefs(MI.MemRefs), Parent(0), debugLoc(MI.getDebugLoc()) { Operands.reserve(MI.getNumOperands()); @@ -738,28 +739,23 @@ void MachineInstr::RemoveOperand(unsigned OpNo) { void MachineInstr::addMemOperand(MachineFunction &MF, MachineMemOperand *MO) { mmo_iterator OldMemRefs = MemRefs; - mmo_iterator OldMemRefsEnd = MemRefsEnd; + uint16_t OldNumMemRefs = NumMemRefs; - size_t NewNum = (MemRefsEnd - MemRefs) + 1; + uint16_t NewNum = NumMemRefs + 1; mmo_iterator NewMemRefs = MF.allocateMemRefsArray(NewNum); - mmo_iterator NewMemRefsEnd = NewMemRefs + NewNum; - std::copy(OldMemRefs, OldMemRefsEnd, NewMemRefs); + std::copy(OldMemRefs, OldMemRefs + OldNumMemRefs, 
NewMemRefs); NewMemRefs[NewNum - 1] = MO; MemRefs = NewMemRefs; - MemRefsEnd = NewMemRefsEnd; + NumMemRefs = NewNum; } -bool -MachineInstr::hasProperty(unsigned MCFlag, QueryType Type) const { - if (Type == IgnoreBundle || !isBundle()) - return getDesc().getFlags() & (1 << MCFlag); - +bool MachineInstr::hasPropertyInBundle(unsigned Mask, QueryType Type) const { const MachineBasicBlock *MBB = getParent(); MachineBasicBlock::const_instr_iterator MII = *this; ++MII; while (MII != MBB->end() && MII->isInsideBundle()) { - if (MII->getDesc().getFlags() & (1 << MCFlag)) { + if (MII->getDesc().getFlags() & Mask) { if (Type == AnyInBundle) return true; } else { @@ -1843,49 +1839,55 @@ void MachineInstr::setPhysRegsDeadExcept(ArrayRef<unsigned> UsedRegs, unsigned MachineInstrExpressionTrait::getHashValue(const MachineInstr* const &MI) { - unsigned Hash = MI->getOpcode() * 37; + // Build up a buffer of hash code components. + // + // FIXME: This is a total hack. We should have a hash_value overload for + // MachineOperand, but currently that doesn't work because there are many + // different ideas of "equality" and thus different sets of information that + // contribute to the hash code. This one happens to want to take a specific + // subset. And it's still not clear that this routine uses the *correct* + // subset of information when computing the hash code. The goal is to use the + // same inputs for the hash code here that MachineInstr::isIdenticalTo uses to + // test for equality when passed the 'IgnoreVRegDefs' filter flag. It would + // be very useful to factor the selection of relevant inputs out of the two + // functions and into a common routine, but it's not clear how that can be + // done. + SmallVector<size_t, 8> HashComponents; + HashComponents.reserve(MI->getNumOperands() + 1); + HashComponents.push_back(MI->getOpcode()); for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI->getOperand(i); - uint64_t Key = (uint64_t)MO.getType() << 32; switch (MO.getType()) { default: break; case MachineOperand::MO_Register: if (MO.isDef() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) continue; // Skip virtual register defs. 
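
The addMemOperand change above replaces the old (MemRefs, MemRefsEnd) iterator pair with a pointer plus a 16-bit NumMemRefs count, growing the array by allocating a fresh one on each append. A minimal standalone sketch of that pointer-plus-count append pattern follows; it uses plain new[] where LLVM uses the MachineFunction allocator, so the ownership handling here is purely illustrative.

    #include <algorithm>
    #include <cstdint>
    #include <iostream>

    struct MemRef { int id; };                 // stand-in for MachineMemOperand

    struct Instr {
      MemRef **MemRefs = nullptr;              // array of operand pointers
      uint16_t NumMemRefs = 0;                 // count replaces the old end pointer

      void addMemOperand(MemRef *MO) {
        uint16_t NewNum = NumMemRefs + 1;
        MemRef **NewArray = new MemRef *[NewNum];          // new, larger array
        std::copy(MemRefs, MemRefs + NumMemRefs, NewArray);
        NewArray[NewNum - 1] = MO;                         // append the new operand
        delete[] MemRefs;                                  // LLVM's pool allocator skips this
        MemRefs = NewArray;
        NumMemRefs = NewNum;
      }
      ~Instr() { delete[] MemRefs; }
    };

    int main() {
      MemRef A{1}, B{2};
      Instr MI;
      MI.addMemOperand(&A);
      MI.addMemOperand(&B);
      std::cout << "memrefs: " << MI.NumMemRefs << "\n";   // prints 2
    }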
- Key |= MO.getReg(); + HashComponents.push_back(hash_combine(MO.getType(), MO.getReg())); break; case MachineOperand::MO_Immediate: - Key |= MO.getImm(); + HashComponents.push_back(hash_combine(MO.getType(), MO.getImm())); break; case MachineOperand::MO_FrameIndex: case MachineOperand::MO_ConstantPoolIndex: case MachineOperand::MO_JumpTableIndex: - Key |= MO.getIndex(); + HashComponents.push_back(hash_combine(MO.getType(), MO.getIndex())); break; case MachineOperand::MO_MachineBasicBlock: - Key |= DenseMapInfo<void*>::getHashValue(MO.getMBB()); + HashComponents.push_back(hash_combine(MO.getType(), MO.getMBB())); break; case MachineOperand::MO_GlobalAddress: - Key |= DenseMapInfo<void*>::getHashValue(MO.getGlobal()); + HashComponents.push_back(hash_combine(MO.getType(), MO.getGlobal())); break; case MachineOperand::MO_BlockAddress: - Key |= DenseMapInfo<void*>::getHashValue(MO.getBlockAddress()); + HashComponents.push_back(hash_combine(MO.getType(), + MO.getBlockAddress())); break; case MachineOperand::MO_MCSymbol: - Key |= DenseMapInfo<void*>::getHashValue(MO.getMCSymbol()); + HashComponents.push_back(hash_combine(MO.getType(), MO.getMCSymbol())); break; } - Key += ~(Key << 32); - Key ^= (Key >> 22); - Key += ~(Key << 13); - Key ^= (Key >> 8); - Key += (Key << 3); - Key ^= (Key >> 15); - Key += ~(Key << 27); - Key ^= (Key >> 31); - Hash = (unsigned)Key + Hash * 37; - } - return Hash; + } + return hash_combine_range(HashComponents.begin(), HashComponents.end()); } void MachineInstr::emitError(StringRef Msg) const { diff --git a/lib/CodeGen/MachineInstrBundle.cpp b/lib/CodeGen/MachineInstrBundle.cpp index d1f2df9..73489a7 100644 --- a/lib/CodeGen/MachineInstrBundle.cpp +++ b/lib/CodeGen/MachineInstrBundle.cpp @@ -229,6 +229,8 @@ bool llvm::finalizeBundles(MachineFunction &MF) { "First instr cannot be inside bundle before finalization!"); MachineBasicBlock::instr_iterator MIE = MBB.instr_end(); + if (MII == MIE) + continue; for (++MII; MII != MIE; ) { if (!MII->isInsideBundle()) ++MII; diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp index 7d40e66..f140dec 100644 --- a/lib/CodeGen/MachineRegisterInfo.cpp +++ b/lib/CodeGen/MachineRegisterInfo.cpp @@ -161,9 +161,8 @@ void MachineRegisterInfo::replaceRegWith(unsigned FromReg, unsigned ToReg) { /// form, so there should only be one definition. MachineInstr *MachineRegisterInfo::getVRegDef(unsigned Reg) const { // Since we are in SSA form, we can use the first definition. - if (!def_empty(Reg)) - return &*def_begin(Reg); - return 0; + def_iterator I = def_begin(Reg); + return !I.atEnd() ? 
&*I : 0; } bool MachineRegisterInfo::hasOneUse(unsigned RegNo) const { diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index 8a485e0..364a244 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -14,10 +14,10 @@ #define DEBUG_TYPE "misched" -#include "ScheduleDAGInstrs.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/CodeGen/MachinePassRegistry.h" +#include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Support/CommandLine.h" @@ -25,25 +25,36 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/ADT/OwningPtr.h" +#include "llvm/ADT/PriorityQueue.h" #include <queue> using namespace llvm; +static cl::opt<bool> ForceTopDown("misched-topdown", cl::Hidden, + cl::desc("Force top-down list scheduling")); +static cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden, + cl::desc("Force bottom-up list scheduling")); + +#ifndef NDEBUG +static cl::opt<bool> ViewMISchedDAGs("view-misched-dags", cl::Hidden, + cl::desc("Pop up a window to show MISched dags after they are processed")); + +static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden, + cl::desc("Stop scheduling after N instructions"), cl::init(~0U)); +#else +static bool ViewMISchedDAGs = false; +#endif // NDEBUG + //===----------------------------------------------------------------------===// // Machine Instruction Scheduling Pass and Registry //===----------------------------------------------------------------------===// namespace { /// MachineScheduler runs after coalescing and before register allocation. -class MachineScheduler : public MachineFunctionPass { +class MachineScheduler : public MachineSchedContext, + public MachineFunctionPass { public: - MachineFunction *MF; - const TargetInstrInfo *TII; - const MachineLoopInfo *MLI; - const MachineDominatorTree *MDT; - LiveIntervals *LIS; - MachineScheduler(); virtual void getAnalysisUsage(AnalysisUsage &AU) const; @@ -71,7 +82,7 @@ INITIALIZE_PASS_END(MachineScheduler, "misched", "Machine Instruction Scheduler", false, false) MachineScheduler::MachineScheduler() -: MachineFunctionPass(ID), MF(0), MLI(0), MDT(0) { +: MachineFunctionPass(ID) { initializeMachineSchedulerPass(*PassRegistry::getPassRegistry()); } @@ -80,7 +91,7 @@ void MachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequiredID(MachineDominatorsID); AU.addRequired<MachineLoopInfo>(); AU.addRequired<AliasAnalysis>(); - AU.addPreserved<AliasAnalysis>(); + AU.addRequired<TargetPassConfig>(); AU.addRequired<SlotIndexes>(); AU.addPreserved<SlotIndexes>(); AU.addRequired<LiveIntervals>(); @@ -88,91 +99,226 @@ void MachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } -namespace { -/// MachineSchedRegistry provides a selection of available machine instruction -/// schedulers. -class MachineSchedRegistry : public MachinePassRegistryNode { -public: - typedef ScheduleDAGInstrs *(*ScheduleDAGCtor)(MachineScheduler *); +MachinePassRegistry MachineSchedRegistry::Registry; - // RegisterPassParser requires a (misnamed) FunctionPassCtor type. - typedef ScheduleDAGCtor FunctionPassCtor; +/// A dummy default scheduler factory indicates whether the scheduler +/// is overridden on the command line. 
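
The new ForceTopDown/ForceBottomUp options and the NDEBUG-guarded ViewMISchedDAGs/MISchedCutoff options above follow a common pattern: in release builds the debug-only flag collapses to a constant so the code that tests it still compiles and folds away. A stripped-down sketch of that pattern without LLVM's cl::opt machinery; the names and the checkSchedLimit helper are illustrative only.

    #include <climits>
    #include <iostream>

    #ifndef NDEBUG
    // Debug builds: values can be changed, e.g. parsed from the command line.
    static bool ViewDAGs = false;
    static unsigned SchedCutoff = UINT_MAX;   // ~0U means "no cutoff"
    #else
    // Release builds: same names, but constants the optimizer can fold away.
    static const bool ViewDAGs = false;
    static const unsigned SchedCutoff = UINT_MAX;
    #endif

    static bool checkSchedLimit(unsigned &NumScheduled) {
      if (NumScheduled == SchedCutoff)        // dead code when the cutoff is ~0U
        return false;
      ++NumScheduled;
      return true;
    }

    int main() {
      unsigned N = 0;
      while (N < 5 && checkSchedLimit(N)) {}
      std::cout << "scheduled " << N << (ViewDAGs ? " (viewing)" : "") << "\n";
    }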
+static ScheduleDAGInstrs *useDefaultMachineSched(MachineSchedContext *C) { + return 0; +} - static MachinePassRegistry Registry; +/// MachineSchedOpt allows command line selection of the scheduler. +static cl::opt<MachineSchedRegistry::ScheduleDAGCtor, false, + RegisterPassParser<MachineSchedRegistry> > +MachineSchedOpt("misched", + cl::init(&useDefaultMachineSched), cl::Hidden, + cl::desc("Machine instruction scheduler to use")); - MachineSchedRegistry(const char *N, const char *D, ScheduleDAGCtor C) - : MachinePassRegistryNode(N, D, (MachinePassCtor)C) { - Registry.Add(this); - } - ~MachineSchedRegistry() { Registry.Remove(this); } +static MachineSchedRegistry +DefaultSchedRegistry("default", "Use the target's default scheduler choice.", + useDefaultMachineSched); + +/// Forward declare the standard machine scheduler. This will be used as the +/// default scheduler if the target does not set a default. +static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C); + +/// Top-level MachineScheduler pass driver. +/// +/// Visit blocks in function order. Divide each block into scheduling regions +/// and visit them bottom-up. Visiting regions bottom-up is not required, but is +/// consistent with the DAG builder, which traverses the interior of the +/// scheduling regions bottom-up. +/// +/// This design avoids exposing scheduling boundaries to the DAG builder, +/// simplifying the DAG builder's support for "special" target instructions. +/// At the same time the design allows target schedulers to operate across +/// scheduling boundaries, for example to bundle the boudary instructions +/// without reordering them. This creates complexity, because the target +/// scheduler must update the RegionBegin and RegionEnd positions cached by +/// ScheduleDAGInstrs whenever adding or removing instructions. A much simpler +/// design would be to split blocks at scheduling boundaries, but LLVM has a +/// general bias against block splitting purely for implementation simplicity. +bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) { + // Initialize the context of the pass. + MF = &mf; + MLI = &getAnalysis<MachineLoopInfo>(); + MDT = &getAnalysis<MachineDominatorTree>(); + PassConfig = &getAnalysis<TargetPassConfig>(); + AA = &getAnalysis<AliasAnalysis>(); - // Accessors. - // - MachineSchedRegistry *getNext() const { - return (MachineSchedRegistry *)MachinePassRegistryNode::getNext(); - } - static MachineSchedRegistry *getList() { - return (MachineSchedRegistry *)Registry.getList(); - } - static ScheduleDAGCtor getDefault() { - return (ScheduleDAGCtor)Registry.getDefault(); - } - static void setDefault(ScheduleDAGCtor C) { - Registry.setDefault((MachinePassCtor)C); + LIS = &getAnalysis<LiveIntervals>(); + const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); + + // Select the scheduler, or set the default. + MachineSchedRegistry::ScheduleDAGCtor Ctor = MachineSchedOpt; + if (Ctor == useDefaultMachineSched) { + // Get the default scheduler set by the target. + Ctor = MachineSchedRegistry::getDefault(); + if (!Ctor) { + Ctor = createConvergingSched; + MachineSchedRegistry::setDefault(Ctor); + } } - static void setListener(MachinePassRegistryListener *L) { - Registry.setListener(L); + // Instantiate the selected scheduler. + OwningPtr<ScheduleDAGInstrs> Scheduler(Ctor(this)); + + // Visit all machine basic blocks. 
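
The scheduler selection above uses a dummy factory, useDefaultMachineSched, as the command-line default: if the option still points at the dummy, the pass falls back to the target's registered default and finally to the built-in converging scheduler. A standalone sketch of that sentinel-factory pattern; the class and function names below are illustrative, not LLVM's.

    #include <iostream>
    #include <memory>

    struct Scheduler {
      virtual ~Scheduler() = default;
      virtual const char *name() const = 0;
    };
    struct Converging : Scheduler { const char *name() const override { return "converging"; } };
    struct Shuffler   : Scheduler { const char *name() const override { return "shuffle"; } };

    using Factory = Scheduler *(*)();

    // Sentinel meaning "no explicit choice was made on the command line".
    static Scheduler *useDefault() { return nullptr; }

    static Scheduler *createConverging() { return new Converging(); }
    static Scheduler *createShuffler()   { return new Shuffler(); }

    // Default a target could have registered; none is installed here.
    static Factory RegistryDefault = nullptr;

    std::unique_ptr<Scheduler> select(Factory FromCommandLine) {
      Factory Ctor = FromCommandLine;
      if (Ctor == useDefault) {            // option was left at the sentinel
        Ctor = RegistryDefault;            // fall back to the registered default
        if (!Ctor)
          Ctor = createConverging;         // finally, the built-in standard scheduler
      }
      return std::unique_ptr<Scheduler>(Ctor());
    }

    int main() {
      std::cout << select(useDefault)->name() << "\n";      // converging
      std::cout << select(createShuffler)->name() << "\n";  // shuffle
    }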
+ for (MachineFunction::iterator MBB = MF->begin(), MBBEnd = MF->end(); + MBB != MBBEnd; ++MBB) { + + Scheduler->startBlock(MBB); + + // Break the block into scheduling regions [I, RegionEnd), and schedule each + // region as soon as it is discovered. RegionEnd points the the scheduling + // boundary at the bottom of the region. The DAG does not include RegionEnd, + // but the region does (i.e. the next RegionEnd is above the previous + // RegionBegin). If the current block has no terminator then RegionEnd == + // MBB->end() for the bottom region. + // + // The Scheduler may insert instructions during either schedule() or + // exitRegion(), even for empty regions. So the local iterators 'I' and + // 'RegionEnd' are invalid across these calls. + unsigned RemainingCount = MBB->size(); + for(MachineBasicBlock::iterator RegionEnd = MBB->end(); + RegionEnd != MBB->begin(); RegionEnd = Scheduler->begin()) { + // Avoid decrementing RegionEnd for blocks with no terminator. + if (RegionEnd != MBB->end() + || TII->isSchedulingBoundary(llvm::prior(RegionEnd), MBB, *MF)) { + --RegionEnd; + // Count the boundary instruction. + --RemainingCount; + } + + // The next region starts above the previous region. Look backward in the + // instruction stream until we find the nearest boundary. + MachineBasicBlock::iterator I = RegionEnd; + for(;I != MBB->begin(); --I, --RemainingCount) { + if (TII->isSchedulingBoundary(llvm::prior(I), MBB, *MF)) + break; + } + // Notify the scheduler of the region, even if we may skip scheduling + // it. Perhaps it still needs to be bundled. + Scheduler->enterRegion(MBB, I, RegionEnd, RemainingCount); + + // Skip empty scheduling regions (0 or 1 schedulable instructions). + if (I == RegionEnd || I == llvm::prior(RegionEnd)) { + // Close the current region. Bundle the terminator if needed. + // This invalidates 'RegionEnd' and 'I'. + Scheduler->exitRegion(); + continue; + } + DEBUG(dbgs() << "MachineScheduling " << MF->getFunction()->getName() + << ":BB#" << MBB->getNumber() << "\n From: " << *I << " To: "; + if (RegionEnd != MBB->end()) dbgs() << *RegionEnd; + else dbgs() << "End"; + dbgs() << " Remaining: " << RemainingCount << "\n"); + + // Schedule a region: possibly reorder instructions. + // This invalidates 'RegionEnd' and 'I'. + Scheduler->schedule(); + + // Close the current region. + Scheduler->exitRegion(); + + // Scheduling has invalidated the current iterator 'I'. Ask the + // scheduler for the top of it's scheduled region. + RegionEnd = Scheduler->begin(); + } + assert(RemainingCount == 0 && "Instruction count mismatch!"); + Scheduler->finishBlock(); } -}; -} // namespace + DEBUG(LIS->print(dbgs())); + return true; +} -MachinePassRegistry MachineSchedRegistry::Registry; +void MachineScheduler::print(raw_ostream &O, const Module* m) const { + // unimplemented +} -static ScheduleDAGInstrs *createDefaultMachineSched(MachineScheduler *P); +//===----------------------------------------------------------------------===// +// MachineSchedStrategy - Interface to a machine scheduling algorithm. +//===----------------------------------------------------------------------===// -/// MachineSchedOpt allows command line selection of the scheduler. 
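
The loop above carves each block into scheduling regions bottom-up: RegionEnd starts at the block end, a boundary instruction (or the block end itself, for blocks without a terminator) closes a region, and the region body extends upward until the next boundary. The standalone sketch below reproduces that walk over a plain character sequence, with isBoundary standing in for TargetInstrInfo::isSchedulingBoundary and the LiveIntervals/RemainingCount bookkeeping omitted.

    #include <cstddef>
    #include <iostream>
    #include <string>

    // 'T' marks a scheduling boundary (terminator, label, ...), '.' is schedulable.
    static bool isBoundary(char c) { return c == 'T'; }

    int main() {
      std::string block = "..T...T.";       // one basic block; the bottom is the right end

      // Walk regions bottom-up: RegionEnd is one past the last instruction of the
      // region, and boundary instructions belong to no region.
      std::size_t RegionEnd = block.size();
      while (RegionEnd != 0) {
        // A boundary closes the region; skip over it, except at the very bottom
        // of a block whose last instruction is not a boundary.
        if (RegionEnd != block.size() || isBoundary(block[RegionEnd - 1]))
          --RegionEnd;
        // Extend the region upward until the next boundary.
        std::size_t I = RegionEnd;
        while (I != 0 && !isBoundary(block[I - 1]))
          --I;
        std::cout << "region [" << I << ", " << RegionEnd << ")\n";
        RegionEnd = I;                      // the next region ends where this one began
      }
      // Prints: region [7, 8), region [3, 6), region [0, 2)
    }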
-static cl::opt<MachineSchedRegistry::ScheduleDAGCtor, false, - RegisterPassParser<MachineSchedRegistry> > -MachineSchedOpt("misched", - cl::init(&createDefaultMachineSched), cl::Hidden, - cl::desc("Machine instruction scheduler to use")); +namespace { +class ScheduleDAGMI; + +/// MachineSchedStrategy - Interface used by ScheduleDAGMI to drive the selected +/// scheduling algorithm. +/// +/// If this works well and targets wish to reuse ScheduleDAGMI, we may expose it +/// in ScheduleDAGInstrs.h +class MachineSchedStrategy { +public: + virtual ~MachineSchedStrategy() {} + + /// Initialize the strategy after building the DAG for a new region. + virtual void initialize(ScheduleDAGMI *DAG) = 0; + + /// Pick the next node to schedule, or return NULL. Set IsTopNode to true to + /// schedule the node at the top of the unscheduled region. Otherwise it will + /// be scheduled at the bottom. + virtual SUnit *pickNode(bool &IsTopNode) = 0; + + /// When all predecessor dependencies have been resolved, free this node for + /// top-down scheduling. + virtual void releaseTopNode(SUnit *SU) = 0; + /// When all successor dependencies have been resolved, free this node for + /// bottom-up scheduling. + virtual void releaseBottomNode(SUnit *SU) = 0; +}; +} // namespace //===----------------------------------------------------------------------===// -// Machine Instruction Scheduling Common Implementation +// ScheduleDAGMI - Base class for MachineInstr scheduling with LiveIntervals +// preservation. //===----------------------------------------------------------------------===// namespace { -/// ScheduleTopDownLive is an implementation of ScheduleDAGInstrs that schedules +/// ScheduleDAGMI is an implementation of ScheduleDAGInstrs that schedules /// machine instructions while updating LiveIntervals. -class ScheduleTopDownLive : public ScheduleDAGInstrs { -protected: - MachineScheduler *Pass; +class ScheduleDAGMI : public ScheduleDAGInstrs { + AliasAnalysis *AA; + MachineSchedStrategy *SchedImpl; + + /// The top of the unscheduled zone. + MachineBasicBlock::iterator CurrentTop; + + /// The bottom of the unscheduled zone. + MachineBasicBlock::iterator CurrentBottom; + + /// The number of instructions scheduled so far. Used to cut off the + /// scheduler at the point determined by misched-cutoff. + unsigned NumInstrsScheduled; public: - ScheduleTopDownLive(MachineScheduler *P): - ScheduleDAGInstrs(*P->MF, *P->MLI, *P->MDT, /*IsPostRA=*/false, P->LIS), - Pass(P) {} + ScheduleDAGMI(MachineSchedContext *C, MachineSchedStrategy *S): + ScheduleDAGInstrs(*C->MF, *C->MLI, *C->MDT, /*IsPostRA=*/false, C->LIS), + AA(C->AA), SchedImpl(S), CurrentTop(), CurrentBottom(), + NumInstrsScheduled(0) {} - /// ScheduleDAGInstrs callback. - void Schedule(); + ~ScheduleDAGMI() { + delete SchedImpl; + } - /// Interface implemented by the selected top-down liveinterval scheduler. - /// - /// Pick the next node to schedule, or return NULL. - virtual SUnit *pickNode() = 0; + MachineBasicBlock::iterator top() const { return CurrentTop; } + MachineBasicBlock::iterator bottom() const { return CurrentBottom; } - /// When all preceeding dependencies have been resolved, free this node for - /// scheduling. - virtual void releaseNode(SUnit *SU) = 0; + /// Implement ScheduleDAGInstrs interface. 
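
MachineSchedStrategy above reduces a scheduling policy to a handful of hooks: initialize, pickNode with a top-or-bottom flag, releaseTopNode and releaseBottomNode, so ScheduleDAGMI can drive any policy the same way. A toy version of that separation is sketched here, with a trivial FIFO policy standing in for a real heuristic; the Node struct and runScheduler driver are invented for the example.

    #include <deque>
    #include <iostream>
    #include <vector>

    struct Node { int id; };

    // The strategy only decides which node goes next and from which end.
    struct SchedStrategy {
      virtual ~SchedStrategy() = default;
      virtual Node *pickNode(bool &IsTopNode) = 0;
      virtual void releaseTopNode(Node *N) = 0;
      virtual void releaseBottomNode(Node *N) = 0;
    };

    // Trivial policy: always schedule top-down, in release order.
    struct FifoStrategy : SchedStrategy {
      std::deque<Node *> Ready;
      Node *pickNode(bool &IsTopNode) override {
        if (Ready.empty()) return nullptr;
        IsTopNode = true;
        Node *N = Ready.front();
        Ready.pop_front();
        return N;
      }
      void releaseTopNode(Node *N) override { Ready.push_back(N); }
      void releaseBottomNode(Node *) override {}      // ignored by this policy
    };

    // The driver owns instruction movement; the strategy owns the choices.
    void runScheduler(std::vector<Node> &Nodes, SchedStrategy &S) {
      for (Node &N : Nodes)
        S.releaseTopNode(&N);                         // pretend every node is a DAG root
      bool IsTop = false;
      while (Node *N = S.pickNode(IsTop))
        std::cout << (IsTop ? "top " : "bot ") << N->id << "\n";
    }

    int main() {
      std::vector<Node> Nodes = {{0}, {1}, {2}};
      FifoStrategy S;
      runScheduler(Nodes, S);
    }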
+ void schedule(); protected: + void moveInstruction(MachineInstr *MI, MachineBasicBlock::iterator InsertPos); + bool checkSchedLimit(); + void releaseSucc(SUnit *SU, SDep *SuccEdge); void releaseSuccessors(SUnit *SU); + void releasePred(SUnit *SU, SDep *PredEdge); + void releasePredecessors(SUnit *SU); }; } // namespace /// ReleaseSucc - Decrement the NumPredsLeft count of a successor. When /// NumPredsLeft reaches zero, release the successor node. -void ScheduleTopDownLive::releaseSucc(SUnit *SU, SDep *SuccEdge) { +void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) { SUnit *SuccSU = SuccEdge->getSUnit(); #ifndef NDEBUG @@ -185,164 +331,199 @@ void ScheduleTopDownLive::releaseSucc(SUnit *SU, SDep *SuccEdge) { #endif --SuccSU->NumPredsLeft; if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU) - releaseNode(SuccSU); + SchedImpl->releaseTopNode(SuccSU); } /// releaseSuccessors - Call releaseSucc on each of SU's successors. -void ScheduleTopDownLive::releaseSuccessors(SUnit *SU) { +void ScheduleDAGMI::releaseSuccessors(SUnit *SU) { for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); I != E; ++I) { releaseSucc(SU, &*I); } } -/// Schedule - This is called back from ScheduleDAGInstrs::Run() when it's -/// time to do some work. -void ScheduleTopDownLive::Schedule() { - BuildSchedGraph(&Pass->getAnalysis<AliasAnalysis>()); +/// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. When +/// NumSuccsLeft reaches zero, release the predecessor node. +void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) { + SUnit *PredSU = PredEdge->getSUnit(); + +#ifndef NDEBUG + if (PredSU->NumSuccsLeft == 0) { + dbgs() << "*** Scheduling failed! ***\n"; + PredSU->dump(this); + dbgs() << " has been released too many times!\n"; + llvm_unreachable(0); + } +#endif + --PredSU->NumSuccsLeft; + if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU) + SchedImpl->releaseBottomNode(PredSU); +} + +/// releasePredecessors - Call releasePred on each of SU's predecessors. +void ScheduleDAGMI::releasePredecessors(SUnit *SU) { + for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); + I != E; ++I) { + releasePred(SU, &*I); + } +} + +void ScheduleDAGMI::moveInstruction(MachineInstr *MI, + MachineBasicBlock::iterator InsertPos) { + // Fix RegionBegin if the first instruction moves down. + if (&*RegionBegin == MI) + RegionBegin = llvm::next(RegionBegin); + BB->splice(InsertPos, BB, MI); + LIS->handleMove(MI); + // Fix RegionBegin if another instruction moves above the first instruction. + if (RegionBegin == InsertPos) + RegionBegin = MI; +} + +bool ScheduleDAGMI::checkSchedLimit() { +#ifndef NDEBUG + if (NumInstrsScheduled == MISchedCutoff && MISchedCutoff != ~0U) { + CurrentTop = CurrentBottom; + return false; + } + ++NumInstrsScheduled; +#endif + return true; +} + +/// schedule - Called back from MachineScheduler::runOnMachineFunction +/// after setting up the current scheduling region. +void ScheduleDAGMI::schedule() { + buildSchedGraph(AA); DEBUG(dbgs() << "********** MI Scheduling **********\n"); DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su) SUnits[su].dumpAll(this)); - // Release any successors of the special Entry node. It is currently unused, - // but we keep up appearances. + if (ViewMISchedDAGs) viewGraph(); + + SchedImpl->initialize(this); + + // Release edges from the special Entry node or to the special Exit node. releaseSuccessors(&EntrySU); + releasePredecessors(&ExitSU); // Release all DAG roots for scheduling. 
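
releaseSucc and releasePred above implement the usual ready-count bookkeeping: scheduling a node decrements the remaining-predecessor (or remaining-successor) count of each neighbor, and a node is handed to the strategy the moment its count reaches zero. Below is a compact standalone version of the top-down half over a tiny dependence graph; it is plain Kahn-style topological release, not the LLVM data structures.

    #include <iostream>
    #include <queue>
    #include <vector>

    int main() {
      // Edges point from a node to the nodes that depend on it (its successors).
      std::vector<std::vector<int>> Succs = {{1, 2}, {3}, {3}, {}};
      std::vector<int> NumPredsLeft(Succs.size(), 0);
      for (const auto &S : Succs)
        for (int Dst : S)
          ++NumPredsLeft[Dst];

      // Release the DAG roots: nodes with no unscheduled predecessors.
      std::queue<int> Ready;
      for (int N = 0; N != (int)Succs.size(); ++N)
        if (NumPredsLeft[N] == 0)
          Ready.push(N);

      // Scheduling a node releases any successor whose last predecessor it was.
      while (!Ready.empty()) {
        int N = Ready.front();
        Ready.pop();
        std::cout << "schedule " << N << "\n";
        for (int Succ : Succs[N])
          if (--NumPredsLeft[Succ] == 0)
            Ready.push(Succ);
      }
    }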
for (std::vector<SUnit>::iterator I = SUnits.begin(), E = SUnits.end(); I != E; ++I) { - // A SUnit is ready to schedule if it has no predecessors. + // A SUnit is ready to top schedule if it has no predecessors. if (I->Preds.empty()) - releaseNode(&(*I)); + SchedImpl->releaseTopNode(&(*I)); + // A SUnit is ready to bottom schedule if it has no successors. + if (I->Succs.empty()) + SchedImpl->releaseBottomNode(&(*I)); } - InsertPos = Begin; - while (SUnit *SU = pickNode()) { - DEBUG(dbgs() << "*** Scheduling Instruction:\n"; SU->dump(this)); + CurrentTop = RegionBegin; + CurrentBottom = RegionEnd; + bool IsTopNode = false; + while (SUnit *SU = SchedImpl->pickNode(IsTopNode)) { + DEBUG(dbgs() << "*** " << (IsTopNode ? "Top" : "Bottom") + << " Scheduling Instruction:\n"; SU->dump(this)); + if (!checkSchedLimit()) + break; // Move the instruction to its new location in the instruction stream. MachineInstr *MI = SU->getInstr(); - if (&*InsertPos == MI) - ++InsertPos; + + if (IsTopNode) { + assert(SU->isTopReady() && "node still has unscheduled dependencies"); + if (&*CurrentTop == MI) + ++CurrentTop; + else + moveInstruction(MI, CurrentTop); + // Release dependent instructions for scheduling. + releaseSuccessors(SU); + } else { - BB->splice(InsertPos, BB, MI); - Pass->LIS->handleMove(MI); - if (Begin == InsertPos) - Begin = MI; + assert(SU->isBottomReady() && "node still has unscheduled dependencies"); + if (&*llvm::prior(CurrentBottom) == MI) + --CurrentBottom; + else { + if (&*CurrentTop == MI) + CurrentTop = llvm::next(CurrentTop); + moveInstruction(MI, CurrentBottom); + CurrentBottom = MI; + } + // Release dependent instructions for scheduling. + releasePredecessors(SU); } - - // Release dependent instructions for scheduling. - releaseSuccessors(SU); + SU->isScheduled = true; } + assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone."); } -bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) { - // Initialize the context of the pass. - MF = &mf; - MLI = &getAnalysis<MachineLoopInfo>(); - MDT = &getAnalysis<MachineDominatorTree>(); - LIS = &getAnalysis<LiveIntervals>(); - TII = MF->getTarget().getInstrInfo(); +//===----------------------------------------------------------------------===// +// ConvergingScheduler - Implementation of the standard MachineSchedStrategy. +//===----------------------------------------------------------------------===// - // Select the scheduler, or set the default. - MachineSchedRegistry::ScheduleDAGCtor Ctor = - MachineSchedRegistry::getDefault(); - if (!Ctor) { - Ctor = MachineSchedOpt; - MachineSchedRegistry::setDefault(Ctor); - } - // Instantiate the selected scheduler. - OwningPtr<ScheduleDAGInstrs> Scheduler(Ctor(this)); +namespace { +/// ConvergingScheduler shrinks the unscheduled zone using heuristics to balance +/// the schedule. +class ConvergingScheduler : public MachineSchedStrategy { + ScheduleDAGMI *DAG; - // Visit all machine basic blocks. - for (MachineFunction::iterator MBB = MF->begin(), MBBEnd = MF->end(); - MBB != MBBEnd; ++MBB) { + unsigned NumTopReady; + unsigned NumBottomReady; - // Break the block into scheduling regions [I, RegionEnd), and schedule each - // region as soon as it is discovered. - unsigned RemainingCount = MBB->size(); - for(MachineBasicBlock::iterator RegionEnd = MBB->end(); - RegionEnd != MBB->begin();) { - // The next region starts above the previous region. Look backward in the - // instruction stream until we find the nearest boundary. 
- MachineBasicBlock::iterator I = RegionEnd; - for(;I != MBB->begin(); --I, --RemainingCount) { - if (TII->isSchedulingBoundary(llvm::prior(I), MBB, *MF)) - break; - } - if (I == RegionEnd) { - // Skip empty scheduling regions. - RegionEnd = llvm::prior(RegionEnd); - --RemainingCount; - continue; - } - // Skip regions with one instruction. - if (I == llvm::prior(RegionEnd)) { - RegionEnd = llvm::prior(RegionEnd); - continue; - } - DEBUG(dbgs() << "MachineScheduling " << MF->getFunction()->getName() - << ":BB#" << MBB->getNumber() << "\n From: " << *I << " To: "; - if (RegionEnd != MBB->end()) dbgs() << *RegionEnd; - else dbgs() << "End"; - dbgs() << " Remaining: " << RemainingCount << "\n"); +public: + virtual void initialize(ScheduleDAGMI *dag) { + DAG = dag; - // Inform ScheduleDAGInstrs of the region being scheduled. It calls back - // to our Schedule() method. - Scheduler->Run(MBB, I, RegionEnd, MBB->size()); - RegionEnd = Scheduler->Begin; - } - assert(RemainingCount == 0 && "Instruction count mismatch!"); + assert((!ForceTopDown || !ForceBottomUp) && + "-misched-topdown incompatible with -misched-bottomup"); } - return true; -} -void MachineScheduler::print(raw_ostream &O, const Module* m) const { - // unimplemented -} + virtual SUnit *pickNode(bool &IsTopNode) { + if (DAG->top() == DAG->bottom()) + return NULL; -//===----------------------------------------------------------------------===// -// Placeholder for extending the machine instruction scheduler. -//===----------------------------------------------------------------------===// - -namespace { -class DefaultMachineScheduler : public ScheduleDAGInstrs { - MachineScheduler *Pass; -public: - DefaultMachineScheduler(MachineScheduler *P): - ScheduleDAGInstrs(*P->MF, *P->MLI, *P->MDT, /*IsPostRA=*/false, P->LIS), - Pass(P) {} + // As an initial placeholder heuristic, schedule in the direction that has + // the fewest choices. + SUnit *SU; + if (ForceTopDown || (!ForceBottomUp && NumTopReady <= NumBottomReady)) { + SU = DAG->getSUnit(DAG->top()); + IsTopNode = true; + } + else { + SU = DAG->getSUnit(llvm::prior(DAG->bottom())); + IsTopNode = false; + } + if (SU->isTopReady()) { + assert(NumTopReady > 0 && "bad ready count"); + --NumTopReady; + } + if (SU->isBottomReady()) { + assert(NumBottomReady > 0 && "bad ready count"); + --NumBottomReady; + } + return SU; + } - /// Schedule - This is called back from ScheduleDAGInstrs::Run() when it's - /// time to do some work. - void Schedule(); + virtual void releaseTopNode(SUnit *SU) { + ++NumTopReady; + } + virtual void releaseBottomNode(SUnit *SU) { + ++NumBottomReady; + } }; } // namespace -static ScheduleDAGInstrs *createDefaultMachineSched(MachineScheduler *P) { - return new DefaultMachineScheduler(P); +/// Create the standard converging machine scheduler. This will be used as the +/// default scheduler if the target does not set a default. +static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C) { + assert((!ForceTopDown || !ForceBottomUp) && + "-misched-topdown incompatible with -misched-bottomup"); + return new ScheduleDAGMI(C, new ConvergingScheduler()); } static MachineSchedRegistry -SchedDefaultRegistry("default", "Activate the scheduler pass, " - "but don't reorder instructions", - createDefaultMachineSched); - - -/// Schedule - This is called back from ScheduleDAGInstrs::Run() when it's -/// time to do some work. 
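
ScheduleDAGMI::schedule above keeps two cursors, CurrentTop and CurrentBottom, and each picked node is moved to whichever end it was scheduled from until the cursors meet; the placeholder ConvergingScheduler picks the end that currently has fewer ready nodes. The sketch below shows only the converging-cursor idea on a plain array, with a simple alternate-ends rule standing in for the ready-count heuristic, so the tie-breaking here is illustrative rather than LLVM's.

    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main() {
      std::vector<int> Region = {10, 11, 12, 13, 14};   // the unscheduled zone

      std::size_t Top = 0, Bottom = Region.size();      // cursors converge toward each other
      bool FromTop = true;                              // alternate ends each step
      while (Top != Bottom) {
        if (FromTop) {
          std::cout << "top    schedules " << Region[Top] << "\n";
          ++Top;                                        // the node joins the scheduled top zone
        } else {
          --Bottom;                                     // the node joins the scheduled bottom zone
          std::cout << "bottom schedules " << Region[Bottom] << "\n";
        }
        FromTop = !FromTop;
      }
      // Top == Bottom: the unscheduled zone is empty, mirroring the final assert.
    }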
-void DefaultMachineScheduler::Schedule() { - BuildSchedGraph(&Pass->getAnalysis<AliasAnalysis>()); - - DEBUG(dbgs() << "********** MI Scheduling **********\n"); - DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su) - SUnits[su].dumpAll(this)); - - // TODO: Put interesting things here. - // - // When this is fully implemented, it will become a subclass of - // ScheduleTopDownLive. So this driver will disappear. -} +ConvergingSchedRegistry("converge", "Standard converging scheduler.", + createConvergingSched); //===----------------------------------------------------------------------===// // Machine Instruction Shuffler for Correctness Testing @@ -350,43 +531,83 @@ void DefaultMachineScheduler::Schedule() { #ifndef NDEBUG namespace { -// Nodes with a higher number have higher priority. This way we attempt to -// schedule the latest instructions earliest. -// -// TODO: Relies on the property of the BuildSchedGraph that results in SUnits -// being ordered in sequence top-down. -struct ShuffleSUnitOrder { +/// Apply a less-than relation on the node order, which corresponds to the +/// instruction order prior to scheduling. IsReverse implements greater-than. +template<bool IsReverse> +struct SUnitOrder { bool operator()(SUnit *A, SUnit *B) const { - return A->NodeNum < B->NodeNum; + if (IsReverse) + return A->NodeNum > B->NodeNum; + else + return A->NodeNum < B->NodeNum; } }; /// Reorder instructions as much as possible. -class InstructionShuffler : public ScheduleTopDownLive { - std::priority_queue<SUnit*, std::vector<SUnit*>, ShuffleSUnitOrder> Queue; +class InstructionShuffler : public MachineSchedStrategy { + bool IsAlternating; + bool IsTopDown; + + // Using a less-than relation (SUnitOrder<false>) for the TopQ priority + // gives nodes with a higher number higher priority causing the latest + // instructions to be scheduled first. + PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<false> > + TopQ; + // When scheduling bottom-up, use greater-than as the queue priority. + PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<true> > + BottomQ; public: - InstructionShuffler(MachineScheduler *P): - ScheduleTopDownLive(P) {} + InstructionShuffler(bool alternate, bool topdown) + : IsAlternating(alternate), IsTopDown(topdown) {} - /// ScheduleTopDownLive Interface + virtual void initialize(ScheduleDAGMI *) { + TopQ.clear(); + BottomQ.clear(); + } - virtual SUnit *pickNode() { - if (Queue.empty()) return NULL; - SUnit *SU = Queue.top(); - Queue.pop(); + /// Implement MachineSchedStrategy interface. 
+ /// ----------------------------------------- + + virtual SUnit *pickNode(bool &IsTopNode) { + SUnit *SU; + if (IsTopDown) { + do { + if (TopQ.empty()) return NULL; + SU = TopQ.top(); + TopQ.pop(); + } while (SU->isScheduled); + IsTopNode = true; + } + else { + do { + if (BottomQ.empty()) return NULL; + SU = BottomQ.top(); + BottomQ.pop(); + } while (SU->isScheduled); + IsTopNode = false; + } + if (IsAlternating) + IsTopDown = !IsTopDown; return SU; } - virtual void releaseNode(SUnit *SU) { - Queue.push(SU); + virtual void releaseTopNode(SUnit *SU) { + TopQ.push(SU); + } + virtual void releaseBottomNode(SUnit *SU) { + BottomQ.push(SU); } }; } // namespace -static ScheduleDAGInstrs *createInstructionShuffler(MachineScheduler *P) { - return new InstructionShuffler(P); +static ScheduleDAGInstrs *createInstructionShuffler(MachineSchedContext *C) { + bool Alternate = !ForceTopDown && !ForceBottomUp; + bool TopDown = !ForceBottomUp; + assert((TopDown || !ForceTopDown) && + "-misched-topdown incompatible with -misched-bottomup"); + return new ScheduleDAGMI(C, new InstructionShuffler(Alternate, TopDown)); } -static MachineSchedRegistry ShufflerRegistry("shuffle", - "Shuffle machine instructions", - createInstructionShuffler); +static MachineSchedRegistry ShufflerRegistry( + "shuffle", "Shuffle machine instructions alternating directions", + createInstructionShuffler); #endif // !NDEBUG diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp index 394a960..830a876 100644 --- a/lib/CodeGen/MachineVerifier.cpp +++ b/lib/CodeGen/MachineVerifier.cpp @@ -900,7 +900,7 @@ MachineVerifier::visitMachineBasicBlockAfter(const MachineBasicBlock *MBB) { void MachineVerifier::calcRegsPassed() { // First push live-out regs to successors' vregsPassed. Remember the MBBs that // have any vregsPassed. - DenseSet<const MachineBasicBlock*> todo; + SmallPtrSet<const MachineBasicBlock*, 8> todo; for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end(); MFI != MFE; ++MFI) { const MachineBasicBlock &MBB(*MFI); @@ -937,7 +937,7 @@ void MachineVerifier::calcRegsPassed() { // similar to calcRegsPassed, only backwards. void MachineVerifier::calcRegsRequired() { // First push live-in regs to predecessors' vregsRequired. - DenseSet<const MachineBasicBlock*> todo; + SmallPtrSet<const MachineBasicBlock*, 8> todo; for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end(); MFI != MFE; ++MFI) { const MachineBasicBlock &MBB(*MFI); @@ -970,9 +970,10 @@ void MachineVerifier::calcRegsRequired() { // Check PHI instructions at the beginning of MBB. It is assumed that // calcRegsPassed has been run so BBInfo::isLiveOut is valid. 
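
The InstructionShuffler above keeps two priority queues over the original node order, using SUnitOrder<false> for the top queue (so the latest node in program order pops first, shuffling as hard as possible) and SUnitOrder<true> for the bottom queue. A self-contained version of that templated comparator with std::priority_queue, using bare node numbers instead of SUnit pointers:

    #include <iostream>
    #include <queue>
    #include <vector>

    // Less-than on the original node order; IsReverse flips it to greater-than.
    template <bool IsReverse> struct NodeOrder {
      bool operator()(unsigned A, unsigned B) const {
        return IsReverse ? A > B : A < B;
      }
    };

    int main() {
      // Top-down queue: max-heap on node number, so the latest node pops first.
      std::priority_queue<unsigned, std::vector<unsigned>, NodeOrder<false>> TopQ;
      // Bottom-up queue: reversed comparison, so the earliest node pops first.
      std::priority_queue<unsigned, std::vector<unsigned>, NodeOrder<true>> BottomQ;

      for (unsigned N : {0u, 1u, 2u, 3u}) {
        TopQ.push(N);
        BottomQ.push(N);
      }
      std::cout << "top picks " << TopQ.top()                        // 3
                << ", bottom picks " << BottomQ.top() << "\n";       // 0
    }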
void MachineVerifier::checkPHIOps(const MachineBasicBlock *MBB) { + SmallPtrSet<const MachineBasicBlock*, 8> seen; for (MachineBasicBlock::const_iterator BBI = MBB->begin(), BBE = MBB->end(); BBI != BBE && BBI->isPHI(); ++BBI) { - DenseSet<const MachineBasicBlock*> seen; + seen.clear(); for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2) { unsigned Reg = BBI->getOperand(i).getReg(); @@ -1013,8 +1014,17 @@ void MachineVerifier::visitMachineFunctionAfter() { } // Now check liveness info if available - if (LiveVars || LiveInts) - calcRegsRequired(); + calcRegsRequired(); + + if (MRI->isSSA() && !MF->empty()) { + BBInfo &MInfo = MBBInfoMap[&MF->front()]; + for (RegSet::iterator + I = MInfo.vregsRequired.begin(), E = MInfo.vregsRequired.end(); I != E; + ++I) + report("Virtual register def doesn't dominate all uses.", + MRI->getVRegDef(*I)); + } + if (LiveVars) verifyLiveVariables(); if (LiveInts) diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp index ec1f2b4..6246c21 100644 --- a/lib/CodeGen/Passes.cpp +++ b/lib/CodeGen/Passes.cpp @@ -564,7 +564,8 @@ void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { addPass(RegisterCoalescerID); // PreRA instruction scheduling. - addPass(MachineSchedulerID); + if (addPass(MachineSchedulerID) != &NoPassID) + printAndVerify("After Machine Scheduling"); // Add the selected register allocation pass. PM.add(RegAllocPass); diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp index e59aa9d..24d3e5a 100644 --- a/lib/CodeGen/PostRASchedulerList.cpp +++ b/lib/CodeGen/PostRASchedulerList.cpp @@ -23,7 +23,6 @@ #include "AggressiveAntiDepBreaker.h" #include "CriticalAntiDepBreaker.h" #include "RegisterClassInfo.h" -#include "ScheduleDAGInstrs.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/LatencyPriorityQueue.h" #include "llvm/CodeGen/SchedulerRegistry.h" @@ -32,6 +31,7 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/CodeGen/ScheduleHazardRecognizer.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Target/TargetLowering.h" @@ -127,6 +127,9 @@ namespace { /// LiveRegs - true if the register is live. BitVector LiveRegs; + /// The schedule. Null SUnit*'s represent noop instructions. + std::vector<SUnit*> Sequence; + public: SchedulePostRATDList( MachineFunction &MF, MachineLoopInfo &MLI, MachineDominatorTree &MDT, @@ -136,23 +139,34 @@ namespace { ~SchedulePostRATDList(); - /// StartBlock - Initialize register live-range state for scheduling in + /// startBlock - Initialize register live-range state for scheduling in /// this block. /// - void StartBlock(MachineBasicBlock *BB); + void startBlock(MachineBasicBlock *BB); + + /// Initialize the scheduler state for the next scheduling region. + virtual void enterRegion(MachineBasicBlock *bb, + MachineBasicBlock::iterator begin, + MachineBasicBlock::iterator end, + unsigned endcount); + + /// Notify that the scheduler has finished scheduling the current region. + virtual void exitRegion(); /// Schedule - Schedule the instruction range using list scheduling. /// - void Schedule(); + void schedule(); + + void EmitSchedule(); /// Observe - Update liveness information to account for the current /// instruction, which will not be scheduled. /// void Observe(MachineInstr *MI, unsigned Count); - /// FinishBlock - Clean up register live-range state. 
+ /// finishBlock - Clean up register live-range state. /// - void FinishBlock(); + void finishBlock(); /// FixupKills - Fix register kill flags that have been made /// invalid due to scheduling @@ -170,6 +184,8 @@ namespace { // adjustments may be made to the instruction if necessary. Return // true if the operand has been deleted, false if not. bool ToggleKillFlag(MachineInstr *MI, MachineOperand &MO); + + void dumpSchedule() const; }; } @@ -202,6 +218,35 @@ SchedulePostRATDList::~SchedulePostRATDList() { delete AntiDepBreak; } +/// Initialize state associated with the next scheduling region. +void SchedulePostRATDList::enterRegion(MachineBasicBlock *bb, + MachineBasicBlock::iterator begin, + MachineBasicBlock::iterator end, + unsigned endcount) { + ScheduleDAGInstrs::enterRegion(bb, begin, end, endcount); + Sequence.clear(); +} + +/// Print the schedule before exiting the region. +void SchedulePostRATDList::exitRegion() { + DEBUG({ + dbgs() << "*** Final schedule ***\n"; + dumpSchedule(); + dbgs() << '\n'; + }); + ScheduleDAGInstrs::exitRegion(); +} + +/// dumpSchedule - dump the scheduled Sequence. +void SchedulePostRATDList::dumpSchedule() const { + for (unsigned i = 0, e = Sequence.size(); i != e; i++) { + if (SUnit *SU = Sequence[i]) + SU->dump(this); + else + dbgs() << "**** NOOP ****\n"; + } +} + bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { TII = Fn.getTarget().getInstrInfo(); MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); @@ -256,7 +301,7 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { #endif // Initialize register live-range state for scheduling in this block. - Scheduler.StartBlock(MBB); + Scheduler.startBlock(MBB); // Schedule each sequence of instructions not interrupted by a label // or anything else that effectively needs to shut down scheduling. @@ -268,7 +313,9 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { // post-ra we don't gain anything by scheduling across calls since we // don't need to worry about register pressure. if (MI->isCall() || TII->isSchedulingBoundary(MI, MBB, Fn)) { - Scheduler.Run(MBB, I, Current, CurrentCount); + Scheduler.enterRegion(MBB, I, Current, CurrentCount); + Scheduler.schedule(); + Scheduler.exitRegion(); Scheduler.EmitSchedule(); Current = MI; CurrentCount = Count - 1; @@ -282,11 +329,13 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { assert(Count == 0 && "Instruction count mismatch!"); assert((MBB->begin() == Current || CurrentCount != 0) && "Instruction count mismatch!"); - Scheduler.Run(MBB, MBB->begin(), Current, CurrentCount); + Scheduler.enterRegion(MBB, MBB->begin(), Current, CurrentCount); + Scheduler.schedule(); + Scheduler.exitRegion(); Scheduler.EmitSchedule(); // Clean up register live-range state. - Scheduler.FinishBlock(); + Scheduler.finishBlock(); // Update register kills Scheduler.FixupKills(MBB); @@ -298,9 +347,9 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { /// StartBlock - Initialize register live-range state for scheduling in /// this block. /// -void SchedulePostRATDList::StartBlock(MachineBasicBlock *BB) { +void SchedulePostRATDList::startBlock(MachineBasicBlock *BB) { // Call the superclass. - ScheduleDAGInstrs::StartBlock(BB); + ScheduleDAGInstrs::startBlock(BB); // Reset the hazard recognizer and anti-dep breaker. HazardRec->Reset(); @@ -310,14 +359,14 @@ void SchedulePostRATDList::StartBlock(MachineBasicBlock *BB) { /// Schedule - Schedule the instruction range using list scheduling. 
/// -void SchedulePostRATDList::Schedule() { +void SchedulePostRATDList::schedule() { // Build the scheduling graph. - BuildSchedGraph(AA); + buildSchedGraph(AA); if (AntiDepBreak != NULL) { unsigned Broken = - AntiDepBreak->BreakAntiDependencies(SUnits, Begin, InsertPos, - InsertPosIndex, DbgValues); + AntiDepBreak->BreakAntiDependencies(SUnits, RegionBegin, RegionEnd, + EndIndex, DbgValues); if (Broken != 0) { // We made changes. Update the dependency graph. @@ -326,11 +375,8 @@ void SchedulePostRATDList::Schedule() { // the def's anti-dependence *and* output-dependence edges due to // that register, and add new anti-dependence and output-dependence // edges based on the next live range of the register. - SUnits.clear(); - Sequence.clear(); - EntrySU = SUnit(); - ExitSU = SUnit(); - BuildSchedGraph(AA); + ScheduleDAG::clearDAG(); + buildSchedGraph(AA); NumFixedAnti += Broken; } @@ -350,17 +396,17 @@ void SchedulePostRATDList::Schedule() { /// void SchedulePostRATDList::Observe(MachineInstr *MI, unsigned Count) { if (AntiDepBreak != NULL) - AntiDepBreak->Observe(MI, Count, InsertPosIndex); + AntiDepBreak->Observe(MI, Count, EndIndex); } /// FinishBlock - Clean up register live-range state. /// -void SchedulePostRATDList::FinishBlock() { +void SchedulePostRATDList::finishBlock() { if (AntiDepBreak != NULL) AntiDepBreak->FinishBlock(); // Call the superclass. - ScheduleDAGInstrs::FinishBlock(); + ScheduleDAGInstrs::finishBlock(); } /// StartBlockForKills - Initialize register live-range state for updating kills @@ -589,7 +635,7 @@ void SchedulePostRATDList::ScheduleNodeTopDown(SUnit *SU, unsigned CurCycle) { ReleaseSuccessors(SU); SU->isScheduled = true; - AvailableQueue.ScheduledNode(SU); + AvailableQueue.scheduledNode(SU); } /// ListScheduleTopDown - The main loop of list scheduling for top-down @@ -703,6 +749,46 @@ void SchedulePostRATDList::ListScheduleTopDown() { } #ifndef NDEBUG - VerifySchedule(/*isBottomUp=*/false); -#endif + unsigned ScheduledNodes = VerifyScheduledDAG(/*isBottomUp=*/false); + unsigned Noops = 0; + for (unsigned i = 0, e = Sequence.size(); i != e; ++i) + if (!Sequence[i]) + ++Noops; + assert(Sequence.size() - Noops == ScheduledNodes && + "The number of nodes scheduled doesn't match the expected number!"); +#endif // NDEBUG +} + +// EmitSchedule - Emit the machine code in scheduled order. +void SchedulePostRATDList::EmitSchedule() { + RegionBegin = RegionEnd; + + // If first instruction was a DBG_VALUE then put it back. + if (FirstDbgValue) + BB->splice(RegionEnd, BB, FirstDbgValue); + + // Then re-insert them according to the given schedule. + for (unsigned i = 0, e = Sequence.size(); i != e; i++) { + if (SUnit *SU = Sequence[i]) + BB->splice(RegionEnd, BB, SU->getInstr()); + else + // Null SUnit* is a noop. + TII->insertNoop(*BB, RegionEnd); + + // Update the Begin iterator, as the first instruction in the block + // may have been scheduled later. + if (i == 0) + RegionBegin = prior(RegionEnd); + } + + // Reinsert any remaining debug_values. 
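
EmitSchedule now lives in the post-RA scheduler itself: the Sequence vector is replayed in order, a null SUnit* means "emit a noop", and the verification added to ListScheduleTopDown checks that the non-noop slots match the number of scheduled nodes. A small standalone replay of that convention over strings; the splice and LiveIntervals details are LLVM-specific and omitted.

    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
      // The computed schedule; a null entry stands for a noop the target must emit.
      std::string I0 = "ld", I1 = "add", I2 = "st";
      std::vector<const std::string *> Sequence = {&I1, nullptr, &I0, &I2};

      unsigned Noops = 0;
      for (const std::string *SU : Sequence) {
        if (SU) {
          std::cout << *SU << "\n";          // splice the real instruction into place
        } else {
          std::cout << "nop\n";              // null SUnit* == noop, as in dumpSchedule
          ++Noops;
        }
      }
      // Mirror the new verification: non-noop slots equal the scheduled nodes.
      std::cout << Sequence.size() - Noops << " real instructions\n";   // 3
    }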
+ for (std::vector<std::pair<MachineInstr *, MachineInstr *> >::iterator + DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) { + std::pair<MachineInstr *, MachineInstr *> P = *prior(DI); + MachineInstr *DbgValue = P.first; + MachineBasicBlock::iterator OrigPrivMI = P.second; + BB->splice(++OrigPrivMI, BB, DbgValue); + } + DbgValues.clear(); + FirstDbgValue = NULL; } diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp index c27a485..e09b7f8 100644 --- a/lib/CodeGen/RegAllocFast.cpp +++ b/lib/CodeGen/RegAllocFast.cpp @@ -1139,7 +1139,7 @@ bool RAFast::runOnMachineFunction(MachineFunction &Fn) { // Add the clobber lists for all the instructions we skipped earlier. for (SmallPtrSet<const MCInstrDesc*, 4>::const_iterator I = SkippedInstrs.begin(), E = SkippedInstrs.end(); I != E; ++I) - if (const unsigned *Defs = (*I)->getImplicitDefs()) + if (const uint16_t *Defs = (*I)->getImplicitDefs()) while (*Defs) MRI->setPhysRegUsed(*Defs++); diff --git a/lib/CodeGen/RegisterCoalescer.h b/lib/CodeGen/RegisterCoalescer.h index ef0c508..310b933 100644 --- a/lib/CodeGen/RegisterCoalescer.h +++ b/lib/CodeGen/RegisterCoalescer.h @@ -47,7 +47,7 @@ namespace llvm { /// CrossClass - True when both regs are virtual, and newRC is constrained. bool CrossClass; - /// Flipped - True when DstReg and SrcReg are reversed from the oriignal + /// Flipped - True when DstReg and SrcReg are reversed from the original /// copy instruction. bool Flipped; diff --git a/lib/CodeGen/ScheduleDAG.cpp b/lib/CodeGen/ScheduleDAG.cpp index 94b28b6..8fd6426 100644 --- a/lib/CodeGen/ScheduleDAG.cpp +++ b/lib/CodeGen/ScheduleDAG.cpp @@ -46,42 +46,17 @@ ScheduleDAG::ScheduleDAG(MachineFunction &mf) ScheduleDAG::~ScheduleDAG() {} -/// getInstrDesc helper to handle SDNodes. -const MCInstrDesc *ScheduleDAG::getNodeDesc(const SDNode *Node) const { - if (!Node || !Node->isMachineOpcode()) return NULL; - return &TII->get(Node->getMachineOpcode()); -} - -/// dump - dump the schedule. -void ScheduleDAG::dumpSchedule() const { - for (unsigned i = 0, e = Sequence.size(); i != e; i++) { - if (SUnit *SU = Sequence[i]) - SU->dump(this); - else - dbgs() << "**** NOOP ****\n"; - } -} - - -/// Run - perform scheduling. -/// -void ScheduleDAG::Run(MachineBasicBlock *bb, - MachineBasicBlock::iterator insertPos) { - BB = bb; - InsertPos = insertPos; - +/// Clear the DAG state (e.g. between scheduling regions). +void ScheduleDAG::clearDAG() { SUnits.clear(); - Sequence.clear(); EntrySU = SUnit(); ExitSU = SUnit(); +} - Schedule(); - - DEBUG({ - dbgs() << "*** Final schedule ***\n"; - dumpSchedule(); - dbgs() << '\n'; - }); +/// getInstrDesc helper to handle SDNodes. +const MCInstrDesc *ScheduleDAG::getNodeDesc(const SDNode *Node) const { + if (!Node || !Node->isMachineOpcode()) return NULL; + return &TII->get(Node->getMachineOpcode()); } /// addPred - This adds the specified edge as a pred of the current node if @@ -346,13 +321,12 @@ void SUnit::dumpAll(const ScheduleDAG *G) const { } #ifndef NDEBUG -/// VerifySchedule - Verify that all SUnits were scheduled and that -/// their state is consistent. +/// VerifyScheduledDAG - Verify that all SUnits were scheduled and that +/// their state is consistent. Return the number of scheduled nodes. 
/// -void ScheduleDAG::VerifySchedule(bool isBottomUp) { +unsigned ScheduleDAG::VerifyScheduledDAG(bool isBottomUp) { bool AnyNotSched = false; unsigned DeadNodes = 0; - unsigned Noops = 0; for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { if (!SUnits[i].isScheduled) { if (SUnits[i].NumPreds == 0 && SUnits[i].NumSuccs == 0) { @@ -393,12 +367,8 @@ void ScheduleDAG::VerifySchedule(bool isBottomUp) { } } } - for (unsigned i = 0, e = Sequence.size(); i != e; ++i) - if (!Sequence[i]) - ++Noops; assert(!AnyNotSched); - assert(Sequence.size() + DeadNodes - Noops == SUnits.size() && - "The number of nodes scheduled doesn't match the expected number!"); + return SUnits.size() - DeadNodes; } #endif diff --git a/lib/CodeGen/ScheduleDAGEmit.cpp b/lib/CodeGen/ScheduleDAGEmit.cpp deleted file mode 100644 index f8b1bc7..0000000 --- a/lib/CodeGen/ScheduleDAGEmit.cpp +++ /dev/null @@ -1,68 +0,0 @@ -//===---- ScheduleDAGEmit.cpp - Emit routines for the ScheduleDAG class ---===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This implements the Emit routines for the ScheduleDAG class, which creates -// MachineInstrs according to the computed schedule. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "pre-RA-sched" -#include "llvm/CodeGen/ScheduleDAG.h" -#include "llvm/CodeGen/MachineConstantPool.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetLowering.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/MathExtras.h" -using namespace llvm; - -void ScheduleDAG::EmitNoop() { - TII->insertNoop(*BB, InsertPos); -} - -void ScheduleDAG::EmitPhysRegCopy(SUnit *SU, - DenseMap<SUnit*, unsigned> &VRBaseMap) { - for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); - I != E; ++I) { - if (I->isCtrl()) continue; // ignore chain preds - if (I->getSUnit()->CopyDstRC) { - // Copy to physical register. - DenseMap<SUnit*, unsigned>::iterator VRI = VRBaseMap.find(I->getSUnit()); - assert(VRI != VRBaseMap.end() && "Node emitted out of order - late"); - // Find the destination physical register. - unsigned Reg = 0; - for (SUnit::const_succ_iterator II = SU->Succs.begin(), - EE = SU->Succs.end(); II != EE; ++II) { - if (II->isCtrl()) continue; // ignore chain preds - if (II->getReg()) { - Reg = II->getReg(); - break; - } - } - BuildMI(*BB, InsertPos, DebugLoc(), TII->get(TargetOpcode::COPY), Reg) - .addReg(VRI->second); - } else { - // Copy from physical register. - assert(I->getReg() && "Unknown physical register!"); - unsigned VRBase = MRI.createVirtualRegister(SU->CopyDstRC); - bool isNew = VRBaseMap.insert(std::make_pair(SU, VRBase)).second; - (void)isNew; // Silence compiler warning. 
- assert(isNew && "Node emitted out of order - early"); - BuildMI(*BB, InsertPos, DebugLoc(), TII->get(TargetOpcode::COPY), VRBase) - .addReg(I->getReg()); - } - break; - } -} diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp index c0ccdb3..6be1ab7 100644 --- a/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -13,7 +13,6 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "sched-instrs" -#include "ScheduleDAGInstrs.h" #include "llvm/Operator.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ValueTracking.h" @@ -22,6 +21,7 @@ #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/MC/MCInstrItineraries.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetInstrInfo.h" @@ -38,30 +38,15 @@ ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf, bool IsPostRAFlag, LiveIntervals *lis) : ScheduleDAG(mf), MLI(mli), MDT(mdt), MFI(mf.getFrameInfo()), - InstrItins(mf.getTarget().getInstrItineraryData()), IsPostRA(IsPostRAFlag), - LIS(lis), UnitLatencies(false), LoopRegs(MLI, MDT), FirstDbgValue(0) { + InstrItins(mf.getTarget().getInstrItineraryData()), LIS(lis), + IsPostRA(IsPostRAFlag), UnitLatencies(false), LoopRegs(MLI, MDT), + FirstDbgValue(0) { assert((IsPostRA || LIS) && "PreRA scheduling requires LiveIntervals"); DbgValues.clear(); assert(!(IsPostRA && MRI.getNumVirtRegs()) && "Virtual registers must be removed prior to PostRA scheduling"); } -/// Run - perform scheduling. -/// -void ScheduleDAGInstrs::Run(MachineBasicBlock *bb, - MachineBasicBlock::iterator begin, - MachineBasicBlock::iterator end, - unsigned endcount) { - BB = bb; - Begin = begin; - InsertPosIndex = endcount; - - // Check to see if the scheduler cares about latencies. - UnitLatencies = ForceUnitLatencies(); - - ScheduleDAG::Run(bb, end); -} - /// getUnderlyingObjectFromInt - This is the function that does the work of /// looking through basic ptrtoint+arithmetic+inttoptr sequences. static const Value *getUnderlyingObjectFromInt(const Value *V) { @@ -141,28 +126,58 @@ static const Value *getUnderlyingObjectForInstr(const MachineInstr *MI, return 0; } -void ScheduleDAGInstrs::StartBlock(MachineBasicBlock *BB) { +void ScheduleDAGInstrs::startBlock(MachineBasicBlock *BB) { LoopRegs.Deps.clear(); if (MachineLoop *ML = MLI.getLoopFor(BB)) if (BB == ML->getLoopLatch()) LoopRegs.VisitLoop(ML); } +void ScheduleDAGInstrs::finishBlock() { + // Nothing to do. +} + /// Initialize the map with the number of registers. -void ScheduleDAGInstrs::Reg2SUnitsMap::setRegLimit(unsigned Limit) { +void Reg2SUnitsMap::setRegLimit(unsigned Limit) { PhysRegSet.setUniverse(Limit); SUnits.resize(Limit); } /// Clear the map without deallocating storage. -void ScheduleDAGInstrs::Reg2SUnitsMap::clear() { +void Reg2SUnitsMap::clear() { for (const_iterator I = reg_begin(), E = reg_end(); I != E; ++I) { SUnits[*I].clear(); } PhysRegSet.clear(); } -/// AddSchedBarrierDeps - Add dependencies from instructions in the current +/// Initialize the DAG and common scheduler state for the current scheduling +/// region. This does not actually create the DAG, only clears it. The +/// scheduling driver may call BuildSchedGraph multiple times per scheduling +/// region. 
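
Reg2SUnitsMap above, now a standalone class, pairs a SparseSet of occupied register numbers with a vector sized once to the register universe, so clear() only touches the registers that were actually used instead of deallocating everything between regions. A rough equivalent using a plain vector of keys in place of LLVM's SparseSet; that substitution, and the RegListMap name, are assumptions made only for illustration.

    #include <cassert>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Maps a physical register number to a list of users; sized to the register
    // universe once, then reused across scheduling regions.
    class RegListMap {
      std::vector<unsigned> Occupied;              // registers with a non-empty list
      std::vector<std::vector<int>> Lists;         // one slot per register in the universe

    public:
      void setRegLimit(unsigned Limit) { Lists.resize(Limit); }

      void add(unsigned Reg, int User) {
        assert(Reg < Lists.size() && "register outside the universe");
        if (Lists[Reg].empty())
          Occupied.push_back(Reg);                 // remember it for the next clear()
        Lists[Reg].push_back(User);
      }

      // Clear without deallocating: only visit the registers that were touched.
      void clear() {
        for (unsigned Reg : Occupied)
          Lists[Reg].clear();
        Occupied.clear();
      }

      std::size_t occupied() const { return Occupied.size(); }
    };

    int main() {
      RegListMap Uses;
      Uses.setRegLimit(256);
      Uses.add(3, 0);
      Uses.add(3, 1);
      Uses.add(17, 2);
      std::cout << "occupied: " << Uses.occupied() << "\n";   // 2
      Uses.clear();
      std::cout << "occupied: " << Uses.occupied() << "\n";   // 0
    }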
+void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb, + MachineBasicBlock::iterator begin, + MachineBasicBlock::iterator end, + unsigned endcount) { + BB = bb; + RegionBegin = begin; + RegionEnd = end; + EndIndex = endcount; + MISUnitMap.clear(); + + // Check to see if the scheduler cares about latencies. + UnitLatencies = forceUnitLatencies(); + + ScheduleDAG::clearDAG(); +} + +/// Close the current scheduling region. Don't clear any state in case the +/// driver wants to refer to the previous scheduling region. +void ScheduleDAGInstrs::exitRegion() { + // Nothing to do. +} + +/// addSchedBarrierDeps - Add dependencies from instructions in the current /// list of instructions being scheduled to scheduling barrier by adding /// the exit SU to the register defs and use list. This is because we want to /// make sure instructions which define registers that are either used by @@ -170,8 +185,8 @@ void ScheduleDAGInstrs::Reg2SUnitsMap::clear() { /// especially important when the definition latency of the return value(s) /// are too high to be hidden by the branch or when the liveout registers /// used by instructions in the fallthrough block. -void ScheduleDAGInstrs::AddSchedBarrierDeps() { - MachineInstr *ExitMI = InsertPos != BB->end() ? &*InsertPos : 0; +void ScheduleDAGInstrs::addSchedBarrierDeps() { + MachineInstr *ExitMI = RegionEnd != BB->end() ? &*RegionEnd : 0; ExitSU.setInstr(ExitMI); bool AllDepKnown = ExitMI && (ExitMI->isCall() || ExitMI->isBarrier()); @@ -186,19 +201,21 @@ void ScheduleDAGInstrs::AddSchedBarrierDeps() { if (TRI->isPhysicalRegister(Reg)) Uses[Reg].push_back(&ExitSU); - else + else { assert(!IsPostRA && "Virtual register encountered after regalloc."); + addVRegUseDeps(&ExitSU, i); + } } } else { // For others, e.g. fallthrough, conditional branch, assume the exit // uses all the registers that are livein to the successor blocks. - SmallSet<unsigned, 8> Seen; + assert(Uses.empty() && "Uses in set before adding deps?"); for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(), SE = BB->succ_end(); SI != SE; ++SI) for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(), E = (*SI)->livein_end(); I != E; ++I) { unsigned Reg = *I; - if (Seen.insert(Reg)) + if (!Uses.contains(Reg)) Uses[Reg].push_back(&ExitSU); } } @@ -246,7 +263,7 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, // perform its own adjustments. const SDep& dep = SDep(SU, SDep::Data, LDataLatency, *Alias); if (!UnitLatencies) { - ComputeOperandLatency(SU, UseSU, const_cast<SDep &>(dep)); + computeOperandLatency(SU, UseSU, const_cast<SDep &>(dep)); ST.adjustSchedDependency(SU, UseSU, const_cast<SDep &>(dep)); } UseSU->addPred(dep); @@ -436,7 +453,7 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) { if (!UnitLatencies) { // Adjust the dependence latency using operand def/use information, then // allow the target to perform its own adjustments. - ComputeOperandLatency(DefSU, SU, const_cast<SDep &>(dep)); + computeOperandLatency(DefSU, SU, const_cast<SDep &>(dep)); const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>(); ST.adjustSchedDependency(DefSU, SU, const_cast<SDep &>(dep)); } @@ -455,20 +472,23 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) { /// /// Map each real instruction to its SUnit. /// -/// After initSUnits, the SUnits vector is cannot be resized and the scheduler -/// may hang onto SUnit pointers. We may relax this in the future by using SUnit -/// IDs instead of pointers. 
+/// After initSUnits, the SUnits vector cannot be resized and the scheduler may +/// hang onto SUnit pointers. We may relax this in the future by using SUnit IDs +/// instead of pointers. +/// +/// MachineScheduler relies on initSUnits numbering the nodes by their order in +/// the original instruction list. void ScheduleDAGInstrs::initSUnits() { // We'll be allocating one SUnit for each real instruction in the region, // which is contained within a basic block. SUnits.reserve(BB->size()); - for (MachineBasicBlock::iterator I = Begin; I != InsertPos; ++I) { + for (MachineBasicBlock::iterator I = RegionBegin; I != RegionEnd; ++I) { MachineInstr *MI = I; if (MI->isDebugValue()) continue; - SUnit *SU = NewSUnit(MI); + SUnit *SU = newSUnit(MI); MISUnitMap[MI] = SU; SU->isCall = MI->isCall(); @@ -478,11 +498,11 @@ void ScheduleDAGInstrs::initSUnits() { if (UnitLatencies) SU->Latency = 1; else - ComputeLatency(SU); + computeLatency(SU); } } -void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) { +void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) { // Create an SUnit for each real instruction. initSUnits(); @@ -517,11 +537,11 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) { // Model data dependencies between instructions being scheduled and the // ExitSU. - AddSchedBarrierDeps(); + addSchedBarrierDeps(); // Walk the list of instructions, from bottom moving up. MachineInstr *PrevMI = NULL; - for (MachineBasicBlock::iterator MII = InsertPos, MIE = Begin; + for (MachineBasicBlock::iterator MII = RegionEnd, MIE = RegionBegin; MII != MIE; --MII) { MachineInstr *MI = prior(MII); if (MI && PrevMI) { @@ -712,14 +732,9 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) { Uses.clear(); VRegDefs.clear(); PendingLoads.clear(); - MISUnitMap.clear(); -} - -void ScheduleDAGInstrs::FinishBlock() { - // Nothing to do. } -void ScheduleDAGInstrs::ComputeLatency(SUnit *SU) { +void ScheduleDAGInstrs::computeLatency(SUnit *SU) { // Compute the latency for the node. if (!InstrItins || InstrItins->isEmpty()) { SU->Latency = 1; @@ -733,7 +748,7 @@ void ScheduleDAGInstrs::ComputeLatency(SUnit *SU) { } } -void ScheduleDAGInstrs::ComputeOperandLatency(SUnit *Def, SUnit *Use, +void ScheduleDAGInstrs::computeOperandLatency(SUnit *Def, SUnit *Use, SDep& dep) const { if (!InstrItins || InstrItins->isEmpty()) return; @@ -808,37 +823,8 @@ std::string ScheduleDAGInstrs::getGraphNodeLabel(const SUnit *SU) const { return oss.str(); } -// EmitSchedule - Emit the machine code in scheduled order. -MachineBasicBlock *ScheduleDAGInstrs::EmitSchedule() { - Begin = InsertPos; - - // If first instruction was a DBG_VALUE then put it back. - if (FirstDbgValue) - BB->splice(InsertPos, BB, FirstDbgValue); - - // Then re-insert them according to the given schedule. - for (unsigned i = 0, e = Sequence.size(); i != e; i++) { - if (SUnit *SU = Sequence[i]) - BB->splice(InsertPos, BB, SU->getInstr()); - else - // Null SUnit* is a noop. - EmitNoop(); - - // Update the Begin iterator, as the first instruction in the block - // may have been scheduled later. - if (i == 0) - Begin = prior(InsertPos); - } - - // Reinsert any remaining debug_values. 
- for (std::vector<std::pair<MachineInstr *, MachineInstr *> >::iterator - DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) { - std::pair<MachineInstr *, MachineInstr *> P = *prior(DI); - MachineInstr *DbgValue = P.first; - MachineBasicBlock::iterator OrigPrivMI = P.second; - BB->splice(++OrigPrivMI, BB, DbgValue); - } - DbgValues.clear(); - FirstDbgValue = NULL; - return BB; +/// Return the basic block label. It is not necessarilly unique because a block +/// contains multiple scheduling regions. But it is fine for visualization. +std::string ScheduleDAGInstrs::getDAGName() const { + return "dag." + BB->getFullName(); } diff --git a/lib/CodeGen/ScheduleDAGInstrs.h b/lib/CodeGen/ScheduleDAGInstrs.h deleted file mode 100644 index c7ffed9..0000000 --- a/lib/CodeGen/ScheduleDAGInstrs.h +++ /dev/null @@ -1,306 +0,0 @@ -//==- ScheduleDAGInstrs.h - MachineInstr Scheduling --------------*- C++ -*-==// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the ScheduleDAGInstrs class, which implements -// scheduling for a MachineInstr-based dependency graph. -// -//===----------------------------------------------------------------------===// - -#ifndef SCHEDULEDAGINSTRS_H -#define SCHEDULEDAGINSTRS_H - -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/ScheduleDAG.h" -#include "llvm/Support/Compiler.h" -#include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SparseSet.h" -#include <map> - -namespace llvm { - class MachineLoopInfo; - class MachineDominatorTree; - class LiveIntervals; - - /// LoopDependencies - This class analyzes loop-oriented register - /// dependencies, which are used to guide scheduling decisions. - /// For example, loop induction variable increments should be - /// scheduled as soon as possible after the variable's last use. - /// - class LLVM_LIBRARY_VISIBILITY LoopDependencies { - const MachineLoopInfo &MLI; - const MachineDominatorTree &MDT; - - public: - typedef std::map<unsigned, std::pair<const MachineOperand *, unsigned> > - LoopDeps; - LoopDeps Deps; - - LoopDependencies(const MachineLoopInfo &mli, - const MachineDominatorTree &mdt) : - MLI(mli), MDT(mdt) {} - - /// VisitLoop - Clear out any previous state and analyze the given loop. 
- /// - void VisitLoop(const MachineLoop *Loop) { - assert(Deps.empty() && "stale loop dependencies"); - - MachineBasicBlock *Header = Loop->getHeader(); - SmallSet<unsigned, 8> LoopLiveIns; - for (MachineBasicBlock::livein_iterator LI = Header->livein_begin(), - LE = Header->livein_end(); LI != LE; ++LI) - LoopLiveIns.insert(*LI); - - const MachineDomTreeNode *Node = MDT.getNode(Header); - const MachineBasicBlock *MBB = Node->getBlock(); - assert(Loop->contains(MBB) && - "Loop does not contain header!"); - VisitRegion(Node, MBB, Loop, LoopLiveIns); - } - - private: - void VisitRegion(const MachineDomTreeNode *Node, - const MachineBasicBlock *MBB, - const MachineLoop *Loop, - const SmallSet<unsigned, 8> &LoopLiveIns) { - unsigned Count = 0; - for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end(); - I != E; ++I) { - const MachineInstr *MI = I; - if (MI->isDebugValue()) - continue; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg() || !MO.isUse()) - continue; - unsigned MOReg = MO.getReg(); - if (LoopLiveIns.count(MOReg)) - Deps.insert(std::make_pair(MOReg, std::make_pair(&MO, Count))); - } - ++Count; // Not every iteration due to dbg_value above. - } - - const std::vector<MachineDomTreeNode*> &Children = Node->getChildren(); - for (std::vector<MachineDomTreeNode*>::const_iterator I = - Children.begin(), E = Children.end(); I != E; ++I) { - const MachineDomTreeNode *ChildNode = *I; - MachineBasicBlock *ChildBlock = ChildNode->getBlock(); - if (Loop->contains(ChildBlock)) - VisitRegion(ChildNode, ChildBlock, Loop, LoopLiveIns); - } - } - }; - - /// ScheduleDAGInstrs - A ScheduleDAG subclass for scheduling lists of - /// MachineInstrs. - class LLVM_LIBRARY_VISIBILITY ScheduleDAGInstrs : public ScheduleDAG { - const MachineLoopInfo &MLI; - const MachineDominatorTree &MDT; - const MachineFrameInfo *MFI; - const InstrItineraryData *InstrItins; - - /// isPostRA flag indicates vregs cannot be present. - bool IsPostRA; - - /// Live Intervals provides reaching defs in preRA scheduling. - LiveIntervals *LIS; - - DenseMap<MachineInstr*, SUnit*> MISUnitMap; - - /// UnitLatencies (misnamed) flag avoids computing def-use latencies, using - /// the def-side latency only. - bool UnitLatencies; - - /// Combine a SparseSet with a 1x1 vector to track physical registers. - /// The SparseSet allows iterating over the (few) live registers for quickly - /// comparing against a regmask or clearing the set. - /// - /// Storage for the map is allocated once for the pass. The map can be - /// cleared between scheduling regions without freeing unused entries. - class Reg2SUnitsMap { - SparseSet<unsigned> PhysRegSet; - std::vector<std::vector<SUnit*> > SUnits; - public: - typedef SparseSet<unsigned>::const_iterator const_iterator; - - // Allow iteration over register numbers (keys) in the map. If needed, we - // can provide an iterator over SUnits (values) as well. - const_iterator reg_begin() const { return PhysRegSet.begin(); } - const_iterator reg_end() const { return PhysRegSet.end(); } - - /// Initialize the map with the number of registers. - /// If the map is already large enough, no allocation occurs. - /// For simplicity we expect the map to be empty(). - void setRegLimit(unsigned Limit); - - /// Returns true if the map is empty. - bool empty() const { return PhysRegSet.empty(); } - - /// Clear the map without deallocating storage. 
- void clear(); - - bool contains(unsigned Reg) const { return PhysRegSet.count(Reg); } - - /// If this register is mapped, return its existing SUnits vector. - /// Otherwise map the register and return an empty SUnits vector. - std::vector<SUnit *> &operator[](unsigned Reg) { - bool New = PhysRegSet.insert(Reg).second; - assert((!New || SUnits[Reg].empty()) && "stale SUnits vector"); - (void)New; - return SUnits[Reg]; - } - - /// Erase an existing element without freeing memory. - void erase(unsigned Reg) { - PhysRegSet.erase(Reg); - SUnits[Reg].clear(); - } - }; - /// Defs, Uses - Remember where defs and uses of each register are as we - /// iterate upward through the instructions. This is allocated here instead - /// of inside BuildSchedGraph to avoid the need for it to be initialized and - /// destructed for each block. - Reg2SUnitsMap Defs; - Reg2SUnitsMap Uses; - - /// An individual mapping from virtual register number to SUnit. - struct VReg2SUnit { - unsigned VirtReg; - SUnit *SU; - - VReg2SUnit(unsigned reg, SUnit *su): VirtReg(reg), SU(su) {} - - unsigned getSparseSetKey() const { - return TargetRegisterInfo::virtReg2Index(VirtReg); - } - }; - /// Use SparseSet as a SparseMap by relying on the fact that it never - /// compares ValueT's, only unsigned keys. This allows the set to be cleared - /// between scheduling regions in constant time as long as ValueT does not - /// require a destructor. - typedef SparseSet<VReg2SUnit> VReg2SUnitMap; - /// Track the last instructon in this region defining each virtual register. - VReg2SUnitMap VRegDefs; - - /// PendingLoads - Remember where unknown loads are after the most recent - /// unknown store, as we iterate. As with Defs and Uses, this is here - /// to minimize construction/destruction. - std::vector<SUnit *> PendingLoads; - - /// LoopRegs - Track which registers are used for loop-carried dependencies. - /// - LoopDependencies LoopRegs; - - protected: - - /// DbgValues - Remember instruction that preceeds DBG_VALUE. - typedef std::vector<std::pair<MachineInstr *, MachineInstr *> > - DbgValueVector; - DbgValueVector DbgValues; - MachineInstr *FirstDbgValue; - - public: - MachineBasicBlock::iterator Begin; // The beginning of the range to - // be scheduled. The range extends - // to InsertPos. - unsigned InsertPosIndex; // The index in BB of InsertPos. - - explicit ScheduleDAGInstrs(MachineFunction &mf, - const MachineLoopInfo &mli, - const MachineDominatorTree &mdt, - bool IsPostRAFlag, - LiveIntervals *LIS = 0); - - virtual ~ScheduleDAGInstrs() {} - - /// NewSUnit - Creates a new SUnit and return a ptr to it. - /// - SUnit *NewSUnit(MachineInstr *MI) { -#ifndef NDEBUG - const SUnit *Addr = SUnits.empty() ? 0 : &SUnits[0]; -#endif - SUnits.push_back(SUnit(MI, (unsigned)SUnits.size())); - assert((Addr == 0 || Addr == &SUnits[0]) && - "SUnits std::vector reallocated on the fly!"); - SUnits.back().OrigNode = &SUnits.back(); - return &SUnits.back(); - } - - - /// Run - perform scheduling. - /// - void Run(MachineBasicBlock *bb, - MachineBasicBlock::iterator begin, - MachineBasicBlock::iterator end, - unsigned endindex); - - /// BuildSchedGraph - Build SUnits from the MachineBasicBlock that we are - /// input. - virtual void BuildSchedGraph(AliasAnalysis *AA); - - /// AddSchedBarrierDeps - Add dependencies from instructions in the current - /// list of instructions being scheduled to scheduling barrier. 
We want to - /// make sure instructions which define registers that are either used by - /// the terminator or are live-out are properly scheduled. This is - /// especially important when the definition latency of the return value(s) - /// are too high to be hidden by the branch or when the liveout registers - /// used by instructions in the fallthrough block. - void AddSchedBarrierDeps(); - - /// ComputeLatency - Compute node latency. - /// - virtual void ComputeLatency(SUnit *SU); - - /// ComputeOperandLatency - Override dependence edge latency using - /// operand use/def information - /// - virtual void ComputeOperandLatency(SUnit *Def, SUnit *Use, - SDep& dep) const; - - virtual MachineBasicBlock *EmitSchedule(); - - /// StartBlock - Prepare to perform scheduling in the given block. - /// - virtual void StartBlock(MachineBasicBlock *BB); - - /// Schedule - Order nodes according to selected style, filling - /// in the Sequence member. - /// - virtual void Schedule() = 0; - - /// FinishBlock - Clean up after scheduling in the given block. - /// - virtual void FinishBlock(); - - virtual void dumpNode(const SUnit *SU) const; - - virtual std::string getGraphNodeLabel(const SUnit *SU) const; - - protected: - SUnit *getSUnit(MachineInstr *MI) const { - DenseMap<MachineInstr*, SUnit*>::const_iterator I = MISUnitMap.find(MI); - if (I == MISUnitMap.end()) - return 0; - return I->second; - } - - void initSUnits(); - void addPhysRegDataDeps(SUnit *SU, const MachineOperand &MO); - void addPhysRegDeps(SUnit *SU, unsigned OperIdx); - void addVRegDefDeps(SUnit *SU, unsigned OperIdx); - void addVRegUseDeps(SUnit *SU, unsigned OperIdx); - - VReg2SUnitMap::iterator findVRegDef(unsigned VirtReg) { - return VRegDefs.find(TargetRegisterInfo::virtReg2Index(VirtReg)); - } - }; -} - -#endif diff --git a/lib/CodeGen/ScheduleDAGPrinter.cpp b/lib/CodeGen/ScheduleDAGPrinter.cpp index 4251583..38feee9 100644 --- a/lib/CodeGen/ScheduleDAGPrinter.cpp +++ b/lib/CodeGen/ScheduleDAGPrinter.cpp @@ -41,12 +41,12 @@ namespace llvm { static bool renderGraphFromBottomUp() { return true; } - + static bool hasNodeAddressLabel(const SUnit *Node, const ScheduleDAG *Graph) { return true; } - + /// If you want to override the dot attributes printed for a particular /// edge, override this method. static std::string getEdgeAttributes(const SUnit *Node, @@ -58,7 +58,7 @@ namespace llvm { return "color=blue,style=dashed"; return ""; } - + std::string getNodeLabel(const SUnit *Node, const ScheduleDAG *Graph); static std::string getNodeAttributes(const SUnit *N, @@ -81,18 +81,17 @@ std::string DOTGraphTraits<ScheduleDAG*>::getNodeLabel(const SUnit *SU, /// viewGraph - Pop up a ghostview window with the reachable parts of the DAG /// rendered using 'dot'. /// -void ScheduleDAG::viewGraph() { -// This code is only for debugging! +void ScheduleDAG::viewGraph(const Twine &Name, const Twine &Title) { + // This code is only for debugging! #ifndef NDEBUG - if (BB->getBasicBlock()) - ViewGraph(this, "dag." + MF.getFunction()->getName(), false, - "Scheduling-Units Graph for " + MF.getFunction()->getName() + - ":" + BB->getBasicBlock()->getName()); - else - ViewGraph(this, "dag." + MF.getFunction()->getName(), false, - "Scheduling-Units Graph for " + MF.getFunction()->getName()); + ViewGraph(this, Name, false, Title); #else errs() << "ScheduleDAG::viewGraph is only available in debug builds on " << "systems with Graphviz or gv!\n"; #endif // NDEBUG } + +/// Out-of-line implementation with no arguments is handy for gdb. 
+void ScheduleDAG::viewGraph() { + viewGraph(getDAGName(), "Scheduling-Units Graph for " + getDAGName()); +} diff --git a/lib/CodeGen/SelectionDAG/CMakeLists.txt b/lib/CodeGen/SelectionDAG/CMakeLists.txt index 9a79217..a6bdc3b 100644 --- a/lib/CodeGen/SelectionDAG/CMakeLists.txt +++ b/lib/CodeGen/SelectionDAG/CMakeLists.txt @@ -16,6 +16,7 @@ add_llvm_library(LLVMSelectionDAG ScheduleDAGSDNodes.cpp SelectionDAG.cpp SelectionDAGBuilder.cpp + SelectionDAGDumper.cpp SelectionDAGISel.cpp SelectionDAGPrinter.cpp ScheduleDAGVLIW.cpp diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 1b148ad..7c4db97 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -80,7 +80,7 @@ namespace { // visit, we pop off the order stack until we find an item that is // also in the contents set. All operations are O(log N). SmallPtrSet<SDNode*, 64> WorkListContents; - std::vector<SDNode*> WorkListOrder; + SmallVector<SDNode*, 64> WorkListOrder; // AA - Used for DAG load/store alias analysis. AliasAnalysis &AA; @@ -381,6 +381,7 @@ CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) { /// specified expression for the same cost as the expression itself, or 2 if we /// can compute the negated form more cheaply than the expression itself. static char isNegatibleForFree(SDValue Op, bool LegalOperations, + const TargetLowering &TLI, const TargetOptions *Options, unsigned Depth = 0) { // No compile time optimizations on this type. @@ -406,12 +407,17 @@ static char isNegatibleForFree(SDValue Op, bool LegalOperations, // FIXME: determine better conditions for this xform. if (!Options->UnsafeFPMath) return 0; + // After operation legalization, it might not be legal to create new FSUBs. + if (LegalOperations && + !TLI.isOperationLegalOrCustom(ISD::FSUB, Op.getValueType())) + return 0; + // fold (fsub (fadd A, B)) -> (fsub (fneg A), B) - if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, Options, - Depth + 1)) + if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI, + Options, Depth + 1)) return V; // fold (fneg (fadd A, B)) -> (fsub (fneg B), A) - return isNegatibleForFree(Op.getOperand(1), LegalOperations, Options, + return isNegatibleForFree(Op.getOperand(1), LegalOperations, TLI, Options, Depth + 1); case ISD::FSUB: // We can't turn -(A-B) into B-A when we honor signed zeros. 
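The hunk above threads the TargetLowering through isNegatibleForFree so the combiner can refuse to synthesize an FSUB once operation legalization has run, and the trailing context line states why the -(A-B) => B-A rewrite is gated in the first place: it is not sign-correct for IEEE zeros. A minimal standalone sketch (ordinary C++, not LLVM code; the values are illustrative only) of the signed-zero difference:

#include <cmath>
#include <cstdio>

int main() {
  double A = 1.5, B = 1.5;        // A == B, both finite
  double neg_sub = -(A - B);      // A - B is +0.0, so this is -0.0
  double swapped = B - A;         // +0.0
  std::printf("signbit(-(A-B)) = %d, signbit(B-A) = %d\n",
              std::signbit(neg_sub), std::signbit(swapped));
  std::printf("1/-(A-B) = %g, 1/(B-A) = %g\n", 1.0 / neg_sub, 1.0 / swapped);
  return 0;
}

The two results compare equal with ==, but the sign bit (and anything derived from it, such as the reciprocal) differs, which is why the fold is only performed when signed zeros need not be honored.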
@@ -425,17 +431,17 @@ static char isNegatibleForFree(SDValue Op, bool LegalOperations, if (Options->HonorSignDependentRoundingFPMath()) return 0; // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) or (fmul X, (fneg Y)) - if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, Options, - Depth + 1)) + if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI, + Options, Depth + 1)) return V; - return isNegatibleForFree(Op.getOperand(1), LegalOperations, Options, + return isNegatibleForFree(Op.getOperand(1), LegalOperations, TLI, Options, Depth + 1); case ISD::FP_EXTEND: case ISD::FP_ROUND: case ISD::FSIN: - return isNegatibleForFree(Op.getOperand(0), LegalOperations, Options, + return isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI, Options, Depth + 1); } } @@ -464,6 +470,7 @@ static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG, // fold (fneg (fadd A, B)) -> (fsub (fneg A), B) if (isNegatibleForFree(Op.getOperand(0), LegalOperations, + DAG.getTargetLoweringInfo(), &DAG.getTarget().Options, Depth+1)) return DAG.getNode(ISD::FSUB, Op.getDebugLoc(), Op.getValueType(), GetNegatedExpression(Op.getOperand(0), DAG, @@ -493,6 +500,7 @@ static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG, // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) if (isNegatibleForFree(Op.getOperand(0), LegalOperations, + DAG.getTargetLoweringInfo(), &DAG.getTarget().Options, Depth+1)) return DAG.getNode(Op.getOpcode(), Op.getDebugLoc(), Op.getValueType(), GetNegatedExpression(Op.getOperand(0), DAG, @@ -997,8 +1005,7 @@ void DAGCombiner::Run(CombineLevel AtLevel) { // worklist *should* contain, and check the node we want to visit is should // actually be visited. do { - N = WorkListOrder.back(); - WorkListOrder.pop_back(); + N = WorkListOrder.pop_back_val(); } while (!WorkListContents.erase(N)); // If N has no uses, it is dead. 
Make sure to revisit all N's operands once @@ -5507,11 +5514,13 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { N1CFP->getValueAPF().isZero()) return N0; // fold (fadd A, (fneg B)) -> (fsub A, B) - if (isNegatibleForFree(N1, LegalOperations, &DAG.getTarget().Options) == 2) + if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) && + isNegatibleForFree(N1, LegalOperations, TLI, &DAG.getTarget().Options) == 2) return DAG.getNode(ISD::FSUB, N->getDebugLoc(), VT, N0, GetNegatedExpression(N1, DAG, LegalOperations)); // fold (fadd (fneg A), B) -> (fsub B, A) - if (isNegatibleForFree(N0, LegalOperations, &DAG.getTarget().Options) == 2) + if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) && + isNegatibleForFree(N0, LegalOperations, TLI, &DAG.getTarget().Options) == 2) return DAG.getNode(ISD::FSUB, N->getDebugLoc(), VT, N1, GetNegatedExpression(N0, DAG, LegalOperations)); @@ -5549,16 +5558,33 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { // fold (fsub 0, B) -> -B if (DAG.getTarget().Options.UnsafeFPMath && N0CFP && N0CFP->getValueAPF().isZero()) { - if (isNegatibleForFree(N1, LegalOperations, &DAG.getTarget().Options)) + if (isNegatibleForFree(N1, LegalOperations, TLI, &DAG.getTarget().Options)) return GetNegatedExpression(N1, DAG, LegalOperations); if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT)) return DAG.getNode(ISD::FNEG, N->getDebugLoc(), VT, N1); } // fold (fsub A, (fneg B)) -> (fadd A, B) - if (isNegatibleForFree(N1, LegalOperations, &DAG.getTarget().Options)) + if (isNegatibleForFree(N1, LegalOperations, TLI, &DAG.getTarget().Options)) return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0, GetNegatedExpression(N1, DAG, LegalOperations)); + // If 'unsafe math' is enabled, fold + // (fsub x, (fadd x, y)) -> (fneg y) & + // (fsub x, (fadd y, x)) -> (fneg y) + if (DAG.getTarget().Options.UnsafeFPMath) { + if (N1.getOpcode() == ISD::FADD) { + SDValue N10 = N1->getOperand(0); + SDValue N11 = N1->getOperand(1); + + if (N10 == N0 && isNegatibleForFree(N11, LegalOperations, TLI, + &DAG.getTarget().Options)) + return GetNegatedExpression(N11, DAG, LegalOperations); + else if (N11 == N0 && isNegatibleForFree(N10, LegalOperations, TLI, + &DAG.getTarget().Options)) + return GetNegatedExpression(N10, DAG, LegalOperations); + } + } + return SDValue(); } @@ -5568,6 +5594,7 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); EVT VT = N->getValueType(0); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // fold vector ops if (VT.isVector()) { @@ -5598,9 +5625,9 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { return DAG.getNode(ISD::FNEG, N->getDebugLoc(), VT, N0); // fold (fmul (fneg X), (fneg Y)) -> (fmul X, Y) - if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, + if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, &DAG.getTarget().Options)) { - if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, + if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, &DAG.getTarget().Options)) { // Both can be negated for free, check to see if at least one is cheaper // negated. 
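The new visitFSUB code above performs (fsub x, (fadd x, y)) -> (fneg y) only under unsafe FP math because the intermediate fadd rounds. A small self-contained sketch (plain C++, not LLVM code; the constants are chosen only to make the rounding visible) of a case where the folded and unfolded forms disagree:

#include <cstdio>

int main() {
  float x = 1e8f;                // adjacent floats near 1e8 are 8 apart
  float y = 1.0f;
  float unfolded = x - (x + y);  // x + y rounds back to x, so this is 0.0f
  float folded   = -y;           // the combined form: -1.0f
  std::printf("x - (x + y) = %g   -y = %g\n", unfolded, folded);
  return 0;
}

With exact arithmetic both expressions equal -y, so the rewrite is valid only when the optimizer may reassociate FP operations, which is what the UnsafeFPMath check guarding the new fold expresses.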
@@ -5628,6 +5655,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); EVT VT = N->getValueType(0); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // fold vector ops if (VT.isVector()) { @@ -5641,9 +5669,9 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { // (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y) - if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, + if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, &DAG.getTarget().Options)) { - if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, + if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, &DAG.getTarget().Options)) { // Both can be negated for free, check to see if at least one is cheaper // negated. @@ -5897,7 +5925,8 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); - if (isNegatibleForFree(N0, LegalOperations, &DAG.getTarget().Options)) + if (isNegatibleForFree(N0, LegalOperations, DAG.getTargetLoweringInfo(), + &DAG.getTarget().Options)) return GetNegatedExpression(N0, DAG, LegalOperations); // Transform fneg(bitconvert(x)) -> bitconvert(x^sign) to avoid loading @@ -6129,8 +6158,7 @@ SDValue DAGCombiner::visitBR_CC(SDNode *N) { /// canFoldInAddressingMode - Return true if 'Use' is a load or a store that /// uses N as its base pointer and that N may be folded in the load / store -/// addressing mode. FIXME: This currently only looks for folding of -/// [reg +/- imm] addressing modes. +/// addressing mode. static bool canFoldInAddressingMode(SDNode *N, SDNode *Use, SelectionDAG &DAG, const TargetLowering &TLI) { @@ -6150,15 +6178,19 @@ static bool canFoldInAddressingMode(SDNode *N, SDNode *Use, if (N->getOpcode() == ISD::ADD) { ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1)); if (Offset) + // [reg +/- imm] AM.BaseOffs = Offset->getSExtValue(); else - return false; + // [reg +/- reg] + AM.Scale = 1; } else if (N->getOpcode() == ISD::SUB) { ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1)); if (Offset) + // [reg +/- imm] AM.BaseOffs = -Offset->getSExtValue(); else - return false; + // [reg +/- reg] + AM.Scale = 1; } else return false; @@ -7187,6 +7219,11 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { EVT ExtVT = VT.getVectorElementType(); EVT LVT = ExtVT; + // If the result of load has to be truncated, then it's not necessarily + // profitable. + if (NVT.bitsLT(LVT) && !TLI.isTruncateFree(LVT, NVT)) + return SDValue(); + if (InVec.getOpcode() == ISD::BITCAST) { // Don't duplicate a load with other uses. if (!InVec.hasOneUse()) @@ -7287,17 +7324,36 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { // Note that this replacement assumes that the extractvalue is the only // use of the load; that's okay because we don't want to perform this // transformation in other cases anyway. - SDValue Load = DAG.getLoad(LVT, N->getDebugLoc(), LN0->getChain(), NewPtr, - LN0->getPointerInfo().getWithOffset(PtrOff), - LN0->isVolatile(), LN0->isNonTemporal(), - LN0->isInvariant(), Align); + SDValue Load; + SDValue Chain; + if (NVT.bitsGT(LVT)) { + // If the result type of vextract is wider than the load, then issue an + // extending load instead. + ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD, LVT) + ? 
ISD::ZEXTLOAD : ISD::EXTLOAD; + Load = DAG.getExtLoad(ExtType, N->getDebugLoc(), NVT, LN0->getChain(), + NewPtr, LN0->getPointerInfo().getWithOffset(PtrOff), + LVT, LN0->isVolatile(), LN0->isNonTemporal(),Align); + Chain = Load.getValue(1); + } else { + Load = DAG.getLoad(LVT, N->getDebugLoc(), LN0->getChain(), NewPtr, + LN0->getPointerInfo().getWithOffset(PtrOff), + LN0->isVolatile(), LN0->isNonTemporal(), + LN0->isInvariant(), Align); + Chain = Load.getValue(1); + if (NVT.bitsLT(LVT)) + Load = DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), NVT, Load); + else + Load = DAG.getNode(ISD::BITCAST, N->getDebugLoc(), NVT, Load); + } WorkListRemover DeadNodes(*this); SDValue From[] = { SDValue(N, 0), SDValue(LN0,1) }; - SDValue To[] = { Load.getValue(0), Load.getValue(1) }; + SDValue To[] = { Load, Chain }; DAG.ReplaceAllUsesOfValuesWith(From, To, 2, &DeadNodes); // Since we're explcitly calling ReplaceAllUses, add the new node to the // worklist explicitly as well. AddToWorkList(Load.getNode()); + AddUsersToWorkList(Load.getNode()); // Add users too // Make sure to revisit this node to clean it up; it will usually be dead. AddToWorkList(N); return SDValue(N, 0); @@ -7367,6 +7423,8 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { // will be type-legalized to complex code sequences. // We perform this optimization only before the operation legalizer because we // may introduce illegal operations. + // Create a new simpler BUILD_VECTOR sequence which other optimizations can + // turn into a single shuffle instruction. if ((Level == AfterLegalizeVectorOps || Level == AfterLegalizeTypes) && ValidTypes) { bool isLE = TLI.isLittleEndian(); @@ -7407,6 +7465,8 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), VecVT, &Ops[0], Ops.size()); + // The new BUILD_VECTOR node has the potential to be further optimized. + AddToWorkList(BV.getNode()); // Bitcast to the desired type. return DAG.getNode(ISD::BITCAST, dl, N->getValueType(0), BV); } @@ -7414,6 +7474,12 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { // Check to see if this is a BUILD_VECTOR of a bunch of EXTRACT_VECTOR_ELT // operations. If so, and if the EXTRACT_VECTOR_ELT vector inputs come from // at most two distinct vectors, turn this into a shuffle node. + + // May only combine to shuffle after legalize if shuffle is legal. + if (LegalOperations && + !TLI.isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT)) + return SDValue(); + SDValue VecIn1, VecIn2; for (unsigned i = 0; i != NumInScalars; ++i) { // Ignore undef inputs. diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp index fd8ce78..9f4a44a 100644 --- a/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -136,15 +136,8 @@ unsigned FastISel::getRegForValue(const Value *V) { return 0; } - // Look up the value to see if we already have a register for it. We - // cache values defined by Instructions across blocks, and other values - // only locally. This is because Instructions already have the SSA - // def-dominates-use requirement enforced. - DenseMap<const Value *, unsigned>::iterator I = FuncInfo.ValueMap.find(V); - if (I != FuncInfo.ValueMap.end()) - return I->second; - - unsigned Reg = LocalValueMap[V]; + // Look up the value to see if we already have a register for it. 
+ unsigned Reg = lookUpRegForValue(V); if (Reg != 0) return Reg; @@ -199,7 +192,7 @@ unsigned FastISel::materializeRegForValue(const Value *V, MVT VT) { uint32_t IntBitWidth = IntVT.getSizeInBits(); bool isExact; (void) Flt.convertToInteger(x, IntBitWidth, /*isSigned=*/true, - APFloat::rmTowardZero, &isExact); + APFloat::rmTowardZero, &isExact); if (isExact) { APInt IntVal(IntBitWidth, x); @@ -577,12 +570,16 @@ bool FastISel::SelectCall(const User *I) { case Intrinsic::dbg_declare: { const DbgDeclareInst *DI = cast<DbgDeclareInst>(Call); if (!DIVariable(DI->getVariable()).Verify() || - !FuncInfo.MF->getMMI().hasDebugInfo()) + !FuncInfo.MF->getMMI().hasDebugInfo()) { + DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n"); return true; + } const Value *Address = DI->getAddress(); - if (!Address || isa<UndefValue>(Address) || isa<AllocaInst>(Address)) + if (!Address || isa<UndefValue>(Address)) { + DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n"); return true; + } unsigned Reg = 0; unsigned Offset = 0; @@ -590,16 +587,25 @@ bool FastISel::SelectCall(const User *I) { // Some arguments' frame index is recorded during argument lowering. Offset = FuncInfo.getArgumentFrameIndex(Arg); if (Offset) - Reg = TRI.getFrameRegister(*FuncInfo.MF); + Reg = TRI.getFrameRegister(*FuncInfo.MF); } if (!Reg) - Reg = getRegForValue(Address); + Reg = lookUpRegForValue(Address); + + if (!Reg && isa<Instruction>(Address) && + (!isa<AllocaInst>(Address) || + !FuncInfo.StaticAllocaMap.count(cast<AllocaInst>(Address)))) + Reg = FuncInfo.InitializeRegForValue(Address); if (Reg) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::DBG_VALUE)) .addReg(Reg, RegState::Debug).addImm(Offset) .addMetadata(DI->getVariable()); + else + // We can't yet handle anything else here because it would require + // generating code, thus altering codegen because of debug info. 
+ DEBUG(dbgs() << "Dropping debug info for " << DI); return true; } case Intrinsic::dbg_value: { diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 31df458..1b84b13 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -3032,6 +3032,16 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { Results.push_back(Results[0].getValue(1)); break; } + case ISD::FSUB: { + EVT VT = Node->getValueType(0); + assert(TLI.isOperationLegalOrCustom(ISD::FADD, VT) && + TLI.isOperationLegalOrCustom(ISD::FNEG, VT) && + "Don't know how to expand this FP subtraction!"); + Tmp1 = DAG.getNode(ISD::FNEG, dl, VT, Node->getOperand(1)); + Tmp1 = DAG.getNode(ISD::FADD, dl, VT, Node->getOperand(0), Tmp1); + Results.push_back(Tmp1); + break; + } case ISD::SUB: { EVT VT = Node->getValueType(0); assert(TLI.isOperationLegalOrCustom(ISD::ADD, VT) && @@ -3590,10 +3600,11 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { Tmp1, Tmp2, Node->getOperand(2))); break; } + case ISD::FDIV: case ISD::FPOW: { Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0)); Tmp2 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(1)); - Tmp3 = DAG.getNode(ISD::FPOW, dl, NVT, Tmp1, Tmp2); + Tmp3 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2); Results.push_back(DAG.getNode(ISD::FP_ROUND, dl, OVT, Tmp3, DAG.getIntPtrConstant(0))); break; diff --git a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp index 1a27f3f..ff0136e 100644 --- a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp +++ b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp @@ -470,7 +470,7 @@ signed ResourcePriorityQueue::SUSchedulingCost(SUnit *SU) { /// Main resource tracking point. -void ResourcePriorityQueue::ScheduledNode(SUnit *SU) { +void ResourcePriorityQueue::scheduledNode(SUnit *SU) { // Use NULL entry as an event marker to reset // the DFA state. if (!SU) { diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp index 34ee1f3..24da432 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp @@ -43,7 +43,7 @@ namespace { SmallVector<SUnit *, 16> Queue; bool empty() const { return Queue.empty(); } - + void push(SUnit *U) { Queue.push_back(U); } @@ -101,8 +101,8 @@ private: bool DelayForLiveRegsBottomUp(SUnit*, SmallVector<unsigned, 4>&); void ListScheduleBottomUp(); - /// ForceUnitLatencies - The fast scheduler doesn't care about real latencies. - bool ForceUnitLatencies() const { return true; } + /// forceUnitLatencies - The fast scheduler doesn't care about real latencies. + bool forceUnitLatencies() const { return true; } }; } // end anonymous namespace @@ -112,7 +112,7 @@ void ScheduleDAGFast::Schedule() { DEBUG(dbgs() << "********** List Scheduling **********\n"); NumLiveRegs = 0; - LiveRegDefs.resize(TRI->getNumRegs(), NULL); + LiveRegDefs.resize(TRI->getNumRegs(), NULL); LiveRegCycles.resize(TRI->getNumRegs(), 0); // Build the scheduling graph. @@ -159,7 +159,7 @@ void ScheduleDAGFast::ReleasePredecessors(SUnit *SU, unsigned CurCycle) { ReleasePred(SU, &*I); if (I->isAssignedRegDep()) { // This is a physical register dependency and it's impossible or - // expensive to copy the register. Make sure nothing that can + // expensive to copy the register. Make sure nothing that can // clobber the register is scheduled between the predecessor and // this node. 
if (!LiveRegDefs[I->getReg()]) { @@ -245,10 +245,10 @@ SUnit *ScheduleDAGFast::CopyAndMoveSuccessors(SUnit *SU) { DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), OldNumVals-1), SDValue(LoadNode, 1)); - SUnit *NewSU = NewSUnit(N); + SUnit *NewSU = newSUnit(N); assert(N->getNodeId() == -1 && "Node already inserted!"); N->setNodeId(NewSU->NodeNum); - + const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); for (unsigned i = 0; i != MCID.getNumOperands(); ++i) { if (MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1) { @@ -268,7 +268,7 @@ SUnit *ScheduleDAGFast::CopyAndMoveSuccessors(SUnit *SU) { LoadSU = &SUnits[LoadNode->getNodeId()]; isNewLoad = false; } else { - LoadSU = NewSUnit(LoadNode); + LoadSU = newSUnit(LoadNode); LoadNode->setNodeId(LoadSU->NodeNum); } @@ -329,7 +329,7 @@ SUnit *ScheduleDAGFast::CopyAndMoveSuccessors(SUnit *SU) { D.setSUnit(LoadSU); AddPred(SuccDep, D); } - } + } if (isNewLoad) { AddPred(NewSU, SDep(LoadSU, SDep::Order, LoadSU->Latency)); } @@ -381,11 +381,11 @@ void ScheduleDAGFast::InsertCopiesAndMoveSuccs(SUnit *SU, unsigned Reg, const TargetRegisterClass *DestRC, const TargetRegisterClass *SrcRC, SmallVector<SUnit*, 2> &Copies) { - SUnit *CopyFromSU = NewSUnit(static_cast<SDNode *>(NULL)); + SUnit *CopyFromSU = newSUnit(static_cast<SDNode *>(NULL)); CopyFromSU->CopySrcRC = SrcRC; CopyFromSU->CopyDstRC = DestRC; - SUnit *CopyToSU = NewSUnit(static_cast<SDNode *>(NULL)); + SUnit *CopyToSU = newSUnit(static_cast<SDNode *>(NULL)); CopyToSU->CopySrcRC = DestRC; CopyToSU->CopyDstRC = SrcRC; @@ -425,7 +425,7 @@ static EVT getPhysicalRegisterVT(SDNode *N, unsigned Reg, const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); assert(MCID.ImplicitDefs && "Physical reg def must be in implicit def list!"); unsigned NumRes = MCID.getNumDefs(); - for (const unsigned *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) { + for (const uint16_t *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) { if (Reg == *ImpDef) break; ++NumRes; @@ -508,7 +508,7 @@ bool ScheduleDAGFast::DelayForLiveRegsBottomUp(SUnit *SU, const MCInstrDesc &MCID = TII->get(Node->getMachineOpcode()); if (!MCID.ImplicitDefs) continue; - for (const unsigned *Reg = MCID.ImplicitDefs; *Reg; ++Reg) { + for (const uint16_t *Reg = MCID.getImplicitDefs(); *Reg; ++Reg) { CheckForLiveRegDef(SU, *Reg, LiveRegDefs, RegAdded, LRegs, TRI); } } @@ -630,7 +630,7 @@ void ScheduleDAGFast::ListScheduleBottomUp() { std::reverse(Sequence.begin(), Sequence.end()); #ifndef NDEBUG - VerifySchedule(/*isBottomUp=*/true); + VerifyScheduledSequence(/*isBottomUp=*/true); #endif } diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index 1017d36..f44adfc 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -232,7 +232,7 @@ private: /// Updates the topological ordering if required. SUnit *CreateNewSUnit(SDNode *N) { unsigned NumSUnits = SUnits.size(); - SUnit *NewNode = NewSUnit(N); + SUnit *NewNode = newSUnit(N); // Update the topological ordering. if (NewNode->NodeNum >= NumSUnits) Topo.InitDAGTopologicalSorting(); @@ -250,9 +250,9 @@ private: return NewNode; } - /// ForceUnitLatencies - Register-pressure-reducing scheduling doesn't + /// forceUnitLatencies - Register-pressure-reducing scheduling doesn't /// need actual latency information but the hybrid scheduler does. 
- bool ForceUnitLatencies() const { + bool forceUnitLatencies() const { return !NeedLatency; } }; @@ -327,6 +327,12 @@ void ScheduleDAGRRList::Schedule() { ListScheduleBottomUp(); AvailableQueue->releaseState(); + + DEBUG({ + dbgs() << "*** Final schedule ***\n"; + dumpSchedule(); + dbgs() << '\n'; + }); } //===----------------------------------------------------------------------===// @@ -348,7 +354,7 @@ void ScheduleDAGRRList::ReleasePred(SUnit *SU, const SDep *PredEdge) { #endif --PredSU->NumSuccsLeft; - if (!ForceUnitLatencies()) { + if (!forceUnitLatencies()) { // Updating predecessor's height. This is now the cycle when the // predecessor can be scheduled without causing a pipeline stall. PredSU->setHeightToAtLeast(SU->getHeight() + PredEdge->getLatency()); @@ -695,7 +701,7 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) { Sequence.push_back(SU); - AvailableQueue->ScheduledNode(SU); + AvailableQueue->scheduledNode(SU); // If HazardRec is disabled, and each inst counts as one cycle, then // advance CurCycle before ReleasePredecessors to avoid useless pushes to @@ -842,7 +848,7 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) { else { AvailableQueue->push(SU); } - AvailableQueue->UnscheduledNode(SU); + AvailableQueue->unscheduledNode(SU); } /// After backtracking, the hazard checker needs to be restored to a state @@ -963,7 +969,7 @@ SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) { LoadNode->setNodeId(LoadSU->NodeNum); InitNumRegDefsLeft(LoadSU); - ComputeLatency(LoadSU); + computeLatency(LoadSU); } SUnit *NewSU = CreateNewSUnit(N); @@ -981,7 +987,7 @@ SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) { NewSU->isCommutable = true; InitNumRegDefsLeft(NewSU); - ComputeLatency(NewSU); + computeLatency(NewSU); // Record all the edges to and from the old SU, by category. 
SmallVector<SDep, 4> ChainPreds; @@ -1160,7 +1166,7 @@ static EVT getPhysicalRegisterVT(SDNode *N, unsigned Reg, const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); assert(MCID.ImplicitDefs && "Physical reg def must be in implicit def list!"); unsigned NumRes = MCID.getNumDefs(); - for (const unsigned *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) { + for (const uint16_t *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) { if (Reg == *ImpDef) break; ++NumRes; @@ -1286,7 +1292,7 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVector<unsigned, 4> &LRegs) { const MCInstrDesc &MCID = TII->get(Node->getMachineOpcode()); if (!MCID.ImplicitDefs) continue; - for (const unsigned *Reg = MCID.ImplicitDefs; *Reg; ++Reg) + for (const uint16_t *Reg = MCID.getImplicitDefs(); *Reg; ++Reg) CheckForLiveRegDef(SU, *Reg, LiveRegDefs, RegAdded, LRegs, TRI); } @@ -1475,7 +1481,7 @@ void ScheduleDAGRRList::ListScheduleBottomUp() { std::reverse(Sequence.begin(), Sequence.end()); #ifndef NDEBUG - VerifySchedule(/*isBottomUp=*/true); + VerifyScheduledSequence(/*isBottomUp=*/true); #endif } @@ -1681,9 +1687,9 @@ public: int RegPressureDiff(SUnit *SU, unsigned &LiveUses) const; - void ScheduledNode(SUnit *SU); + void scheduledNode(SUnit *SU); - void UnscheduledNode(SUnit *SU); + void unscheduledNode(SUnit *SU); protected: bool canClobber(const SUnit *SU, const SUnit *Op); @@ -1984,7 +1990,7 @@ int RegReductionPQBase::RegPressureDiff(SUnit *SU, unsigned &LiveUses) const { return PDiff; } -void RegReductionPQBase::ScheduledNode(SUnit *SU) { +void RegReductionPQBase::scheduledNode(SUnit *SU) { if (!TracksRegPressure) return; @@ -2053,7 +2059,7 @@ void RegReductionPQBase::ScheduledNode(SUnit *SU) { dumpRegPressure(); } -void RegReductionPQBase::UnscheduledNode(SUnit *SU) { +void RegReductionPQBase::unscheduledNode(SUnit *SU) { if (!TracksRegPressure) return; @@ -2661,7 +2667,7 @@ static bool canClobberReachingPhysRegUse(const SUnit *DepSU, const SUnit *SU, ScheduleDAGRRList *scheduleDAG, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) { - const unsigned *ImpDefs + const uint16_t *ImpDefs = TII->get(SU->getNode()->getMachineOpcode()).getImplicitDefs(); const uint32_t *RegMask = getNodeRegMask(SU->getNode()); if(!ImpDefs && !RegMask) @@ -2680,7 +2686,7 @@ static bool canClobberReachingPhysRegUse(const SUnit *DepSU, const SUnit *SU, return true; if (ImpDefs) - for (const unsigned *ImpDef = ImpDefs; *ImpDef; ++ImpDef) + for (const uint16_t *ImpDef = ImpDefs; *ImpDef; ++ImpDef) // Return true if SU clobbers this physical register use and the // definition of the register reaches from DepSU. IsReachable queries // a topological forward sort of the DAG (following the successors). 
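The switch from const unsigned * to const uint16_t * in the hunks above (and in the earlier ScheduleDAGFast/ScheduleDAGRRList changes) only changes the element type of the implicit-def tables; the lists are still walked as 0-terminated arrays of register numbers. A toy illustration of the idiom (standalone C++; the register numbers are made up and do not correspond to any real target):

#include <cstdint>
#include <cstdio>

// Hypothetical implicit-def list: physical register numbers, 0-terminated.
static const uint16_t ImplicitDefs[] = { 3, 17, 42, 0 };

int main() {
  for (const uint16_t *ImpDef = ImplicitDefs; *ImpDef; ++ImpDef)
    std::printf("implicit def of reg %u\n", unsigned(*ImpDef));
  return 0;
}

Narrowing the element type shrinks the statically generated tables (16 bits comfortably holds a physical register number); the sentinel-terminated loop shape used throughout the diff is unchanged.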
@@ -2699,13 +2705,13 @@ static bool canClobberPhysRegDefs(const SUnit *SuccSU, const SUnit *SU, const TargetRegisterInfo *TRI) { SDNode *N = SuccSU->getNode(); unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs(); - const unsigned *ImpDefs = TII->get(N->getMachineOpcode()).getImplicitDefs(); + const uint16_t *ImpDefs = TII->get(N->getMachineOpcode()).getImplicitDefs(); assert(ImpDefs && "Caller should check hasPhysRegDefs"); for (const SDNode *SUNode = SU->getNode(); SUNode; SUNode = SUNode->getGluedNode()) { if (!SUNode->isMachineOpcode()) continue; - const unsigned *SUImpDefs = + const uint16_t *SUImpDefs = TII->get(SUNode->getMachineOpcode()).getImplicitDefs(); const uint32_t *SURegMask = getNodeRegMask(SUNode); if (!SUImpDefs && !SURegMask) diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index 71f07d6..69dd813 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -17,6 +17,8 @@ #include "ScheduleDAGSDNodes.h" #include "InstrEmitter.h" #include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/MC/MCInstrItineraries.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetInstrInfo.h" @@ -44,20 +46,26 @@ static cl::opt<int> HighLatencyCycles( "instructions take for targets with no itinerary")); ScheduleDAGSDNodes::ScheduleDAGSDNodes(MachineFunction &mf) - : ScheduleDAG(mf), + : ScheduleDAG(mf), BB(0), DAG(0), InstrItins(mf.getTarget().getInstrItineraryData()) {} /// Run - perform scheduling. /// -void ScheduleDAGSDNodes::Run(SelectionDAG *dag, MachineBasicBlock *bb, - MachineBasicBlock::iterator insertPos) { +void ScheduleDAGSDNodes::Run(SelectionDAG *dag, MachineBasicBlock *bb) { + BB = bb; DAG = dag; - ScheduleDAG::Run(bb, insertPos); + + // Clear the scheduler's SUnit DAG. + ScheduleDAG::clearDAG(); + Sequence.clear(); + + // Invoke the target's selection of scheduler. + Schedule(); } /// NewSUnit - Creates a new SUnit and return a ptr to it. /// -SUnit *ScheduleDAGSDNodes::NewSUnit(SDNode *N) { +SUnit *ScheduleDAGSDNodes::newSUnit(SDNode *N) { #ifndef NDEBUG const SUnit *Addr = 0; if (!SUnits.empty()) @@ -79,7 +87,7 @@ SUnit *ScheduleDAGSDNodes::NewSUnit(SDNode *N) { } SUnit *ScheduleDAGSDNodes::Clone(SUnit *Old) { - SUnit *SU = NewSUnit(Old->getNode()); + SUnit *SU = newSUnit(Old->getNode()); SU->OrigNode = Old->OrigNode; SU->Latency = Old->Latency; SU->isVRegCycle = Old->isVRegCycle; @@ -302,7 +310,7 @@ void ScheduleDAGSDNodes::BuildSchedUnits() { // If this node has already been processed, stop now. if (NI->getNodeId() != -1) continue; - SUnit *NodeSUnit = NewSUnit(NI); + SUnit *NodeSUnit = newSUnit(NI); // See if anything is glued to this node, if so, add them to glued // nodes. Nodes can have at most one glue input and one glue output. Glue @@ -360,7 +368,7 @@ void ScheduleDAGSDNodes::BuildSchedUnits() { InitNumRegDefsLeft(NodeSUnit); // Assign the Latency field of NodeSUnit using target-provided information. - ComputeLatency(NodeSUnit); + computeLatency(NodeSUnit); } // Find all call operands. @@ -382,7 +390,7 @@ void ScheduleDAGSDNodes::AddSchedEdges() { const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>(); // Check to see if the scheduler cares about latencies. - bool UnitLatencies = ForceUnitLatencies(); + bool UnitLatencies = forceUnitLatencies(); // Pass 2: add the preds, succs, etc. 
for (unsigned su = 0, e = SUnits.size(); su != e; ++su) { @@ -448,7 +456,7 @@ void ScheduleDAGSDNodes::AddSchedEdges() { const SDep &dep = SDep(OpSU, isChain ? SDep::Order : SDep::Data, OpLatency, PhysReg); if (!isChain && !UnitLatencies) { - ComputeOperandLatency(OpN, N, i, const_cast<SDep &>(dep)); + computeOperandLatency(OpN, N, i, const_cast<SDep &>(dep)); ST.adjustSchedDependency(OpSU, SU, const_cast<SDep &>(dep)); } @@ -541,7 +549,7 @@ void ScheduleDAGSDNodes::InitNumRegDefsLeft(SUnit *SU) { } } -void ScheduleDAGSDNodes::ComputeLatency(SUnit *SU) { +void ScheduleDAGSDNodes::computeLatency(SUnit *SU) { SDNode *N = SU->getNode(); // TokenFactor operands are considered zero latency, and some schedulers @@ -553,7 +561,7 @@ void ScheduleDAGSDNodes::ComputeLatency(SUnit *SU) { } // Check to see if the scheduler cares about latencies. - if (ForceUnitLatencies()) { + if (forceUnitLatencies()) { SU->Latency = 1; return; } @@ -575,10 +583,10 @@ void ScheduleDAGSDNodes::ComputeLatency(SUnit *SU) { SU->Latency += TII->getInstrLatency(InstrItins, N); } -void ScheduleDAGSDNodes::ComputeOperandLatency(SDNode *Def, SDNode *Use, +void ScheduleDAGSDNodes::computeOperandLatency(SDNode *Def, SDNode *Use, unsigned OpIdx, SDep& dep) const{ // Check to see if the scheduler cares about latencies. - if (ForceUnitLatencies()) + if (forceUnitLatencies()) return; if (dep.getKind() != SDep::Data) @@ -621,6 +629,30 @@ void ScheduleDAGSDNodes::dumpNode(const SUnit *SU) const { } } +void ScheduleDAGSDNodes::dumpSchedule() const { + for (unsigned i = 0, e = Sequence.size(); i != e; i++) { + if (SUnit *SU = Sequence[i]) + SU->dump(this); + else + dbgs() << "**** NOOP ****\n"; + } +} + +#ifndef NDEBUG +/// VerifyScheduledSequence - Verify that all SUnits were scheduled and that +/// their state is consistent with the nodes listed in Sequence. +/// +void ScheduleDAGSDNodes::VerifyScheduledSequence(bool isBottomUp) { + unsigned ScheduledNodes = ScheduleDAG::VerifyScheduledDAG(isBottomUp); + unsigned Noops = 0; + for (unsigned i = 0, e = Sequence.size(); i != e; ++i) + if (!Sequence[i]) + ++Noops; + assert(Sequence.size() - Noops == ScheduledNodes && + "The number of nodes scheduled doesn't match the expected number!"); +} +#endif // NDEBUG + namespace { struct OrderSorter { bool operator()(const std::pair<unsigned, MachineInstr*> &A, @@ -686,9 +718,48 @@ static void ProcessSourceNode(SDNode *N, SelectionDAG *DAG, ProcessSDDbgValues(N, DAG, Emitter, Orders, VRBaseMap, Order); } +void ScheduleDAGSDNodes:: +EmitPhysRegCopy(SUnit *SU, DenseMap<SUnit*, unsigned> &VRBaseMap, + MachineBasicBlock::iterator InsertPos) { + for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); + I != E; ++I) { + if (I->isCtrl()) continue; // ignore chain preds + if (I->getSUnit()->CopyDstRC) { + // Copy to physical register. + DenseMap<SUnit*, unsigned>::iterator VRI = VRBaseMap.find(I->getSUnit()); + assert(VRI != VRBaseMap.end() && "Node emitted out of order - late"); + // Find the destination physical register. + unsigned Reg = 0; + for (SUnit::const_succ_iterator II = SU->Succs.begin(), + EE = SU->Succs.end(); II != EE; ++II) { + if (II->isCtrl()) continue; // ignore chain preds + if (II->getReg()) { + Reg = II->getReg(); + break; + } + } + BuildMI(*BB, InsertPos, DebugLoc(), TII->get(TargetOpcode::COPY), Reg) + .addReg(VRI->second); + } else { + // Copy from physical register. 
+ assert(I->getReg() && "Unknown physical register!"); + unsigned VRBase = MRI.createVirtualRegister(SU->CopyDstRC); + bool isNew = VRBaseMap.insert(std::make_pair(SU, VRBase)).second; + (void)isNew; // Silence compiler warning. + assert(isNew && "Node emitted out of order - early"); + BuildMI(*BB, InsertPos, DebugLoc(), TII->get(TargetOpcode::COPY), VRBase) + .addReg(I->getReg()); + } + break; + } +} -/// EmitSchedule - Emit the machine code in scheduled order. -MachineBasicBlock *ScheduleDAGSDNodes::EmitSchedule() { +/// EmitSchedule - Emit the machine code in scheduled order. Return the new +/// InsertPos and MachineBasicBlock that contains this insertion +/// point. ScheduleDAGSDNodes holds a BB pointer for convenience, but this does +/// not necessarily refer to returned BB. The emitter may split blocks. +MachineBasicBlock *ScheduleDAGSDNodes:: +EmitSchedule(MachineBasicBlock::iterator &InsertPos) { InstrEmitter Emitter(BB, InsertPos); DenseMap<SDValue, unsigned> VRBaseMap; DenseMap<SUnit*, unsigned> CopyVRBaseMap; @@ -711,7 +782,7 @@ MachineBasicBlock *ScheduleDAGSDNodes::EmitSchedule() { SUnit *SU = Sequence[i]; if (!SU) { // Null SUnit* is a noop. - EmitNoop(); + TII->insertNoop(*Emitter.getBlock(), InsertPos); continue; } @@ -719,7 +790,7 @@ MachineBasicBlock *ScheduleDAGSDNodes::EmitSchedule() { // SDNode and any glued SDNodes and append them to the block. if (!SU->getNode()) { // Emit a copy. - EmitPhysRegCopy(SU, CopyVRBaseMap); + EmitPhysRegCopy(SU, CopyVRBaseMap, InsertPos); continue; } @@ -784,19 +855,24 @@ MachineBasicBlock *ScheduleDAGSDNodes::EmitSchedule() { } // Add trailing DbgValue's before the terminator. FIXME: May want to add // some of them before one or more conditional branches? + SmallVector<MachineInstr*, 8> DbgMIs; while (DI != DE) { - MachineBasicBlock *InsertBB = Emitter.getBlock(); - MachineBasicBlock::iterator Pos= Emitter.getBlock()->getFirstTerminator(); - if (!(*DI)->isInvalidated()) { - MachineInstr *DbgMI= Emitter.EmitDbgValue(*DI, VRBaseMap); - if (DbgMI) - InsertBB->insert(Pos, DbgMI); - } + if (!(*DI)->isInvalidated()) + if (MachineInstr *DbgMI = Emitter.EmitDbgValue(*DI, VRBaseMap)) + DbgMIs.push_back(DbgMI); ++DI; } + + MachineBasicBlock *InsertBB = Emitter.getBlock(); + MachineBasicBlock::iterator Pos = InsertBB->getFirstTerminator(); + InsertBB->insert(Pos, DbgMIs.begin(), DbgMIs.end()); } - BB = Emitter.getBlock(); InsertPos = Emitter.getInsertPos(); - return BB; + return Emitter.getBlock(); +} + +/// Return the basic block label. +std::string ScheduleDAGSDNodes::getDAGName() const { + return "sunit-dag." + BB->getFullName(); } diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h index 17b4901..75940ec 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h @@ -35,17 +35,20 @@ namespace llvm { /// class ScheduleDAGSDNodes : public ScheduleDAG { public: + MachineBasicBlock *BB; SelectionDAG *DAG; // DAG of the current basic block const InstrItineraryData *InstrItins; + /// The schedule. Null SUnit*'s represent noop instructions. + std::vector<SUnit*> Sequence; + explicit ScheduleDAGSDNodes(MachineFunction &mf); virtual ~ScheduleDAGSDNodes() {} /// Run - perform scheduling. /// - void Run(SelectionDAG *dag, MachineBasicBlock *bb, - MachineBasicBlock::iterator insertPos); + void Run(SelectionDAG *dag, MachineBasicBlock *bb); /// isPassiveNode - Return true if the node is a non-scheduled leaf. 
/// @@ -68,7 +71,7 @@ namespace llvm { /// NewSUnit - Creates a new SUnit and return a ptr to it. /// - SUnit *NewSUnit(SDNode *N); + SUnit *newSUnit(SDNode *N); /// Clone - Creates a clone of the specified SUnit. It does not copy the /// predecessors / successors info nor the temporary scheduling states. @@ -79,7 +82,7 @@ namespace llvm { /// are input. This SUnit graph is similar to the SelectionDAG, but /// excludes nodes that aren't interesting to scheduling, and represents /// flagged together nodes with a single SUnit. - virtual void BuildSchedGraph(AliasAnalysis *AA); + void BuildSchedGraph(AliasAnalysis *AA); /// InitVRegCycleFlag - Set isVRegCycle if this node's single use is /// CopyToReg and its only active data operands are CopyFromReg within a @@ -91,30 +94,41 @@ namespace llvm { /// void InitNumRegDefsLeft(SUnit *SU); - /// ComputeLatency - Compute node latency. + /// computeLatency - Compute node latency. /// - virtual void ComputeLatency(SUnit *SU); + virtual void computeLatency(SUnit *SU); - /// ComputeOperandLatency - Override dependence edge latency using + /// computeOperandLatency - Override dependence edge latency using /// operand use/def information /// - virtual void ComputeOperandLatency(SUnit *Def, SUnit *Use, + virtual void computeOperandLatency(SUnit *Def, SUnit *Use, SDep& dep) const { } - virtual void ComputeOperandLatency(SDNode *Def, SDNode *Use, + virtual void computeOperandLatency(SDNode *Def, SDNode *Use, unsigned OpIdx, SDep& dep) const; - virtual MachineBasicBlock *EmitSchedule(); - /// Schedule - Order nodes according to selected style, filling /// in the Sequence member. /// virtual void Schedule() = 0; + /// VerifyScheduledSequence - Verify that all SUnits are scheduled and + /// consistent with the Sequence of scheduled instructions. + void VerifyScheduledSequence(bool isBottomUp); + + /// EmitSchedule - Insert MachineInstrs into the MachineBasicBlock + /// according to the order specified in Sequence. + /// + MachineBasicBlock *EmitSchedule(MachineBasicBlock::iterator &InsertPos); + virtual void dumpNode(const SUnit *SU) const; + void dumpSchedule() const; + virtual std::string getGraphNodeLabel(const SUnit *SU) const; + virtual std::string getDAGName() const; + virtual void getCustomGraphFeatures(GraphWriter<ScheduleDAG*> &GW) const; /// RegDefIter - In place iteration over the values defined by an @@ -160,6 +174,9 @@ namespace llvm { /// BuildSchedUnits, AddSchedEdges - Helper functions for BuildSchedGraph. void BuildSchedUnits(); void AddSchedEdges(); + + void EmitPhysRegCopy(SUnit *SU, DenseMap<SUnit*, unsigned> &VRBaseMap, + MachineBasicBlock::iterator InsertPos); }; } diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp index 7d12509..c851291 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp @@ -158,7 +158,7 @@ void ScheduleDAGVLIW::scheduleNodeTopDown(SUnit *SU, unsigned CurCycle) { releaseSuccessors(SU); SU->isScheduled = true; - AvailableQueue->ScheduledNode(SU); + AvailableQueue->scheduledNode(SU); } /// listScheduleTopDown - The main loop of list scheduling for top-down @@ -202,7 +202,7 @@ void ScheduleDAGVLIW::listScheduleTopDown() { // don't advance the hazard recognizer. if (AvailableQueue->empty()) { // Reset DFA state. 
- AvailableQueue->ScheduledNode(0); + AvailableQueue->scheduledNode(0); ++CurCycle; continue; } @@ -261,7 +261,7 @@ void ScheduleDAGVLIW::listScheduleTopDown() { } #ifndef NDEBUG - VerifySchedule(/*isBottomUp=*/false); + VerifyScheduledSequence(/*isBottomUp=*/false); #endif } diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 796abf4..e3a7305 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -124,20 +124,29 @@ bool ISD::isBuildVectorAllOnes(const SDNode *N) { if (i == e) return false; // Do not accept build_vectors that aren't all constants or which have non-~0 - // elements. + // elements. We have to be a bit careful here, as the type of the constant + // may not be the same as the type of the vector elements due to type + // legalization (the elements are promoted to a legal type for the target and + // a vector of a type may be legal when the base element type is not). + // We only want to check enough bits to cover the vector elements, because + // we care if the resultant vector is all ones, not whether the individual + // constants are. SDValue NotZero = N->getOperand(i); + unsigned EltSize = N->getValueType(0).getVectorElementType().getSizeInBits(); if (isa<ConstantSDNode>(NotZero)) { - if (!cast<ConstantSDNode>(NotZero)->isAllOnesValue()) + if (cast<ConstantSDNode>(NotZero)->getAPIntValue().countTrailingOnes() < + EltSize) return false; } else if (isa<ConstantFPSDNode>(NotZero)) { - if (!cast<ConstantFPSDNode>(NotZero)->getValueAPF(). - bitcastToAPInt().isAllOnesValue()) + if (cast<ConstantFPSDNode>(NotZero)->getValueAPF() + .bitcastToAPInt().countTrailingOnes() < EltSize) return false; } else return false; // Okay, we have at least one ~0 value, check to see if the rest match or are - // undefs. + // undefs. Even with the above element type twiddling, this should be OK, as + // the same type legalization should have applied to all the elements. 
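// [Editor's note, not part of the patch] A minimal sketch of why the check
// above compares countTrailingOnes() against the element width instead of
// calling isAllOnesValue(): after type legalization the operands of a
// BUILD_VECTOR may be wider than the vector's element type, e.g. a <8 x i8>
// build_vector whose operands are i32 constants. Assuming only APInt from
// llvm/ADT/APInt.h, the intent of the new test is roughly:
//
//   static bool isAllOnesForElement(const APInt &V, unsigned EltSize) {
//     // Only the low EltSize bits belong to the vector element.
//     return V.countTrailingOnes() >= EltSize;
//   }
//
//   // isAllOnesForElement(APInt(32, 0xFF), 8)  -> true  (i8 element carried in an i32 constant)
//   // isAllOnesForElement(APInt(32, 0xFF), 32) -> false (genuinely not all ones as an i32)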
for (++i; i != e; ++i) if (N->getOperand(i) != NotZero && N->getOperand(i).getOpcode() != ISD::UNDEF) @@ -5904,571 +5913,6 @@ uint64_t SDNode::getConstantOperandVal(unsigned Num) const { return cast<ConstantSDNode>(OperandList[Num])->getZExtValue(); } -std::string SDNode::getOperationName(const SelectionDAG *G) const { - switch (getOpcode()) { - default: - if (getOpcode() < ISD::BUILTIN_OP_END) - return "<<Unknown DAG Node>>"; - if (isMachineOpcode()) { - if (G) - if (const TargetInstrInfo *TII = G->getTarget().getInstrInfo()) - if (getMachineOpcode() < TII->getNumOpcodes()) - return TII->getName(getMachineOpcode()); - return "<<Unknown Machine Node #" + utostr(getOpcode()) + ">>"; - } - if (G) { - const TargetLowering &TLI = G->getTargetLoweringInfo(); - const char *Name = TLI.getTargetNodeName(getOpcode()); - if (Name) return Name; - return "<<Unknown Target Node #" + utostr(getOpcode()) + ">>"; - } - return "<<Unknown Node #" + utostr(getOpcode()) + ">>"; - -#ifndef NDEBUG - case ISD::DELETED_NODE: - return "<<Deleted Node!>>"; -#endif - case ISD::PREFETCH: return "Prefetch"; - case ISD::MEMBARRIER: return "MemBarrier"; - case ISD::ATOMIC_FENCE: return "AtomicFence"; - case ISD::ATOMIC_CMP_SWAP: return "AtomicCmpSwap"; - case ISD::ATOMIC_SWAP: return "AtomicSwap"; - case ISD::ATOMIC_LOAD_ADD: return "AtomicLoadAdd"; - case ISD::ATOMIC_LOAD_SUB: return "AtomicLoadSub"; - case ISD::ATOMIC_LOAD_AND: return "AtomicLoadAnd"; - case ISD::ATOMIC_LOAD_OR: return "AtomicLoadOr"; - case ISD::ATOMIC_LOAD_XOR: return "AtomicLoadXor"; - case ISD::ATOMIC_LOAD_NAND: return "AtomicLoadNand"; - case ISD::ATOMIC_LOAD_MIN: return "AtomicLoadMin"; - case ISD::ATOMIC_LOAD_MAX: return "AtomicLoadMax"; - case ISD::ATOMIC_LOAD_UMIN: return "AtomicLoadUMin"; - case ISD::ATOMIC_LOAD_UMAX: return "AtomicLoadUMax"; - case ISD::ATOMIC_LOAD: return "AtomicLoad"; - case ISD::ATOMIC_STORE: return "AtomicStore"; - case ISD::PCMARKER: return "PCMarker"; - case ISD::READCYCLECOUNTER: return "ReadCycleCounter"; - case ISD::SRCVALUE: return "SrcValue"; - case ISD::MDNODE_SDNODE: return "MDNode"; - case ISD::EntryToken: return "EntryToken"; - case ISD::TokenFactor: return "TokenFactor"; - case ISD::AssertSext: return "AssertSext"; - case ISD::AssertZext: return "AssertZext"; - - case ISD::BasicBlock: return "BasicBlock"; - case ISD::VALUETYPE: return "ValueType"; - case ISD::Register: return "Register"; - case ISD::RegisterMask: return "RegisterMask"; - case ISD::Constant: return "Constant"; - case ISD::ConstantFP: return "ConstantFP"; - case ISD::GlobalAddress: return "GlobalAddress"; - case ISD::GlobalTLSAddress: return "GlobalTLSAddress"; - case ISD::FrameIndex: return "FrameIndex"; - case ISD::JumpTable: return "JumpTable"; - case ISD::GLOBAL_OFFSET_TABLE: return "GLOBAL_OFFSET_TABLE"; - case ISD::RETURNADDR: return "RETURNADDR"; - case ISD::FRAMEADDR: return "FRAMEADDR"; - case ISD::FRAME_TO_ARGS_OFFSET: return "FRAME_TO_ARGS_OFFSET"; - case ISD::EXCEPTIONADDR: return "EXCEPTIONADDR"; - case ISD::LSDAADDR: return "LSDAADDR"; - case ISD::EHSELECTION: return "EHSELECTION"; - case ISD::EH_RETURN: return "EH_RETURN"; - case ISD::EH_SJLJ_SETJMP: return "EH_SJLJ_SETJMP"; - case ISD::EH_SJLJ_LONGJMP: return "EH_SJLJ_LONGJMP"; - case ISD::ConstantPool: return "ConstantPool"; - case ISD::ExternalSymbol: return "ExternalSymbol"; - case ISD::BlockAddress: return "BlockAddress"; - case ISD::INTRINSIC_WO_CHAIN: - case ISD::INTRINSIC_VOID: - case ISD::INTRINSIC_W_CHAIN: { - unsigned OpNo = getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 
0 : 1; - unsigned IID = cast<ConstantSDNode>(getOperand(OpNo))->getZExtValue(); - if (IID < Intrinsic::num_intrinsics) - return Intrinsic::getName((Intrinsic::ID)IID); - else if (const TargetIntrinsicInfo *TII = G->getTarget().getIntrinsicInfo()) - return TII->getName(IID); - llvm_unreachable("Invalid intrinsic ID"); - } - - case ISD::BUILD_VECTOR: return "BUILD_VECTOR"; - case ISD::TargetConstant: return "TargetConstant"; - case ISD::TargetConstantFP:return "TargetConstantFP"; - case ISD::TargetGlobalAddress: return "TargetGlobalAddress"; - case ISD::TargetGlobalTLSAddress: return "TargetGlobalTLSAddress"; - case ISD::TargetFrameIndex: return "TargetFrameIndex"; - case ISD::TargetJumpTable: return "TargetJumpTable"; - case ISD::TargetConstantPool: return "TargetConstantPool"; - case ISD::TargetExternalSymbol: return "TargetExternalSymbol"; - case ISD::TargetBlockAddress: return "TargetBlockAddress"; - - case ISD::CopyToReg: return "CopyToReg"; - case ISD::CopyFromReg: return "CopyFromReg"; - case ISD::UNDEF: return "undef"; - case ISD::MERGE_VALUES: return "merge_values"; - case ISD::INLINEASM: return "inlineasm"; - case ISD::EH_LABEL: return "eh_label"; - case ISD::HANDLENODE: return "handlenode"; - - // Unary operators - case ISD::FABS: return "fabs"; - case ISD::FNEG: return "fneg"; - case ISD::FSQRT: return "fsqrt"; - case ISD::FSIN: return "fsin"; - case ISD::FCOS: return "fcos"; - case ISD::FTRUNC: return "ftrunc"; - case ISD::FFLOOR: return "ffloor"; - case ISD::FCEIL: return "fceil"; - case ISD::FRINT: return "frint"; - case ISD::FNEARBYINT: return "fnearbyint"; - case ISD::FEXP: return "fexp"; - case ISD::FEXP2: return "fexp2"; - case ISD::FLOG: return "flog"; - case ISD::FLOG2: return "flog2"; - case ISD::FLOG10: return "flog10"; - - // Binary operators - case ISD::ADD: return "add"; - case ISD::SUB: return "sub"; - case ISD::MUL: return "mul"; - case ISD::MULHU: return "mulhu"; - case ISD::MULHS: return "mulhs"; - case ISD::SDIV: return "sdiv"; - case ISD::UDIV: return "udiv"; - case ISD::SREM: return "srem"; - case ISD::UREM: return "urem"; - case ISD::SMUL_LOHI: return "smul_lohi"; - case ISD::UMUL_LOHI: return "umul_lohi"; - case ISD::SDIVREM: return "sdivrem"; - case ISD::UDIVREM: return "udivrem"; - case ISD::AND: return "and"; - case ISD::OR: return "or"; - case ISD::XOR: return "xor"; - case ISD::SHL: return "shl"; - case ISD::SRA: return "sra"; - case ISD::SRL: return "srl"; - case ISD::ROTL: return "rotl"; - case ISD::ROTR: return "rotr"; - case ISD::FADD: return "fadd"; - case ISD::FSUB: return "fsub"; - case ISD::FMUL: return "fmul"; - case ISD::FDIV: return "fdiv"; - case ISD::FMA: return "fma"; - case ISD::FREM: return "frem"; - case ISD::FCOPYSIGN: return "fcopysign"; - case ISD::FGETSIGN: return "fgetsign"; - case ISD::FPOW: return "fpow"; - - case ISD::FPOWI: return "fpowi"; - case ISD::SETCC: return "setcc"; - case ISD::SELECT: return "select"; - case ISD::VSELECT: return "vselect"; - case ISD::SELECT_CC: return "select_cc"; - case ISD::INSERT_VECTOR_ELT: return "insert_vector_elt"; - case ISD::EXTRACT_VECTOR_ELT: return "extract_vector_elt"; - case ISD::CONCAT_VECTORS: return "concat_vectors"; - case ISD::INSERT_SUBVECTOR: return "insert_subvector"; - case ISD::EXTRACT_SUBVECTOR: return "extract_subvector"; - case ISD::SCALAR_TO_VECTOR: return "scalar_to_vector"; - case ISD::VECTOR_SHUFFLE: return "vector_shuffle"; - case ISD::CARRY_FALSE: return "carry_false"; - case ISD::ADDC: return "addc"; - case ISD::ADDE: return "adde"; - case ISD::SADDO: return "saddo"; 
- case ISD::UADDO: return "uaddo"; - case ISD::SSUBO: return "ssubo"; - case ISD::USUBO: return "usubo"; - case ISD::SMULO: return "smulo"; - case ISD::UMULO: return "umulo"; - case ISD::SUBC: return "subc"; - case ISD::SUBE: return "sube"; - case ISD::SHL_PARTS: return "shl_parts"; - case ISD::SRA_PARTS: return "sra_parts"; - case ISD::SRL_PARTS: return "srl_parts"; - - // Conversion operators. - case ISD::SIGN_EXTEND: return "sign_extend"; - case ISD::ZERO_EXTEND: return "zero_extend"; - case ISD::ANY_EXTEND: return "any_extend"; - case ISD::SIGN_EXTEND_INREG: return "sign_extend_inreg"; - case ISD::TRUNCATE: return "truncate"; - case ISD::FP_ROUND: return "fp_round"; - case ISD::FLT_ROUNDS_: return "flt_rounds"; - case ISD::FP_ROUND_INREG: return "fp_round_inreg"; - case ISD::FP_EXTEND: return "fp_extend"; - - case ISD::SINT_TO_FP: return "sint_to_fp"; - case ISD::UINT_TO_FP: return "uint_to_fp"; - case ISD::FP_TO_SINT: return "fp_to_sint"; - case ISD::FP_TO_UINT: return "fp_to_uint"; - case ISD::BITCAST: return "bitcast"; - case ISD::FP16_TO_FP32: return "fp16_to_fp32"; - case ISD::FP32_TO_FP16: return "fp32_to_fp16"; - - case ISD::CONVERT_RNDSAT: { - switch (cast<CvtRndSatSDNode>(this)->getCvtCode()) { - default: llvm_unreachable("Unknown cvt code!"); - case ISD::CVT_FF: return "cvt_ff"; - case ISD::CVT_FS: return "cvt_fs"; - case ISD::CVT_FU: return "cvt_fu"; - case ISD::CVT_SF: return "cvt_sf"; - case ISD::CVT_UF: return "cvt_uf"; - case ISD::CVT_SS: return "cvt_ss"; - case ISD::CVT_SU: return "cvt_su"; - case ISD::CVT_US: return "cvt_us"; - case ISD::CVT_UU: return "cvt_uu"; - } - } - - // Control flow instructions - case ISD::BR: return "br"; - case ISD::BRIND: return "brind"; - case ISD::BR_JT: return "br_jt"; - case ISD::BRCOND: return "brcond"; - case ISD::BR_CC: return "br_cc"; - case ISD::CALLSEQ_START: return "callseq_start"; - case ISD::CALLSEQ_END: return "callseq_end"; - - // Other operators - case ISD::LOAD: return "load"; - case ISD::STORE: return "store"; - case ISD::VAARG: return "vaarg"; - case ISD::VACOPY: return "vacopy"; - case ISD::VAEND: return "vaend"; - case ISD::VASTART: return "vastart"; - case ISD::DYNAMIC_STACKALLOC: return "dynamic_stackalloc"; - case ISD::EXTRACT_ELEMENT: return "extract_element"; - case ISD::BUILD_PAIR: return "build_pair"; - case ISD::STACKSAVE: return "stacksave"; - case ISD::STACKRESTORE: return "stackrestore"; - case ISD::TRAP: return "trap"; - - // Bit manipulation - case ISD::BSWAP: return "bswap"; - case ISD::CTPOP: return "ctpop"; - case ISD::CTTZ: return "cttz"; - case ISD::CTTZ_ZERO_UNDEF: return "cttz_zero_undef"; - case ISD::CTLZ: return "ctlz"; - case ISD::CTLZ_ZERO_UNDEF: return "ctlz_zero_undef"; - - // Trampolines - case ISD::INIT_TRAMPOLINE: return "init_trampoline"; - case ISD::ADJUST_TRAMPOLINE: return "adjust_trampoline"; - - case ISD::CONDCODE: - switch (cast<CondCodeSDNode>(this)->get()) { - default: llvm_unreachable("Unknown setcc condition!"); - case ISD::SETOEQ: return "setoeq"; - case ISD::SETOGT: return "setogt"; - case ISD::SETOGE: return "setoge"; - case ISD::SETOLT: return "setolt"; - case ISD::SETOLE: return "setole"; - case ISD::SETONE: return "setone"; - - case ISD::SETO: return "seto"; - case ISD::SETUO: return "setuo"; - case ISD::SETUEQ: return "setue"; - case ISD::SETUGT: return "setugt"; - case ISD::SETUGE: return "setuge"; - case ISD::SETULT: return "setult"; - case ISD::SETULE: return "setule"; - case ISD::SETUNE: return "setune"; - - case ISD::SETEQ: return "seteq"; - case ISD::SETGT: return 
"setgt"; - case ISD::SETGE: return "setge"; - case ISD::SETLT: return "setlt"; - case ISD::SETLE: return "setle"; - case ISD::SETNE: return "setne"; - - case ISD::SETTRUE: return "settrue"; - case ISD::SETTRUE2: return "settrue2"; - case ISD::SETFALSE: return "setfalse"; - case ISD::SETFALSE2: return "setfalse2"; - } - } -} - -const char *SDNode::getIndexedModeName(ISD::MemIndexedMode AM) { - switch (AM) { - default: - return ""; - case ISD::PRE_INC: - return "<pre-inc>"; - case ISD::PRE_DEC: - return "<pre-dec>"; - case ISD::POST_INC: - return "<post-inc>"; - case ISD::POST_DEC: - return "<post-dec>"; - } -} - -std::string ISD::ArgFlagsTy::getArgFlagsString() { - std::string S = "< "; - - if (isZExt()) - S += "zext "; - if (isSExt()) - S += "sext "; - if (isInReg()) - S += "inreg "; - if (isSRet()) - S += "sret "; - if (isByVal()) - S += "byval "; - if (isNest()) - S += "nest "; - if (getByValAlign()) - S += "byval-align:" + utostr(getByValAlign()) + " "; - if (getOrigAlign()) - S += "orig-align:" + utostr(getOrigAlign()) + " "; - if (getByValSize()) - S += "byval-size:" + utostr(getByValSize()) + " "; - return S + ">"; -} - -void SDNode::dump() const { dump(0); } -void SDNode::dump(const SelectionDAG *G) const { - print(dbgs(), G); - dbgs() << '\n'; -} - -void SDNode::print_types(raw_ostream &OS, const SelectionDAG *G) const { - OS << (void*)this << ": "; - - for (unsigned i = 0, e = getNumValues(); i != e; ++i) { - if (i) OS << ","; - if (getValueType(i) == MVT::Other) - OS << "ch"; - else - OS << getValueType(i).getEVTString(); - } - OS << " = " << getOperationName(G); -} - -void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { - if (const MachineSDNode *MN = dyn_cast<MachineSDNode>(this)) { - if (!MN->memoperands_empty()) { - OS << "<"; - OS << "Mem:"; - for (MachineSDNode::mmo_iterator i = MN->memoperands_begin(), - e = MN->memoperands_end(); i != e; ++i) { - OS << **i; - if (llvm::next(i) != e) - OS << " "; - } - OS << ">"; - } - } else if (const ShuffleVectorSDNode *SVN = - dyn_cast<ShuffleVectorSDNode>(this)) { - OS << "<"; - for (unsigned i = 0, e = ValueList[0].getVectorNumElements(); i != e; ++i) { - int Idx = SVN->getMaskElt(i); - if (i) OS << ","; - if (Idx < 0) - OS << "u"; - else - OS << Idx; - } - OS << ">"; - } else if (const ConstantSDNode *CSDN = dyn_cast<ConstantSDNode>(this)) { - OS << '<' << CSDN->getAPIntValue() << '>'; - } else if (const ConstantFPSDNode *CSDN = dyn_cast<ConstantFPSDNode>(this)) { - if (&CSDN->getValueAPF().getSemantics()==&APFloat::IEEEsingle) - OS << '<' << CSDN->getValueAPF().convertToFloat() << '>'; - else if (&CSDN->getValueAPF().getSemantics()==&APFloat::IEEEdouble) - OS << '<' << CSDN->getValueAPF().convertToDouble() << '>'; - else { - OS << "<APFloat("; - CSDN->getValueAPF().bitcastToAPInt().dump(); - OS << ")>"; - } - } else if (const GlobalAddressSDNode *GADN = - dyn_cast<GlobalAddressSDNode>(this)) { - int64_t offset = GADN->getOffset(); - OS << '<'; - WriteAsOperand(OS, GADN->getGlobal()); - OS << '>'; - if (offset > 0) - OS << " + " << offset; - else - OS << " " << offset; - if (unsigned int TF = GADN->getTargetFlags()) - OS << " [TF=" << TF << ']'; - } else if (const FrameIndexSDNode *FIDN = dyn_cast<FrameIndexSDNode>(this)) { - OS << "<" << FIDN->getIndex() << ">"; - } else if (const JumpTableSDNode *JTDN = dyn_cast<JumpTableSDNode>(this)) { - OS << "<" << JTDN->getIndex() << ">"; - if (unsigned int TF = JTDN->getTargetFlags()) - OS << " [TF=" << TF << ']'; - } else if (const ConstantPoolSDNode *CP = 
dyn_cast<ConstantPoolSDNode>(this)){ - int offset = CP->getOffset(); - if (CP->isMachineConstantPoolEntry()) - OS << "<" << *CP->getMachineCPVal() << ">"; - else - OS << "<" << *CP->getConstVal() << ">"; - if (offset > 0) - OS << " + " << offset; - else - OS << " " << offset; - if (unsigned int TF = CP->getTargetFlags()) - OS << " [TF=" << TF << ']'; - } else if (const BasicBlockSDNode *BBDN = dyn_cast<BasicBlockSDNode>(this)) { - OS << "<"; - const Value *LBB = (const Value*)BBDN->getBasicBlock()->getBasicBlock(); - if (LBB) - OS << LBB->getName() << " "; - OS << (const void*)BBDN->getBasicBlock() << ">"; - } else if (const RegisterSDNode *R = dyn_cast<RegisterSDNode>(this)) { - OS << ' ' << PrintReg(R->getReg(), G ? G->getTarget().getRegisterInfo() :0); - } else if (const ExternalSymbolSDNode *ES = - dyn_cast<ExternalSymbolSDNode>(this)) { - OS << "'" << ES->getSymbol() << "'"; - if (unsigned int TF = ES->getTargetFlags()) - OS << " [TF=" << TF << ']'; - } else if (const SrcValueSDNode *M = dyn_cast<SrcValueSDNode>(this)) { - if (M->getValue()) - OS << "<" << M->getValue() << ">"; - else - OS << "<null>"; - } else if (const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(this)) { - if (MD->getMD()) - OS << "<" << MD->getMD() << ">"; - else - OS << "<null>"; - } else if (const VTSDNode *N = dyn_cast<VTSDNode>(this)) { - OS << ":" << N->getVT().getEVTString(); - } - else if (const LoadSDNode *LD = dyn_cast<LoadSDNode>(this)) { - OS << "<" << *LD->getMemOperand(); - - bool doExt = true; - switch (LD->getExtensionType()) { - default: doExt = false; break; - case ISD::EXTLOAD: OS << ", anyext"; break; - case ISD::SEXTLOAD: OS << ", sext"; break; - case ISD::ZEXTLOAD: OS << ", zext"; break; - } - if (doExt) - OS << " from " << LD->getMemoryVT().getEVTString(); - - const char *AM = getIndexedModeName(LD->getAddressingMode()); - if (*AM) - OS << ", " << AM; - - OS << ">"; - } else if (const StoreSDNode *ST = dyn_cast<StoreSDNode>(this)) { - OS << "<" << *ST->getMemOperand(); - - if (ST->isTruncatingStore()) - OS << ", trunc to " << ST->getMemoryVT().getEVTString(); - - const char *AM = getIndexedModeName(ST->getAddressingMode()); - if (*AM) - OS << ", " << AM; - - OS << ">"; - } else if (const MemSDNode* M = dyn_cast<MemSDNode>(this)) { - OS << "<" << *M->getMemOperand() << ">"; - } else if (const BlockAddressSDNode *BA = - dyn_cast<BlockAddressSDNode>(this)) { - OS << "<"; - WriteAsOperand(OS, BA->getBlockAddress()->getFunction(), false); - OS << ", "; - WriteAsOperand(OS, BA->getBlockAddress()->getBasicBlock(), false); - OS << ">"; - if (unsigned int TF = BA->getTargetFlags()) - OS << " [TF=" << TF << ']'; - } - - if (G) - if (unsigned Order = G->GetOrdering(this)) - OS << " [ORD=" << Order << ']'; - - if (getNodeId() != -1) - OS << " [ID=" << getNodeId() << ']'; - - DebugLoc dl = getDebugLoc(); - if (G && !dl.isUnknown()) { - DIScope - Scope(dl.getScope(G->getMachineFunction().getFunction()->getContext())); - OS << " dbg:"; - // Omit the directory, since it's usually long and uninteresting. 
- if (Scope.Verify()) - OS << Scope.getFilename(); - else - OS << "<unknown>"; - OS << ':' << dl.getLine(); - if (dl.getCol() != 0) - OS << ':' << dl.getCol(); - } -} - -void SDNode::print(raw_ostream &OS, const SelectionDAG *G) const { - print_types(OS, G); - for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { - if (i) OS << ", "; else OS << " "; - OS << (void*)getOperand(i).getNode(); - if (unsigned RN = getOperand(i).getResNo()) - OS << ":" << RN; - } - print_details(OS, G); -} - -static void printrWithDepthHelper(raw_ostream &OS, const SDNode *N, - const SelectionDAG *G, unsigned depth, - unsigned indent) { - if (depth == 0) - return; - - OS.indent(indent); - - N->print(OS, G); - - if (depth < 1) - return; - - for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { - // Don't follow chain operands. - if (N->getOperand(i).getValueType() == MVT::Other) - continue; - OS << '\n'; - printrWithDepthHelper(OS, N->getOperand(i).getNode(), G, depth-1, indent+2); - } -} - -void SDNode::printrWithDepth(raw_ostream &OS, const SelectionDAG *G, - unsigned depth) const { - printrWithDepthHelper(OS, this, G, depth, 0); -} - -void SDNode::printrFull(raw_ostream &OS, const SelectionDAG *G) const { - // Don't print impossibly deep things. - printrWithDepth(OS, G, 10); -} - -void SDNode::dumprWithDepth(const SelectionDAG *G, unsigned depth) const { - printrWithDepth(dbgs(), G, depth); -} - -void SDNode::dumprFull(const SelectionDAG *G) const { - // Don't print impossibly deep things. - dumprWithDepth(G, 10); -} - -static void DumpNodes(const SDNode *N, unsigned indent, const SelectionDAG *G) { - for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) - if (N->getOperand(i).getNode()->hasOneUse()) - DumpNodes(N->getOperand(i).getNode(), indent+2, G); - else - dbgs() << "\n" << std::string(indent+2, ' ') - << (void*)N->getOperand(i).getNode() << ": <multiple use>"; - - - dbgs() << "\n"; - dbgs().indent(indent); - N->dump(G); -} - SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) { assert(N->getNumValues() == 1 && "Can't unroll a vector with multiple results!"); @@ -6625,74 +6069,6 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const { return 0; } -void SelectionDAG::dump() const { - dbgs() << "SelectionDAG has " << AllNodes.size() << " nodes:"; - - for (allnodes_const_iterator I = allnodes_begin(), E = allnodes_end(); - I != E; ++I) { - const SDNode *N = I; - if (!N->hasOneUse() && N != getRoot().getNode()) - DumpNodes(N, 2, this); - } - - if (getRoot().getNode()) DumpNodes(getRoot().getNode(), 2, this); - - dbgs() << "\n\n"; -} - -void SDNode::printr(raw_ostream &OS, const SelectionDAG *G) const { - print_types(OS, G); - print_details(OS, G); -} - -typedef SmallPtrSet<const SDNode *, 128> VisitedSDNodeSet; -static void DumpNodesr(raw_ostream &OS, const SDNode *N, unsigned indent, - const SelectionDAG *G, VisitedSDNodeSet &once) { - if (!once.insert(N)) // If we've been here before, return now. - return; - - // Dump the current SDNode, but don't end the line yet. - OS.indent(indent); - N->printr(OS, G); - - // Having printed this SDNode, walk the children: - for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { - const SDNode *child = N->getOperand(i).getNode(); - - if (i) OS << ","; - OS << " "; - - if (child->getNumOperands() == 0) { - // This child has no grandchildren; print it inline right here. - child->printr(OS, G); - once.insert(child); - } else { // Just the address. FIXME: also print the child's opcode. 
- OS << (void*)child; - if (unsigned RN = N->getOperand(i).getResNo()) - OS << ":" << RN; - } - } - - OS << "\n"; - - // Dump children that have grandchildren on their own line(s). - for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { - const SDNode *child = N->getOperand(i).getNode(); - DumpNodesr(OS, child, indent+2, G, once); - } -} - -void SDNode::dumpr() const { - VisitedSDNodeSet once; - DumpNodesr(dbgs(), this, 0, 0, once); -} - -void SDNode::dumpr(const SelectionDAG *G) const { - VisitedSDNodeSet once; - DumpNodesr(dbgs(), this, 0, G, once); -} - - // getAddressSpace - Return the address space this GlobalAddress belongs to. unsigned GlobalAddressSDNode::getAddressSpace() const { return getGlobal()->getType()->getAddressSpace(); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 4e4aa11..2ac9655 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -2411,14 +2411,14 @@ size_t SelectionDAGBuilder::Clusterify(CaseVector& Cases, BranchProbabilityInfo *BPI = FuncInfo.BPI; // Start with "simple" cases - for (size_t i = 0; i < SI.getNumCases(); ++i) { - BasicBlock *SuccBB = SI.getCaseSuccessor(i); + for (SwitchInst::ConstCaseIt i = SI.case_begin(), e = SI.case_end(); + i != e; ++i) { + const BasicBlock *SuccBB = i.getCaseSuccessor(); MachineBasicBlock *SMBB = FuncInfo.MBBMap[SuccBB]; uint32_t ExtraWeight = BPI ? BPI->getEdgeWeight(SI.getParent(), SuccBB) : 0; - Cases.push_back(Case(SI.getCaseValue(i), - SI.getCaseValue(i), + Cases.push_back(Case(i.getCaseValue(), i.getCaseValue(), SMBB, ExtraWeight)); } std::sort(Cases.begin(), Cases.end(), CaseCmp()); @@ -4561,8 +4561,10 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { const DbgDeclareInst &DI = cast<DbgDeclareInst>(I); MDNode *Variable = DI.getVariable(); const Value *Address = DI.getAddress(); - if (!Address || !DIVariable(Variable).Verify()) + if (!Address || !DIVariable(Variable).Verify()) { + DEBUG(dbgs() << "Dropping debug info for " << DI << "\n"); return 0; + } // Build an entry in DbgOrdering. Debug info input nodes get an SDNodeOrder // but do not always have a corresponding SDNode built. The SDNodeOrder diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp new file mode 100644 index 0000000..f981afb --- /dev/null +++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -0,0 +1,631 @@ +//===-- SelectionDAGDumper.cpp - Implement SelectionDAG::dump() -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This implements the SelectionDAG::dump method and friends. 
+// +//===----------------------------------------------------------------------===// + +#include "ScheduleDAGSDNodes.h" +#include "llvm/Function.h" +#include "llvm/Intrinsics.h" +#include "llvm/Assembly/Writer.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/Analysis/DebugInfo.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetIntrinsicInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/GraphWriter.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/StringExtras.h" +using namespace llvm; + +std::string SDNode::getOperationName(const SelectionDAG *G) const { + switch (getOpcode()) { + default: + if (getOpcode() < ISD::BUILTIN_OP_END) + return "<<Unknown DAG Node>>"; + if (isMachineOpcode()) { + if (G) + if (const TargetInstrInfo *TII = G->getTarget().getInstrInfo()) + if (getMachineOpcode() < TII->getNumOpcodes()) + return TII->getName(getMachineOpcode()); + return "<<Unknown Machine Node #" + utostr(getOpcode()) + ">>"; + } + if (G) { + const TargetLowering &TLI = G->getTargetLoweringInfo(); + const char *Name = TLI.getTargetNodeName(getOpcode()); + if (Name) return Name; + return "<<Unknown Target Node #" + utostr(getOpcode()) + ">>"; + } + return "<<Unknown Node #" + utostr(getOpcode()) + ">>"; + +#ifndef NDEBUG + case ISD::DELETED_NODE: return "<<Deleted Node!>>"; +#endif + case ISD::PREFETCH: return "Prefetch"; + case ISD::MEMBARRIER: return "MemBarrier"; + case ISD::ATOMIC_FENCE: return "AtomicFence"; + case ISD::ATOMIC_CMP_SWAP: return "AtomicCmpSwap"; + case ISD::ATOMIC_SWAP: return "AtomicSwap"; + case ISD::ATOMIC_LOAD_ADD: return "AtomicLoadAdd"; + case ISD::ATOMIC_LOAD_SUB: return "AtomicLoadSub"; + case ISD::ATOMIC_LOAD_AND: return "AtomicLoadAnd"; + case ISD::ATOMIC_LOAD_OR: return "AtomicLoadOr"; + case ISD::ATOMIC_LOAD_XOR: return "AtomicLoadXor"; + case ISD::ATOMIC_LOAD_NAND: return "AtomicLoadNand"; + case ISD::ATOMIC_LOAD_MIN: return "AtomicLoadMin"; + case ISD::ATOMIC_LOAD_MAX: return "AtomicLoadMax"; + case ISD::ATOMIC_LOAD_UMIN: return "AtomicLoadUMin"; + case ISD::ATOMIC_LOAD_UMAX: return "AtomicLoadUMax"; + case ISD::ATOMIC_LOAD: return "AtomicLoad"; + case ISD::ATOMIC_STORE: return "AtomicStore"; + case ISD::PCMARKER: return "PCMarker"; + case ISD::READCYCLECOUNTER: return "ReadCycleCounter"; + case ISD::SRCVALUE: return "SrcValue"; + case ISD::MDNODE_SDNODE: return "MDNode"; + case ISD::EntryToken: return "EntryToken"; + case ISD::TokenFactor: return "TokenFactor"; + case ISD::AssertSext: return "AssertSext"; + case ISD::AssertZext: return "AssertZext"; + + case ISD::BasicBlock: return "BasicBlock"; + case ISD::VALUETYPE: return "ValueType"; + case ISD::Register: return "Register"; + case ISD::RegisterMask: return "RegisterMask"; + case ISD::Constant: return "Constant"; + case ISD::ConstantFP: return "ConstantFP"; + case ISD::GlobalAddress: return "GlobalAddress"; + case ISD::GlobalTLSAddress: return "GlobalTLSAddress"; + case ISD::FrameIndex: return "FrameIndex"; + case ISD::JumpTable: return "JumpTable"; + case ISD::GLOBAL_OFFSET_TABLE: return "GLOBAL_OFFSET_TABLE"; + case ISD::RETURNADDR: return "RETURNADDR"; + case ISD::FRAMEADDR: return "FRAMEADDR"; + case ISD::FRAME_TO_ARGS_OFFSET: return "FRAME_TO_ARGS_OFFSET"; + case ISD::EXCEPTIONADDR: return "EXCEPTIONADDR"; + case ISD::LSDAADDR: 
return "LSDAADDR"; + case ISD::EHSELECTION: return "EHSELECTION"; + case ISD::EH_RETURN: return "EH_RETURN"; + case ISD::EH_SJLJ_SETJMP: return "EH_SJLJ_SETJMP"; + case ISD::EH_SJLJ_LONGJMP: return "EH_SJLJ_LONGJMP"; + case ISD::ConstantPool: return "ConstantPool"; + case ISD::ExternalSymbol: return "ExternalSymbol"; + case ISD::BlockAddress: return "BlockAddress"; + case ISD::INTRINSIC_WO_CHAIN: + case ISD::INTRINSIC_VOID: + case ISD::INTRINSIC_W_CHAIN: { + unsigned OpNo = getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 0 : 1; + unsigned IID = cast<ConstantSDNode>(getOperand(OpNo))->getZExtValue(); + if (IID < Intrinsic::num_intrinsics) + return Intrinsic::getName((Intrinsic::ID)IID); + else if (const TargetIntrinsicInfo *TII = G->getTarget().getIntrinsicInfo()) + return TII->getName(IID); + llvm_unreachable("Invalid intrinsic ID"); + } + + case ISD::BUILD_VECTOR: return "BUILD_VECTOR"; + case ISD::TargetConstant: return "TargetConstant"; + case ISD::TargetConstantFP: return "TargetConstantFP"; + case ISD::TargetGlobalAddress: return "TargetGlobalAddress"; + case ISD::TargetGlobalTLSAddress: return "TargetGlobalTLSAddress"; + case ISD::TargetFrameIndex: return "TargetFrameIndex"; + case ISD::TargetJumpTable: return "TargetJumpTable"; + case ISD::TargetConstantPool: return "TargetConstantPool"; + case ISD::TargetExternalSymbol: return "TargetExternalSymbol"; + case ISD::TargetBlockAddress: return "TargetBlockAddress"; + + case ISD::CopyToReg: return "CopyToReg"; + case ISD::CopyFromReg: return "CopyFromReg"; + case ISD::UNDEF: return "undef"; + case ISD::MERGE_VALUES: return "merge_values"; + case ISD::INLINEASM: return "inlineasm"; + case ISD::EH_LABEL: return "eh_label"; + case ISD::HANDLENODE: return "handlenode"; + + // Unary operators + case ISD::FABS: return "fabs"; + case ISD::FNEG: return "fneg"; + case ISD::FSQRT: return "fsqrt"; + case ISD::FSIN: return "fsin"; + case ISD::FCOS: return "fcos"; + case ISD::FTRUNC: return "ftrunc"; + case ISD::FFLOOR: return "ffloor"; + case ISD::FCEIL: return "fceil"; + case ISD::FRINT: return "frint"; + case ISD::FNEARBYINT: return "fnearbyint"; + case ISD::FEXP: return "fexp"; + case ISD::FEXP2: return "fexp2"; + case ISD::FLOG: return "flog"; + case ISD::FLOG2: return "flog2"; + case ISD::FLOG10: return "flog10"; + + // Binary operators + case ISD::ADD: return "add"; + case ISD::SUB: return "sub"; + case ISD::MUL: return "mul"; + case ISD::MULHU: return "mulhu"; + case ISD::MULHS: return "mulhs"; + case ISD::SDIV: return "sdiv"; + case ISD::UDIV: return "udiv"; + case ISD::SREM: return "srem"; + case ISD::UREM: return "urem"; + case ISD::SMUL_LOHI: return "smul_lohi"; + case ISD::UMUL_LOHI: return "umul_lohi"; + case ISD::SDIVREM: return "sdivrem"; + case ISD::UDIVREM: return "udivrem"; + case ISD::AND: return "and"; + case ISD::OR: return "or"; + case ISD::XOR: return "xor"; + case ISD::SHL: return "shl"; + case ISD::SRA: return "sra"; + case ISD::SRL: return "srl"; + case ISD::ROTL: return "rotl"; + case ISD::ROTR: return "rotr"; + case ISD::FADD: return "fadd"; + case ISD::FSUB: return "fsub"; + case ISD::FMUL: return "fmul"; + case ISD::FDIV: return "fdiv"; + case ISD::FMA: return "fma"; + case ISD::FREM: return "frem"; + case ISD::FCOPYSIGN: return "fcopysign"; + case ISD::FGETSIGN: return "fgetsign"; + case ISD::FPOW: return "fpow"; + + case ISD::FPOWI: return "fpowi"; + case ISD::SETCC: return "setcc"; + case ISD::SELECT: return "select"; + case ISD::VSELECT: return "vselect"; + case ISD::SELECT_CC: return "select_cc"; + case 
ISD::INSERT_VECTOR_ELT: return "insert_vector_elt"; + case ISD::EXTRACT_VECTOR_ELT: return "extract_vector_elt"; + case ISD::CONCAT_VECTORS: return "concat_vectors"; + case ISD::INSERT_SUBVECTOR: return "insert_subvector"; + case ISD::EXTRACT_SUBVECTOR: return "extract_subvector"; + case ISD::SCALAR_TO_VECTOR: return "scalar_to_vector"; + case ISD::VECTOR_SHUFFLE: return "vector_shuffle"; + case ISD::CARRY_FALSE: return "carry_false"; + case ISD::ADDC: return "addc"; + case ISD::ADDE: return "adde"; + case ISD::SADDO: return "saddo"; + case ISD::UADDO: return "uaddo"; + case ISD::SSUBO: return "ssubo"; + case ISD::USUBO: return "usubo"; + case ISD::SMULO: return "smulo"; + case ISD::UMULO: return "umulo"; + case ISD::SUBC: return "subc"; + case ISD::SUBE: return "sube"; + case ISD::SHL_PARTS: return "shl_parts"; + case ISD::SRA_PARTS: return "sra_parts"; + case ISD::SRL_PARTS: return "srl_parts"; + + // Conversion operators. + case ISD::SIGN_EXTEND: return "sign_extend"; + case ISD::ZERO_EXTEND: return "zero_extend"; + case ISD::ANY_EXTEND: return "any_extend"; + case ISD::SIGN_EXTEND_INREG: return "sign_extend_inreg"; + case ISD::TRUNCATE: return "truncate"; + case ISD::FP_ROUND: return "fp_round"; + case ISD::FLT_ROUNDS_: return "flt_rounds"; + case ISD::FP_ROUND_INREG: return "fp_round_inreg"; + case ISD::FP_EXTEND: return "fp_extend"; + + case ISD::SINT_TO_FP: return "sint_to_fp"; + case ISD::UINT_TO_FP: return "uint_to_fp"; + case ISD::FP_TO_SINT: return "fp_to_sint"; + case ISD::FP_TO_UINT: return "fp_to_uint"; + case ISD::BITCAST: return "bitcast"; + case ISD::FP16_TO_FP32: return "fp16_to_fp32"; + case ISD::FP32_TO_FP16: return "fp32_to_fp16"; + + case ISD::CONVERT_RNDSAT: { + switch (cast<CvtRndSatSDNode>(this)->getCvtCode()) { + default: llvm_unreachable("Unknown cvt code!"); + case ISD::CVT_FF: return "cvt_ff"; + case ISD::CVT_FS: return "cvt_fs"; + case ISD::CVT_FU: return "cvt_fu"; + case ISD::CVT_SF: return "cvt_sf"; + case ISD::CVT_UF: return "cvt_uf"; + case ISD::CVT_SS: return "cvt_ss"; + case ISD::CVT_SU: return "cvt_su"; + case ISD::CVT_US: return "cvt_us"; + case ISD::CVT_UU: return "cvt_uu"; + } + } + + // Control flow instructions + case ISD::BR: return "br"; + case ISD::BRIND: return "brind"; + case ISD::BR_JT: return "br_jt"; + case ISD::BRCOND: return "brcond"; + case ISD::BR_CC: return "br_cc"; + case ISD::CALLSEQ_START: return "callseq_start"; + case ISD::CALLSEQ_END: return "callseq_end"; + + // Other operators + case ISD::LOAD: return "load"; + case ISD::STORE: return "store"; + case ISD::VAARG: return "vaarg"; + case ISD::VACOPY: return "vacopy"; + case ISD::VAEND: return "vaend"; + case ISD::VASTART: return "vastart"; + case ISD::DYNAMIC_STACKALLOC: return "dynamic_stackalloc"; + case ISD::EXTRACT_ELEMENT: return "extract_element"; + case ISD::BUILD_PAIR: return "build_pair"; + case ISD::STACKSAVE: return "stacksave"; + case ISD::STACKRESTORE: return "stackrestore"; + case ISD::TRAP: return "trap"; + + // Bit manipulation + case ISD::BSWAP: return "bswap"; + case ISD::CTPOP: return "ctpop"; + case ISD::CTTZ: return "cttz"; + case ISD::CTTZ_ZERO_UNDEF: return "cttz_zero_undef"; + case ISD::CTLZ: return "ctlz"; + case ISD::CTLZ_ZERO_UNDEF: return "ctlz_zero_undef"; + + // Trampolines + case ISD::INIT_TRAMPOLINE: return "init_trampoline"; + case ISD::ADJUST_TRAMPOLINE: return "adjust_trampoline"; + + case ISD::CONDCODE: + switch (cast<CondCodeSDNode>(this)->get()) { + default: llvm_unreachable("Unknown setcc condition!"); + case ISD::SETOEQ: return "setoeq"; + 
case ISD::SETOGT: return "setogt"; + case ISD::SETOGE: return "setoge"; + case ISD::SETOLT: return "setolt"; + case ISD::SETOLE: return "setole"; + case ISD::SETONE: return "setone"; + + case ISD::SETO: return "seto"; + case ISD::SETUO: return "setuo"; + case ISD::SETUEQ: return "setue"; + case ISD::SETUGT: return "setugt"; + case ISD::SETUGE: return "setuge"; + case ISD::SETULT: return "setult"; + case ISD::SETULE: return "setule"; + case ISD::SETUNE: return "setune"; + + case ISD::SETEQ: return "seteq"; + case ISD::SETGT: return "setgt"; + case ISD::SETGE: return "setge"; + case ISD::SETLT: return "setlt"; + case ISD::SETLE: return "setle"; + case ISD::SETNE: return "setne"; + + case ISD::SETTRUE: return "settrue"; + case ISD::SETTRUE2: return "settrue2"; + case ISD::SETFALSE: return "setfalse"; + case ISD::SETFALSE2: return "setfalse2"; + } + } +} + +const char *SDNode::getIndexedModeName(ISD::MemIndexedMode AM) { + switch (AM) { + default: return ""; + case ISD::PRE_INC: return "<pre-inc>"; + case ISD::PRE_DEC: return "<pre-dec>"; + case ISD::POST_INC: return "<post-inc>"; + case ISD::POST_DEC: return "<post-dec>"; + } +} + +void SDNode::dump() const { dump(0); } +void SDNode::dump(const SelectionDAG *G) const { + print(dbgs(), G); + dbgs() << '\n'; +} + +void SDNode::print_types(raw_ostream &OS, const SelectionDAG *G) const { + OS << (void*)this << ": "; + + for (unsigned i = 0, e = getNumValues(); i != e; ++i) { + if (i) OS << ","; + if (getValueType(i) == MVT::Other) + OS << "ch"; + else + OS << getValueType(i).getEVTString(); + } + OS << " = " << getOperationName(G); +} + +void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { + if (const MachineSDNode *MN = dyn_cast<MachineSDNode>(this)) { + if (!MN->memoperands_empty()) { + OS << "<"; + OS << "Mem:"; + for (MachineSDNode::mmo_iterator i = MN->memoperands_begin(), + e = MN->memoperands_end(); i != e; ++i) { + OS << **i; + if (llvm::next(i) != e) + OS << " "; + } + OS << ">"; + } + } else if (const ShuffleVectorSDNode *SVN = + dyn_cast<ShuffleVectorSDNode>(this)) { + OS << "<"; + for (unsigned i = 0, e = ValueList[0].getVectorNumElements(); i != e; ++i) { + int Idx = SVN->getMaskElt(i); + if (i) OS << ","; + if (Idx < 0) + OS << "u"; + else + OS << Idx; + } + OS << ">"; + } else if (const ConstantSDNode *CSDN = dyn_cast<ConstantSDNode>(this)) { + OS << '<' << CSDN->getAPIntValue() << '>'; + } else if (const ConstantFPSDNode *CSDN = dyn_cast<ConstantFPSDNode>(this)) { + if (&CSDN->getValueAPF().getSemantics()==&APFloat::IEEEsingle) + OS << '<' << CSDN->getValueAPF().convertToFloat() << '>'; + else if (&CSDN->getValueAPF().getSemantics()==&APFloat::IEEEdouble) + OS << '<' << CSDN->getValueAPF().convertToDouble() << '>'; + else { + OS << "<APFloat("; + CSDN->getValueAPF().bitcastToAPInt().dump(); + OS << ")>"; + } + } else if (const GlobalAddressSDNode *GADN = + dyn_cast<GlobalAddressSDNode>(this)) { + int64_t offset = GADN->getOffset(); + OS << '<'; + WriteAsOperand(OS, GADN->getGlobal()); + OS << '>'; + if (offset > 0) + OS << " + " << offset; + else + OS << " " << offset; + if (unsigned int TF = GADN->getTargetFlags()) + OS << " [TF=" << TF << ']'; + } else if (const FrameIndexSDNode *FIDN = dyn_cast<FrameIndexSDNode>(this)) { + OS << "<" << FIDN->getIndex() << ">"; + } else if (const JumpTableSDNode *JTDN = dyn_cast<JumpTableSDNode>(this)) { + OS << "<" << JTDN->getIndex() << ">"; + if (unsigned int TF = JTDN->getTargetFlags()) + OS << " [TF=" << TF << ']'; + } else if (const ConstantPoolSDNode *CP = 
dyn_cast<ConstantPoolSDNode>(this)){ + int offset = CP->getOffset(); + if (CP->isMachineConstantPoolEntry()) + OS << "<" << *CP->getMachineCPVal() << ">"; + else + OS << "<" << *CP->getConstVal() << ">"; + if (offset > 0) + OS << " + " << offset; + else + OS << " " << offset; + if (unsigned int TF = CP->getTargetFlags()) + OS << " [TF=" << TF << ']'; + } else if (const BasicBlockSDNode *BBDN = dyn_cast<BasicBlockSDNode>(this)) { + OS << "<"; + const Value *LBB = (const Value*)BBDN->getBasicBlock()->getBasicBlock(); + if (LBB) + OS << LBB->getName() << " "; + OS << (const void*)BBDN->getBasicBlock() << ">"; + } else if (const RegisterSDNode *R = dyn_cast<RegisterSDNode>(this)) { + OS << ' ' << PrintReg(R->getReg(), G ? G->getTarget().getRegisterInfo() :0); + } else if (const ExternalSymbolSDNode *ES = + dyn_cast<ExternalSymbolSDNode>(this)) { + OS << "'" << ES->getSymbol() << "'"; + if (unsigned int TF = ES->getTargetFlags()) + OS << " [TF=" << TF << ']'; + } else if (const SrcValueSDNode *M = dyn_cast<SrcValueSDNode>(this)) { + if (M->getValue()) + OS << "<" << M->getValue() << ">"; + else + OS << "<null>"; + } else if (const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(this)) { + if (MD->getMD()) + OS << "<" << MD->getMD() << ">"; + else + OS << "<null>"; + } else if (const VTSDNode *N = dyn_cast<VTSDNode>(this)) { + OS << ":" << N->getVT().getEVTString(); + } + else if (const LoadSDNode *LD = dyn_cast<LoadSDNode>(this)) { + OS << "<" << *LD->getMemOperand(); + + bool doExt = true; + switch (LD->getExtensionType()) { + default: doExt = false; break; + case ISD::EXTLOAD: OS << ", anyext"; break; + case ISD::SEXTLOAD: OS << ", sext"; break; + case ISD::ZEXTLOAD: OS << ", zext"; break; + } + if (doExt) + OS << " from " << LD->getMemoryVT().getEVTString(); + + const char *AM = getIndexedModeName(LD->getAddressingMode()); + if (*AM) + OS << ", " << AM; + + OS << ">"; + } else if (const StoreSDNode *ST = dyn_cast<StoreSDNode>(this)) { + OS << "<" << *ST->getMemOperand(); + + if (ST->isTruncatingStore()) + OS << ", trunc to " << ST->getMemoryVT().getEVTString(); + + const char *AM = getIndexedModeName(ST->getAddressingMode()); + if (*AM) + OS << ", " << AM; + + OS << ">"; + } else if (const MemSDNode* M = dyn_cast<MemSDNode>(this)) { + OS << "<" << *M->getMemOperand() << ">"; + } else if (const BlockAddressSDNode *BA = + dyn_cast<BlockAddressSDNode>(this)) { + OS << "<"; + WriteAsOperand(OS, BA->getBlockAddress()->getFunction(), false); + OS << ", "; + WriteAsOperand(OS, BA->getBlockAddress()->getBasicBlock(), false); + OS << ">"; + if (unsigned int TF = BA->getTargetFlags()) + OS << " [TF=" << TF << ']'; + } + + if (G) + if (unsigned Order = G->GetOrdering(this)) + OS << " [ORD=" << Order << ']'; + + if (getNodeId() != -1) + OS << " [ID=" << getNodeId() << ']'; + + DebugLoc dl = getDebugLoc(); + if (G && !dl.isUnknown()) { + DIScope + Scope(dl.getScope(G->getMachineFunction().getFunction()->getContext())); + OS << " dbg:"; + // Omit the directory, since it's usually long and uninteresting. 
+ if (Scope.Verify()) + OS << Scope.getFilename(); + else + OS << "<unknown>"; + OS << ':' << dl.getLine(); + if (dl.getCol() != 0) + OS << ':' << dl.getCol(); + } +} + +static void DumpNodes(const SDNode *N, unsigned indent, const SelectionDAG *G) { + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) + if (N->getOperand(i).getNode()->hasOneUse()) + DumpNodes(N->getOperand(i).getNode(), indent+2, G); + else + dbgs() << "\n" << std::string(indent+2, ' ') + << (void*)N->getOperand(i).getNode() << ": <multiple use>"; + + dbgs() << '\n'; + dbgs().indent(indent); + N->dump(G); +} + +void SelectionDAG::dump() const { + dbgs() << "SelectionDAG has " << AllNodes.size() << " nodes:"; + + for (allnodes_const_iterator I = allnodes_begin(), E = allnodes_end(); + I != E; ++I) { + const SDNode *N = I; + if (!N->hasOneUse() && N != getRoot().getNode()) + DumpNodes(N, 2, this); + } + + if (getRoot().getNode()) DumpNodes(getRoot().getNode(), 2, this); + dbgs() << "\n\n"; +} + +void SDNode::printr(raw_ostream &OS, const SelectionDAG *G) const { + print_types(OS, G); + print_details(OS, G); +} + +typedef SmallPtrSet<const SDNode *, 128> VisitedSDNodeSet; +static void DumpNodesr(raw_ostream &OS, const SDNode *N, unsigned indent, + const SelectionDAG *G, VisitedSDNodeSet &once) { + if (!once.insert(N)) // If we've been here before, return now. + return; + + // Dump the current SDNode, but don't end the line yet. + OS.indent(indent); + N->printr(OS, G); + + // Having printed this SDNode, walk the children: + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { + const SDNode *child = N->getOperand(i).getNode(); + + if (i) OS << ","; + OS << " "; + + if (child->getNumOperands() == 0) { + // This child has no grandchildren; print it inline right here. + child->printr(OS, G); + once.insert(child); + } else { // Just the address. FIXME: also print the child's opcode. + OS << (void*)child; + if (unsigned RN = N->getOperand(i).getResNo()) + OS << ":" << RN; + } + } + + OS << "\n"; + + // Dump children that have grandchildren on their own line(s). + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { + const SDNode *child = N->getOperand(i).getNode(); + DumpNodesr(OS, child, indent+2, G, once); + } +} + +void SDNode::dumpr() const { + VisitedSDNodeSet once; + DumpNodesr(dbgs(), this, 0, 0, once); +} + +void SDNode::dumpr(const SelectionDAG *G) const { + VisitedSDNodeSet once; + DumpNodesr(dbgs(), this, 0, G, once); +} + +static void printrWithDepthHelper(raw_ostream &OS, const SDNode *N, + const SelectionDAG *G, unsigned depth, + unsigned indent) { + if (depth == 0) + return; + + OS.indent(indent); + + N->print(OS, G); + + if (depth < 1) + return; + + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { + // Don't follow chain operands. + if (N->getOperand(i).getValueType() == MVT::Other) + continue; + OS << '\n'; + printrWithDepthHelper(OS, N->getOperand(i).getNode(), G, depth-1, indent+2); + } +} + +void SDNode::printrWithDepth(raw_ostream &OS, const SelectionDAG *G, + unsigned depth) const { + printrWithDepthHelper(OS, this, G, depth, 0); +} + +void SDNode::printrFull(raw_ostream &OS, const SelectionDAG *G) const { + // Don't print impossibly deep things. + printrWithDepth(OS, G, 10); +} + +void SDNode::dumprWithDepth(const SelectionDAG *G, unsigned depth) const { + printrWithDepth(dbgs(), G, depth); +} + +void SDNode::dumprFull(const SelectionDAG *G) const { + // Don't print impossibly deep things. 
+ dumprWithDepth(G, 10); +} + +void SDNode::print(raw_ostream &OS, const SelectionDAG *G) const { + print_types(OS, G); + for (unsigned i = 0, e = getNumOperands(); i != e; ++i) { + if (i) OS << ", "; else OS << " "; + OS << (void*)getOperand(i).getNode(); + if (unsigned RN = getOperand(i).getResNo()) + OS << ":" << RN; + } + print_details(OS, G); +} diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 2173d8d..8aabc02 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -673,7 +673,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { { NamedRegionTimer T("Instruction Scheduling", GroupName, TimePassesIsEnabled); - Scheduler->Run(CurDAG, FuncInfo->MBB, FuncInfo->InsertPt); + Scheduler->Run(CurDAG, FuncInfo->MBB); } if (ViewSUnitDAGs) Scheduler->viewGraph(); @@ -684,8 +684,9 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { { NamedRegionTimer T("Instruction Creation", GroupName, TimePassesIsEnabled); - LastMBB = FuncInfo->MBB = Scheduler->EmitSchedule(); - FuncInfo->InsertPt = Scheduler->InsertPos; + // FuncInfo->InsertPt is passed by reference and set to the end of the + // scheduled instructions. + LastMBB = FuncInfo->MBB = Scheduler->EmitSchedule(FuncInfo->InsertPt); } // If the block was split, make sure we update any references that are used to @@ -774,7 +775,7 @@ void SelectionDAGISel::PrepareEHLandingPad() { // Assign the call site to the landing pad's begin label. MF->getMMI().setCallSiteLandingPad(Label, SDB->LPadToCallSiteMap[MBB]); - + const MCInstrDesc &II = TM.getInstrInfo()->get(TargetOpcode::EH_LABEL); BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(), II) .addSym(Label); @@ -934,9 +935,9 @@ static void collectFailStats(const Instruction *I) { case Instruction::FPToSI: NumFastIselFailFPToSI++; return; case Instruction::UIToFP: NumFastIselFailUIToFP++; return; case Instruction::SIToFP: NumFastIselFailSIToFP++; return; - case Instruction::IntToPtr: NumFastIselFailIntToPtr++; return; + case Instruction::IntToPtr: NumFastIselFailIntToPtr++; return; case Instruction::PtrToInt: NumFastIselFailPtrToInt++; return; - case Instruction::BitCast: NumFastIselFailBitCast++; return; + case Instruction::BitCast: NumFastIselFailBitCast++; return; // Other instructions... 
case Instruction::ICmp: NumFastIselFailICmp++; return; diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp index 5412c97..9a86f32 100644 --- a/lib/CodeGen/SjLjEHPrepare.cpp +++ b/lib/CodeGen/SjLjEHPrepare.cpp @@ -1,4 +1,4 @@ -//===- SjLjEHPass.cpp - Eliminate Invoke & Unwind instructions -----------===// +//===- SjLjEHPrepare.cpp - Eliminate Invoke & Unwind instructions ---------===// // // The LLVM Compiler Infrastructure // @@ -42,7 +42,7 @@ STATISTIC(NumInvokes, "Number of invokes replaced"); STATISTIC(NumSpilled, "Number of registers live across unwind edges"); namespace { - class SjLjEHPass : public FunctionPass { + class SjLjEHPrepare : public FunctionPass { const TargetLowering *TLI; Type *FunctionContextTy; Constant *RegisterFn; @@ -58,7 +58,7 @@ namespace { AllocaInst *FuncCtx; public: static char ID; // Pass identification, replacement for typeid - explicit SjLjEHPass(const TargetLowering *tli = NULL) + explicit SjLjEHPrepare(const TargetLowering *tli = NULL) : FunctionPass(ID), TLI(tli) { } bool doInitialization(Module &M); bool runOnFunction(Function &F); @@ -79,15 +79,15 @@ namespace { }; } // end anonymous namespace -char SjLjEHPass::ID = 0; +char SjLjEHPrepare::ID = 0; -// Public Interface To the SjLjEHPass pass. -FunctionPass *llvm::createSjLjEHPass(const TargetLowering *TLI) { - return new SjLjEHPass(TLI); +// Public Interface To the SjLjEHPrepare pass. +FunctionPass *llvm::createSjLjEHPreparePass(const TargetLowering *TLI) { + return new SjLjEHPrepare(TLI); } // doInitialization - Set up decalarations and types needed to process // exceptions. -bool SjLjEHPass::doInitialization(Module &M) { +bool SjLjEHPrepare::doInitialization(Module &M) { // Build the function context structure. // builtin_setjmp uses a five word jbuf Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext()); @@ -123,7 +123,7 @@ bool SjLjEHPass::doInitialization(Module &M) { /// insertCallSiteStore - Insert a store of the call-site value to the /// function context -void SjLjEHPass::insertCallSiteStore(Instruction *I, int Number) { +void SjLjEHPrepare::insertCallSiteStore(Instruction *I, int Number) { IRBuilder<> Builder(I); // Get a reference to the call_site field. @@ -151,8 +151,8 @@ static void MarkBlocksLiveIn(BasicBlock *BB, /// substituteLPadValues - Substitute the values returned by the landingpad /// instruction with those returned by the personality function. -void SjLjEHPass::substituteLPadValues(LandingPadInst *LPI, Value *ExnVal, - Value *SelVal) { +void SjLjEHPrepare::substituteLPadValues(LandingPadInst *LPI, Value *ExnVal, + Value *SelVal) { SmallVector<Value*, 8> UseWorkList(LPI->use_begin(), LPI->use_end()); while (!UseWorkList.empty()) { Value *Val = UseWorkList.pop_back_val(); @@ -183,7 +183,7 @@ void SjLjEHPass::substituteLPadValues(LandingPadInst *LPI, Value *ExnVal, /// setupFunctionContext - Allocate the function context on the stack and fill /// it with all of the data that we know at this point. -Value *SjLjEHPass:: +Value *SjLjEHPrepare:: setupFunctionContext(Function &F, ArrayRef<LandingPadInst*> LPads) { BasicBlock *EntryBB = F.begin(); @@ -251,7 +251,7 @@ setupFunctionContext(Function &F, ArrayRef<LandingPadInst*> LPads) { /// specially, we lower each arg to a copy instruction in the entry block. This /// ensures that the argument value itself cannot be live out of the entry /// block. 
-void SjLjEHPass::lowerIncomingArguments(Function &F) { +void SjLjEHPrepare::lowerIncomingArguments(Function &F) { BasicBlock::iterator AfterAllocaInsPt = F.begin()->begin(); while (isa<AllocaInst>(AfterAllocaInsPt) && isa<ConstantInt>(cast<AllocaInst>(AfterAllocaInsPt)->getArraySize())) @@ -295,8 +295,8 @@ void SjLjEHPass::lowerIncomingArguments(Function &F) { /// lowerAcrossUnwindEdges - Find all variables which are alive across an unwind /// edge and spill them. -void SjLjEHPass::lowerAcrossUnwindEdges(Function &F, - ArrayRef<InvokeInst*> Invokes) { +void SjLjEHPrepare::lowerAcrossUnwindEdges(Function &F, + ArrayRef<InvokeInst*> Invokes) { // Finally, scan the code looking for instructions with bad live ranges. for (Function::iterator BB = F.begin(), BBE = F.end(); BB != BBE; ++BB) { @@ -393,7 +393,7 @@ void SjLjEHPass::lowerAcrossUnwindEdges(Function &F, /// setupEntryBlockAndCallSites - Setup the entry block by creating and filling /// the function context and marking the call sites with the appropriate /// values. These values are used by the DWARF EH emitter. -bool SjLjEHPass::setupEntryBlockAndCallSites(Function &F) { +bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) { SmallVector<ReturnInst*, 16> Returns; SmallVector<InvokeInst*, 16> Invokes; SmallSetVector<LandingPadInst*, 16> LPads; @@ -519,7 +519,7 @@ bool SjLjEHPass::setupEntryBlockAndCallSites(Function &F) { return true; } -bool SjLjEHPass::runOnFunction(Function &F) { +bool SjLjEHPrepare::runOnFunction(Function &F) { bool Res = setupEntryBlockAndCallSites(F); return Res; } diff --git a/lib/ExecutionEngine/CMakeLists.txt b/lib/ExecutionEngine/CMakeLists.txt index 58caae8..cb11bfe 100644 --- a/lib/ExecutionEngine/CMakeLists.txt +++ b/lib/ExecutionEngine/CMakeLists.txt @@ -1,3 +1,5 @@ + + add_llvm_library(LLVMExecutionEngine ExecutionEngine.cpp ExecutionEngineBindings.cpp @@ -8,3 +10,11 @@ add_subdirectory(Interpreter) add_subdirectory(JIT) add_subdirectory(MCJIT) add_subdirectory(RuntimeDyld) + +if( LLVM_USE_OPROFILE ) + add_subdirectory(OProfileJIT) +endif( LLVM_USE_OPROFILE ) + +if( LLVM_USE_INTEL_JITEVENTS ) + add_subdirectory(IntelJITEvents) +endif( LLVM_USE_INTEL_JITEVENTS ) diff --git a/lib/ExecutionEngine/EventListenerCommon.h b/lib/ExecutionEngine/EventListenerCommon.h new file mode 100644 index 0000000..1c07c94 --- /dev/null +++ b/lib/ExecutionEngine/EventListenerCommon.h @@ -0,0 +1,67 @@ +//===-- JIT.h - Abstract Execution Engine Interface -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Common functionality for JITEventListener implementations +// +//===----------------------------------------------------------------------===// + +#ifndef EVENT_LISTENER_COMMON_H +#define EVENT_LISTENER_COMMON_H + +#include "llvm/Metadata.h" +#include "llvm/Analysis/DebugInfo.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/Support/ValueHandle.h" +#include "llvm/Support/Path.h" + +namespace llvm { + +namespace jitprofiling { + +class FilenameCache { + // Holds the filename of each Scope, so that we can pass a null-terminated + // string into oprofile. Use an AssertingVH rather than a ValueMap because we + // shouldn't be modifying any MDNodes while this map is alive. 
+ DenseMap<AssertingVH<MDNode>, std::string> Filenames; + DenseMap<AssertingVH<MDNode>, std::string> Paths; + + public: + const char *getFilename(MDNode *Scope) { + std::string &Filename = Filenames[Scope]; + if (Filename.empty()) { + DIScope DIScope(Scope); + Filename = DIScope.getFilename(); + } + return Filename.c_str(); + } + + const char *getFullPath(MDNode *Scope) { + std::string &P = Paths[Scope]; + if (P.empty()) { + DIScope DIScope(Scope); + StringRef DirName = DIScope.getDirectory(); + StringRef FileName = DIScope.getFilename(); + SmallString<256> FullPath; + if (DirName != "." && DirName != "") { + FullPath = DirName; + } + if (FileName != "") { + sys::path::append(FullPath, FileName); + } + P = FullPath.str(); + } + return P.c_str(); + } +}; + +} // namespace jitprofiling + +} // namespace llvm + +#endif //EVENT_LISTENER_COMMON_H diff --git a/lib/ExecutionEngine/IntelJITEvents/CMakeLists.txt b/lib/ExecutionEngine/IntelJITEvents/CMakeLists.txt new file mode 100644 index 0000000..7d67d0d --- /dev/null +++ b/lib/ExecutionEngine/IntelJITEvents/CMakeLists.txt @@ -0,0 +1,11 @@ + +include_directories( ${LLVM_INTEL_JITEVENTS_INCDIR} ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + +set(system_libs + ${system_libs} + jitprofiling + ) + +add_llvm_library(LLVMIntelJITEvents + IntelJITEventListener.cpp + ) diff --git a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp new file mode 100644 index 0000000..5dfa78f --- /dev/null +++ b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp @@ -0,0 +1,183 @@ +//===-- IntelJITEventListener.cpp - Tell Intel profiler about JITed code --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines a JITEventListener object to tell Intel(R) VTune(TM) +// Amplifier XE 2011 about JITted functions. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Config/config.h" +#include "llvm/ExecutionEngine/JITEventListener.h" + +#define DEBUG_TYPE "amplifier-jit-event-listener" +#include "llvm/Function.h" +#include "llvm/Metadata.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/OwningPtr.h" +#include "llvm/Analysis/DebugInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/ExecutionEngine/IntelJITEventsWrapper.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Errno.h" +#include "llvm/Support/ValueHandle.h" +#include "EventListenerCommon.h" + +using namespace llvm; +using namespace llvm::jitprofiling; + +namespace { + +class IntelJITEventListener : public JITEventListener { + typedef DenseMap<void*, unsigned int> MethodIDMap; + + IntelJITEventsWrapper& Wrapper; + MethodIDMap MethodIDs; + FilenameCache Filenames; + +public: + IntelJITEventListener(IntelJITEventsWrapper& libraryWrapper) + : Wrapper(libraryWrapper) { + } + + ~IntelJITEventListener() { + } + + virtual void NotifyFunctionEmitted(const Function &F, + void *FnStart, size_t FnSize, + const EmittedFunctionDetails &Details); + + virtual void NotifyFreeingMachineCode(void *OldPtr); +}; + +static LineNumberInfo LineStartToIntelJITFormat( + uintptr_t StartAddress, + uintptr_t Address, + DebugLoc Loc) { + LineNumberInfo Result; + + Result.Offset = Address - StartAddress; + Result.LineNumber = Loc.getLine(); + + return Result; +} + +static iJIT_Method_Load FunctionDescToIntelJITFormat( + IntelJITEventsWrapper& Wrapper, + const char* FnName, + uintptr_t FnStart, + size_t FnSize) { + iJIT_Method_Load Result; + memset(&Result, 0, sizeof(iJIT_Method_Load)); + + Result.method_id = Wrapper.iJIT_GetNewMethodID(); + Result.method_name = const_cast<char*>(FnName); + Result.method_load_address = reinterpret_cast<void*>(FnStart); + Result.method_size = FnSize; + + Result.class_id = 0; + Result.class_file_name = NULL; + Result.user_data = NULL; + Result.user_data_size = 0; + Result.env = iJDE_JittingAPI; + + return Result; +} + +// Adds the just-emitted function to the symbol table. +void IntelJITEventListener::NotifyFunctionEmitted( + const Function &F, void *FnStart, size_t FnSize, + const EmittedFunctionDetails &Details) { + iJIT_Method_Load FunctionMessage = FunctionDescToIntelJITFormat(Wrapper, + F.getName().data(), + reinterpret_cast<uint64_t>(FnStart), + FnSize); + + std::vector<LineNumberInfo> LineInfo; + + if (!Details.LineStarts.empty()) { + // Now convert the line number information from the address/DebugLoc + // format in Details to the offset/lineno in Intel JIT API format. 
+ + LineInfo.reserve(Details.LineStarts.size() + 1); + + DebugLoc FirstLoc = Details.LineStarts[0].Loc; + assert(!FirstLoc.isUnknown() + && "LineStarts should not contain unknown DebugLocs"); + + MDNode *FirstLocScope = FirstLoc.getScope(F.getContext()); + DISubprogram FunctionDI = getDISubprogram(FirstLocScope); + if (FunctionDI.Verify()) { + FunctionMessage.source_file_name = const_cast<char*>( + Filenames.getFullPath(FirstLocScope)); + + LineNumberInfo FirstLine; + FirstLine.Offset = 0; + FirstLine.LineNumber = FunctionDI.getLineNumber(); + LineInfo.push_back(FirstLine); + } + + for (std::vector<EmittedFunctionDetails::LineStart>::const_iterator I = + Details.LineStarts.begin(), E = Details.LineStarts.end(); + I != E; ++I) { + // This implementation ignores the DebugLoc filename because the Intel + // JIT API does not support multiple source files associated with a single + // JIT function + LineInfo.push_back(LineStartToIntelJITFormat( + reinterpret_cast<uintptr_t>(FnStart), + I->Address, + I->Loc)); + + // If we have no file name yet for the function, use the filename from + // the first instruction that has one + if (FunctionMessage.source_file_name == 0) { + MDNode *scope = I->Loc.getScope( + Details.MF->getFunction()->getContext()); + FunctionMessage.source_file_name = const_cast<char*>( + Filenames.getFullPath(scope)); + } + } + + FunctionMessage.line_number_size = LineInfo.size(); + FunctionMessage.line_number_table = &*LineInfo.begin(); + } else { + FunctionMessage.line_number_size = 0; + FunctionMessage.line_number_table = 0; + } + + Wrapper.iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED, + &FunctionMessage); + MethodIDs[FnStart] = FunctionMessage.method_id; +} + +void IntelJITEventListener::NotifyFreeingMachineCode(void *FnStart) { + MethodIDMap::iterator I = MethodIDs.find(FnStart); + if (I != MethodIDs.end()) { + Wrapper.iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_UNLOAD_START, &I->second); + MethodIDs.erase(I); + } +} + +} // anonymous namespace. + +namespace llvm { +JITEventListener *JITEventListener::createIntelJITEventListener() { + static OwningPtr<IntelJITEventsWrapper> JITProfilingWrapper( + new IntelJITEventsWrapper); + return new IntelJITEventListener(*JITProfilingWrapper); +} + +// for testing +JITEventListener *JITEventListener::createIntelJITEventListener( + IntelJITEventsWrapper* TestImpl) { + return new IntelJITEventListener(*TestImpl); +} + +} // namespace llvm + diff --git a/lib/ExecutionEngine/IntelJITEvents/LLVMBuild.txt b/lib/ExecutionEngine/IntelJITEvents/LLVMBuild.txt new file mode 100644 index 0000000..80d2273 --- /dev/null +++ b/lib/ExecutionEngine/IntelJITEvents/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/ExecutionEngine/JITProfileAmplifier/LLVMBuild.txt --*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[common] + +[component_0] +type = Library +name = IntelJITEvents +parent = ExecutionEngine diff --git a/lib/ExecutionEngine/IntelJITEvents/Makefile b/lib/ExecutionEngine/IntelJITEvents/Makefile new file mode 100644 index 0000000..ba75ac6 --- /dev/null +++ b/lib/ExecutionEngine/IntelJITEvents/Makefile @@ -0,0 +1,17 @@ +##===- lib/ExecutionEngine/JITProfile/Makefile -------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../.. +LIBRARYNAME = LLVMIntelJITEvents + +include $(LEVEL)/Makefile.config + +SOURCES := IntelJITEventListener.cpp +CPPFLAGS += -I$(INTEL_JITEVENTS_INCDIR) -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LLVM_SRC_ROOT)/Makefile.rules diff --git a/lib/ExecutionEngine/Interpreter/Execution.cpp b/lib/ExecutionEngine/Interpreter/Execution.cpp index 3dce3b3..af47be9 100644 --- a/lib/ExecutionEngine/Interpreter/Execution.cpp +++ b/lib/ExecutionEngine/Interpreter/Execution.cpp @@ -650,12 +650,10 @@ void Interpreter::visitSwitchInst(SwitchInst &I) { // Check to see if any of the cases match... BasicBlock *Dest = 0; - unsigned NumCases = I.getNumCases(); - // Skip the first item since that's the default case. - for (unsigned i = 0; i < NumCases; ++i) { - GenericValue CaseVal = getOperandValue(I.getCaseValue(i), SF); + for (SwitchInst::CaseIt i = I.case_begin(), e = I.case_end(); i != e; ++i) { + GenericValue CaseVal = getOperandValue(i.getCaseValue(), SF); if (executeICMP_EQ(CondVal, CaseVal, ElTy).IntVal != 0) { - Dest = cast<BasicBlock>(I.getCaseSuccessor(i)); + Dest = cast<BasicBlock>(i.getCaseSuccessor()); break; } } diff --git a/lib/ExecutionEngine/JIT/CMakeLists.txt b/lib/ExecutionEngine/JIT/CMakeLists.txt index dcef08c..52bb389 100644 --- a/lib/ExecutionEngine/JIT/CMakeLists.txt +++ b/lib/ExecutionEngine/JIT/CMakeLists.txt @@ -2,10 +2,8 @@ add_definitions(-DENABLE_X86_JIT) add_llvm_library(LLVMJIT - Intercept.cpp JIT.cpp JITDwarfEmitter.cpp JITEmitter.cpp JITMemoryManager.cpp - OProfileJITEventListener.cpp ) diff --git a/lib/ExecutionEngine/JIT/Intercept.cpp b/lib/ExecutionEngine/JIT/Intercept.cpp deleted file mode 100644 index 2251a8e..0000000 --- a/lib/ExecutionEngine/JIT/Intercept.cpp +++ /dev/null @@ -1,162 +0,0 @@ -//===-- Intercept.cpp - System function interception routines -------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// If a function call occurs to an external function, the JIT is designed to use -// the dynamic loader interface to find a function to call. This is useful for -// calling system calls and library functions that are not available in LLVM. -// Some system calls, however, need to be handled specially. For this reason, -// we intercept some of them here and use our own stubs to handle them. 
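//===----------------------------------------------------------------------===//
// Illustrative sketch (not part of this patch): the Interpreter change in
// Execution.cpp above replaces index-based case access with the new
// SwitchInst::CaseIt iterator API; any code that walks switch cases follows
// the same shape. countCasesWithValue is a hypothetical helper.
//===----------------------------------------------------------------------===//
#include "llvm/Instructions.h"
#include "llvm/Support/DataTypes.h"

static unsigned countCasesWithValue(llvm::SwitchInst &SI, uint64_t V) {
  unsigned N = 0;
  // case_begin()/case_end() visit only the explicit cases; the default
  // destination is not included.
  for (llvm::SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end();
       i != e; ++i)
    if (i.getCaseValue()->equalsInt(V))
      ++N;
  return N;
}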
-// -//===----------------------------------------------------------------------===// - -#include "JIT.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/DynamicLibrary.h" -#include "llvm/Config/config.h" -using namespace llvm; - -// AtExitHandlers - List of functions to call when the program exits, -// registered with the atexit() library function. -static std::vector<void (*)()> AtExitHandlers; - -/// runAtExitHandlers - Run any functions registered by the program's -/// calls to atexit(3), which we intercept and store in -/// AtExitHandlers. -/// -static void runAtExitHandlers() { - while (!AtExitHandlers.empty()) { - void (*Fn)() = AtExitHandlers.back(); - AtExitHandlers.pop_back(); - Fn(); - } -} - -//===----------------------------------------------------------------------===// -// Function stubs that are invoked instead of certain library calls -//===----------------------------------------------------------------------===// - -// Force the following functions to be linked in to anything that uses the -// JIT. This is a hack designed to work around the all-too-clever Glibc -// strategy of making these functions work differently when inlined vs. when -// not inlined, and hiding their real definitions in a separate archive file -// that the dynamic linker can't see. For more info, search for -// 'libc_nonshared.a' on Google, or read http://llvm.org/PR274. -#if defined(__linux__) -#if defined(HAVE_SYS_STAT_H) -#include <sys/stat.h> -#endif -#include <fcntl.h> -#include <unistd.h> -/* stat functions are redirecting to __xstat with a version number. On x86-64 - * linking with libc_nonshared.a and -Wl,--export-dynamic doesn't make 'stat' - * available as an exported symbol, so we have to add it explicitly. - */ -namespace { -class StatSymbols { -public: - StatSymbols() { - sys::DynamicLibrary::AddSymbol("stat", (void*)(intptr_t)stat); - sys::DynamicLibrary::AddSymbol("fstat", (void*)(intptr_t)fstat); - sys::DynamicLibrary::AddSymbol("lstat", (void*)(intptr_t)lstat); - sys::DynamicLibrary::AddSymbol("stat64", (void*)(intptr_t)stat64); - sys::DynamicLibrary::AddSymbol("\x1stat64", (void*)(intptr_t)stat64); - sys::DynamicLibrary::AddSymbol("\x1open64", (void*)(intptr_t)open64); - sys::DynamicLibrary::AddSymbol("\x1lseek64", (void*)(intptr_t)lseek64); - sys::DynamicLibrary::AddSymbol("fstat64", (void*)(intptr_t)fstat64); - sys::DynamicLibrary::AddSymbol("lstat64", (void*)(intptr_t)lstat64); - sys::DynamicLibrary::AddSymbol("atexit", (void*)(intptr_t)atexit); - sys::DynamicLibrary::AddSymbol("mknod", (void*)(intptr_t)mknod); - } -}; -} -static StatSymbols initStatSymbols; -#endif // __linux__ - -// jit_exit - Used to intercept the "exit" library call. -static void jit_exit(int Status) { - runAtExitHandlers(); // Run atexit handlers... - exit(Status); -} - -// jit_atexit - Used to intercept the "atexit" library call. -static int jit_atexit(void (*Fn)()) { - AtExitHandlers.push_back(Fn); // Take note of atexit handler... - return 0; // Always successful -} - -static int jit_noop() { - return 0; -} - -//===----------------------------------------------------------------------===// -// -/// getPointerToNamedFunction - This method returns the address of the specified -/// function by using the dynamic loader interface. As such it is only useful -/// for resolving library symbols, not code generated symbols. 
-/// -void *JIT::getPointerToNamedFunction(const std::string &Name, - bool AbortOnFailure) { - if (!isSymbolSearchingDisabled()) { - // Check to see if this is one of the functions we want to intercept. Note, - // we cast to intptr_t here to silence a -pedantic warning that complains - // about casting a function pointer to a normal pointer. - if (Name == "exit") return (void*)(intptr_t)&jit_exit; - if (Name == "atexit") return (void*)(intptr_t)&jit_atexit; - - // We should not invoke parent's ctors/dtors from generated main()! - // On Mingw and Cygwin, the symbol __main is resolved to - // callee's(eg. tools/lli) one, to invoke wrong duplicated ctors - // (and register wrong callee's dtors with atexit(3)). - // We expect ExecutionEngine::runStaticConstructorsDestructors() - // is called before ExecutionEngine::runFunctionAsMain() is called. - if (Name == "__main") return (void*)(intptr_t)&jit_noop; - - const char *NameStr = Name.c_str(); - // If this is an asm specifier, skip the sentinal. - if (NameStr[0] == 1) ++NameStr; - - // If it's an external function, look it up in the process image... - void *Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr); - if (Ptr) return Ptr; - - // If it wasn't found and if it starts with an underscore ('_') character, - // and has an asm specifier, try again without the underscore. - if (Name[0] == 1 && NameStr[0] == '_') { - Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr+1); - if (Ptr) return Ptr; - } - - // Darwin/PPC adds $LDBLStub suffixes to various symbols like printf. These - // are references to hidden visibility symbols that dlsym cannot resolve. - // If we have one of these, strip off $LDBLStub and try again. -#if defined(__APPLE__) && defined(__ppc__) - if (Name.size() > 9 && Name[Name.size()-9] == '$' && - memcmp(&Name[Name.size()-8], "LDBLStub", 8) == 0) { - // First try turning $LDBLStub into $LDBL128. If that fails, strip it off. - // This mirrors logic in libSystemStubs.a. - std::string Prefix = std::string(Name.begin(), Name.end()-9); - if (void *Ptr = getPointerToNamedFunction(Prefix+"$LDBL128", false)) - return Ptr; - if (void *Ptr = getPointerToNamedFunction(Prefix, false)) - return Ptr; - } -#endif - } - - /// If a LazyFunctionCreator is installed, use it to get/create the function. 
- if (LazyFunctionCreator) - if (void *RP = LazyFunctionCreator(Name)) - return RP; - - if (AbortOnFailure) { - report_fatal_error("Program used external function '"+Name+ - "' which could not be resolved!"); - } - return 0; -} diff --git a/lib/ExecutionEngine/JIT/JIT.cpp b/lib/ExecutionEngine/JIT/JIT.cpp index f715f6f..16b8ee2 100644 --- a/lib/ExecutionEngine/JIT/JIT.cpp +++ b/lib/ExecutionEngine/JIT/JIT.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/MachineCodeInfo.h" #include "llvm/ExecutionEngine/GenericValue.h" #include "llvm/ExecutionEngine/JITEventListener.h" +#include "llvm/ExecutionEngine/JITMemoryManager.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetJITInfo.h" @@ -267,9 +268,9 @@ extern "C" { } JIT::JIT(Module *M, TargetMachine &tm, TargetJITInfo &tji, - JITMemoryManager *JMM, bool GVsWithCode) - : ExecutionEngine(M), TM(tm), TJI(tji), AllocateGVsWithCode(GVsWithCode), - isAlreadyCodeGenerating(false) { + JITMemoryManager *jmm, bool GVsWithCode) + : ExecutionEngine(M), TM(tm), TJI(tji), JMM(jmm), + AllocateGVsWithCode(GVsWithCode), isAlreadyCodeGenerating(false) { setTargetData(TM.getTargetData()); jitstate = new JITState(M); @@ -711,6 +712,27 @@ void *JIT::getPointerToBasicBlock(BasicBlock *BB) { } } +void *JIT::getPointerToNamedFunction(const std::string &Name, + bool AbortOnFailure){ + if (!isSymbolSearchingDisabled()) { + void *ptr = JMM->getPointerToNamedFunction(Name, false); + if (ptr) + return ptr; + } + + /// If a LazyFunctionCreator is installed, use it to get/create the function. + if (LazyFunctionCreator) + if (void *RP = LazyFunctionCreator(Name)) + return RP; + + if (AbortOnFailure) { + report_fatal_error("Program used external function '"+Name+ + "' which could not be resolved!"); + } + return 0; +} + + /// getOrEmitGlobalVariable - Return the address of the specified global /// variable, possibly emitting it to memory if needed. This is used by the /// Emitter. diff --git a/lib/ExecutionEngine/JIT/JIT.h b/lib/ExecutionEngine/JIT/JIT.h index 17d33fe..c557981 100644 --- a/lib/ExecutionEngine/JIT/JIT.h +++ b/lib/ExecutionEngine/JIT/JIT.h @@ -58,6 +58,7 @@ class JIT : public ExecutionEngine { TargetMachine &TM; // The current target we are compiling to TargetJITInfo &TJI; // The JITInfo for the target we are compiling to JITCodeEmitter *JCE; // JCE object + JITMemoryManager *JMM; std::vector<JITEventListener*> EventListeners; /// AllocateGVsWithCode - Some applications require that global variables and diff --git a/lib/ExecutionEngine/JIT/JITMemoryManager.cpp b/lib/ExecutionEngine/JIT/JITMemoryManager.cpp index efd570d..d404d0c 100644 --- a/lib/ExecutionEngine/JIT/JITMemoryManager.cpp +++ b/lib/ExecutionEngine/JIT/JITMemoryManager.cpp @@ -314,6 +314,17 @@ namespace { /// should allocate a separate slab. static const size_t DefaultSizeThreshold; + /// getPointerToNamedFunction - This method returns the address of the + /// specified function by using the dlsym function call. As such it is only + /// useful for resolving library symbols, not code generated symbols. + /// + /// If AbortOnFailure is false and no function with the given name is + /// found, this function silently returns a null pointer. Otherwise, + /// it prints a message to stderr and aborts. + /// + virtual void *getPointerToNamedFunction(const std::string &Name, + bool AbortOnFailure = true); + void AllocateGOT(); // Testing methods. 
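//===----------------------------------------------------------------------===//
// Illustrative sketch (not part of this patch): with external symbol lookup
// now routed through JITMemoryManager::getPointerToNamedFunction() (declared
// above), a client still has two pre-existing hooks for names the process
// image cannot satisfy: pre-registering an address with
// DynamicLibrary::AddSymbol, or installing a last-chance LazyFunctionCreator
// on the ExecutionEngine. The names my_log, resolveMissing and
// installResolutionHooks are hypothetical.
//===----------------------------------------------------------------------===//
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/Support/DynamicLibrary.h"
#include <stdint.h>
#include <string>

extern "C" void my_log(const char *Msg);    // host-side helper (hypothetical)

static void *resolveMissing(const std::string &Name) {
  if (Name == "my_log")                     // last-chance lookup
    return (void*)(intptr_t)&my_log;
  return 0;                                 // 0 lets the JIT report failure
}

static void installResolutionHooks(llvm::ExecutionEngine *EE) {
  // Checked by SearchForAddressOfSymbol before the process image is probed.
  llvm::sys::DynamicLibrary::AddSymbol("my_log", (void*)(intptr_t)&my_log);
  // Consulted only after the memory manager's lookup has failed.
  EE->InstallLazyFunctionCreator(resolveMissing);
}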
@@ -757,6 +768,148 @@ bool DefaultJITMemoryManager::CheckInvariants(std::string &ErrorStr) { return true; } +//===----------------------------------------------------------------------===// +// getPointerToNamedFunction() implementation. +//===----------------------------------------------------------------------===// +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/DynamicLibrary.h" +#include "llvm/Config/config.h" + +// AtExitHandlers - List of functions to call when the program exits, +// registered with the atexit() library function. +static std::vector<void (*)()> AtExitHandlers; + +/// runAtExitHandlers - Run any functions registered by the program's +/// calls to atexit(3), which we intercept and store in +/// AtExitHandlers. +/// +static void runAtExitHandlers() { + while (!AtExitHandlers.empty()) { + void (*Fn)() = AtExitHandlers.back(); + AtExitHandlers.pop_back(); + Fn(); + } +} + +//===----------------------------------------------------------------------===// +// Function stubs that are invoked instead of certain library calls +//===----------------------------------------------------------------------===// + +// Force the following functions to be linked in to anything that uses the +// JIT. This is a hack designed to work around the all-too-clever Glibc +// strategy of making these functions work differently when inlined vs. when +// not inlined, and hiding their real definitions in a separate archive file +// that the dynamic linker can't see. For more info, search for +// 'libc_nonshared.a' on Google, or read http://llvm.org/PR274. +#if defined(__linux__) +#if defined(HAVE_SYS_STAT_H) +#include <sys/stat.h> +#endif +#include <fcntl.h> +#include <unistd.h> +/* stat functions are redirecting to __xstat with a version number. On x86-64 + * linking with libc_nonshared.a and -Wl,--export-dynamic doesn't make 'stat' + * available as an exported symbol, so we have to add it explicitly. + */ +namespace { +class StatSymbols { +public: + StatSymbols() { + sys::DynamicLibrary::AddSymbol("stat", (void*)(intptr_t)stat); + sys::DynamicLibrary::AddSymbol("fstat", (void*)(intptr_t)fstat); + sys::DynamicLibrary::AddSymbol("lstat", (void*)(intptr_t)lstat); + sys::DynamicLibrary::AddSymbol("stat64", (void*)(intptr_t)stat64); + sys::DynamicLibrary::AddSymbol("\x1stat64", (void*)(intptr_t)stat64); + sys::DynamicLibrary::AddSymbol("\x1open64", (void*)(intptr_t)open64); + sys::DynamicLibrary::AddSymbol("\x1lseek64", (void*)(intptr_t)lseek64); + sys::DynamicLibrary::AddSymbol("fstat64", (void*)(intptr_t)fstat64); + sys::DynamicLibrary::AddSymbol("lstat64", (void*)(intptr_t)lstat64); + sys::DynamicLibrary::AddSymbol("atexit", (void*)(intptr_t)atexit); + sys::DynamicLibrary::AddSymbol("mknod", (void*)(intptr_t)mknod); + } +}; +} +static StatSymbols initStatSymbols; +#endif // __linux__ + +// jit_exit - Used to intercept the "exit" library call. +static void jit_exit(int Status) { + runAtExitHandlers(); // Run atexit handlers... + exit(Status); +} + +// jit_atexit - Used to intercept the "atexit" library call. +static int jit_atexit(void (*Fn)()) { + AtExitHandlers.push_back(Fn); // Take note of atexit handler... + return 0; // Always successful +} + +static int jit_noop() { + return 0; +} + +//===----------------------------------------------------------------------===// +// +/// getPointerToNamedFunction - This method returns the address of the specified +/// function by using the dynamic loader interface. 
As such it is only useful +/// for resolving library symbols, not code generated symbols. +/// +void *DefaultJITMemoryManager::getPointerToNamedFunction(const std::string &Name, + bool AbortOnFailure) { + // Check to see if this is one of the functions we want to intercept. Note, + // we cast to intptr_t here to silence a -pedantic warning that complains + // about casting a function pointer to a normal pointer. + if (Name == "exit") return (void*)(intptr_t)&jit_exit; + if (Name == "atexit") return (void*)(intptr_t)&jit_atexit; + + // We should not invoke parent's ctors/dtors from generated main()! + // On Mingw and Cygwin, the symbol __main is resolved to + // callee's(eg. tools/lli) one, to invoke wrong duplicated ctors + // (and register wrong callee's dtors with atexit(3)). + // We expect ExecutionEngine::runStaticConstructorsDestructors() + // is called before ExecutionEngine::runFunctionAsMain() is called. + if (Name == "__main") return (void*)(intptr_t)&jit_noop; + + const char *NameStr = Name.c_str(); + // If this is an asm specifier, skip the sentinal. + if (NameStr[0] == 1) ++NameStr; + + // If it's an external function, look it up in the process image... + void *Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr); + if (Ptr) return Ptr; + + // If it wasn't found and if it starts with an underscore ('_') character, + // try again without the underscore. + if (NameStr[0] == '_') { + Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr+1); + if (Ptr) return Ptr; + } + + // Darwin/PPC adds $LDBLStub suffixes to various symbols like printf. These + // are references to hidden visibility symbols that dlsym cannot resolve. + // If we have one of these, strip off $LDBLStub and try again. +#if defined(__APPLE__) && defined(__ppc__) + if (Name.size() > 9 && Name[Name.size()-9] == '$' && + memcmp(&Name[Name.size()-8], "LDBLStub", 8) == 0) { + // First try turning $LDBLStub into $LDBL128. If that fails, strip it off. + // This mirrors logic in libSystemStubs.a. 
+ std::string Prefix = std::string(Name.begin(), Name.end()-9); + if (void *Ptr = getPointerToNamedFunction(Prefix+"$LDBL128", false)) + return Ptr; + if (void *Ptr = getPointerToNamedFunction(Prefix, false)) + return Ptr; + } +#endif + + if (AbortOnFailure) { + report_fatal_error("Program used external function '"+Name+ + "' which could not be resolved!"); + } + return 0; +} + + + JITMemoryManager *JITMemoryManager::CreateDefaultMemManager() { return new DefaultJITMemoryManager(); } diff --git a/lib/ExecutionEngine/LLVMBuild.txt b/lib/ExecutionEngine/LLVMBuild.txt index d426969..1f94a4f 100644 --- a/lib/ExecutionEngine/LLVMBuild.txt +++ b/lib/ExecutionEngine/LLVMBuild.txt @@ -16,7 +16,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = Interpreter JIT MCJIT RuntimeDyld +subdirectories = Interpreter JIT MCJIT RuntimeDyld IntelJITEvents OProfileJIT [component_0] type = Library diff --git a/lib/ExecutionEngine/MCJIT/CMakeLists.txt b/lib/ExecutionEngine/MCJIT/CMakeLists.txt index 2c0f8d6..fef7176 100644 --- a/lib/ExecutionEngine/MCJIT/CMakeLists.txt +++ b/lib/ExecutionEngine/MCJIT/CMakeLists.txt @@ -1,5 +1,4 @@ add_llvm_library(LLVMMCJIT MCJIT.cpp MCJITMemoryManager.cpp - Intercept.cpp ) diff --git a/lib/ExecutionEngine/MCJIT/Intercept.cpp b/lib/ExecutionEngine/MCJIT/Intercept.cpp deleted file mode 100644 index f83f428..0000000 --- a/lib/ExecutionEngine/MCJIT/Intercept.cpp +++ /dev/null @@ -1,162 +0,0 @@ -//===-- Intercept.cpp - System function interception routines -------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// If a function call occurs to an external function, the JIT is designed to use -// the dynamic loader interface to find a function to call. This is useful for -// calling system calls and library functions that are not available in LLVM. -// Some system calls, however, need to be handled specially. For this reason, -// we intercept some of them here and use our own stubs to handle them. -// -//===----------------------------------------------------------------------===// - -#include "MCJIT.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/DynamicLibrary.h" -#include "llvm/Config/config.h" -using namespace llvm; - -// AtExitHandlers - List of functions to call when the program exits, -// registered with the atexit() library function. -static std::vector<void (*)()> AtExitHandlers; - -/// runAtExitHandlers - Run any functions registered by the program's -/// calls to atexit(3), which we intercept and store in -/// AtExitHandlers. -/// -static void runAtExitHandlers() { - while (!AtExitHandlers.empty()) { - void (*Fn)() = AtExitHandlers.back(); - AtExitHandlers.pop_back(); - Fn(); - } -} - -//===----------------------------------------------------------------------===// -// Function stubs that are invoked instead of certain library calls -//===----------------------------------------------------------------------===// - -// Force the following functions to be linked in to anything that uses the -// JIT. This is a hack designed to work around the all-too-clever Glibc -// strategy of making these functions work differently when inlined vs. when -// not inlined, and hiding their real definitions in a separate archive file -// that the dynamic linker can't see. 
For more info, search for -// 'libc_nonshared.a' on Google, or read http://llvm.org/PR274. -#if defined(__linux__) -#if defined(HAVE_SYS_STAT_H) -#include <sys/stat.h> -#endif -#include <fcntl.h> -#include <unistd.h> -/* stat functions are redirecting to __xstat with a version number. On x86-64 - * linking with libc_nonshared.a and -Wl,--export-dynamic doesn't make 'stat' - * available as an exported symbol, so we have to add it explicitly. - */ -namespace { -class StatSymbols { -public: - StatSymbols() { - sys::DynamicLibrary::AddSymbol("stat", (void*)(intptr_t)stat); - sys::DynamicLibrary::AddSymbol("fstat", (void*)(intptr_t)fstat); - sys::DynamicLibrary::AddSymbol("lstat", (void*)(intptr_t)lstat); - sys::DynamicLibrary::AddSymbol("stat64", (void*)(intptr_t)stat64); - sys::DynamicLibrary::AddSymbol("\x1stat64", (void*)(intptr_t)stat64); - sys::DynamicLibrary::AddSymbol("\x1open64", (void*)(intptr_t)open64); - sys::DynamicLibrary::AddSymbol("\x1lseek64", (void*)(intptr_t)lseek64); - sys::DynamicLibrary::AddSymbol("fstat64", (void*)(intptr_t)fstat64); - sys::DynamicLibrary::AddSymbol("lstat64", (void*)(intptr_t)lstat64); - sys::DynamicLibrary::AddSymbol("atexit", (void*)(intptr_t)atexit); - sys::DynamicLibrary::AddSymbol("mknod", (void*)(intptr_t)mknod); - } -}; -} -static StatSymbols initStatSymbols; -#endif // __linux__ - -// jit_exit - Used to intercept the "exit" library call. -static void jit_exit(int Status) { - runAtExitHandlers(); // Run atexit handlers... - exit(Status); -} - -// jit_atexit - Used to intercept the "atexit" library call. -static int jit_atexit(void (*Fn)()) { - AtExitHandlers.push_back(Fn); // Take note of atexit handler... - return 0; // Always successful -} - -static int jit_noop() { - return 0; -} - -//===----------------------------------------------------------------------===// -// -/// getPointerToNamedFunction - This method returns the address of the specified -/// function by using the dynamic loader interface. As such it is only useful -/// for resolving library symbols, not code generated symbols. -/// -void *MCJIT::getPointerToNamedFunction(const std::string &Name, - bool AbortOnFailure) { - if (!isSymbolSearchingDisabled()) { - // Check to see if this is one of the functions we want to intercept. Note, - // we cast to intptr_t here to silence a -pedantic warning that complains - // about casting a function pointer to a normal pointer. - if (Name == "exit") return (void*)(intptr_t)&jit_exit; - if (Name == "atexit") return (void*)(intptr_t)&jit_atexit; - - // We should not invoke parent's ctors/dtors from generated main()! - // On Mingw and Cygwin, the symbol __main is resolved to - // callee's(eg. tools/lli) one, to invoke wrong duplicated ctors - // (and register wrong callee's dtors with atexit(3)). - // We expect ExecutionEngine::runStaticConstructorsDestructors() - // is called before ExecutionEngine::runFunctionAsMain() is called. - if (Name == "__main") return (void*)(intptr_t)&jit_noop; - - const char *NameStr = Name.c_str(); - // If this is an asm specifier, skip the sentinal. - if (NameStr[0] == 1) ++NameStr; - - // If it's an external function, look it up in the process image... - void *Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr); - if (Ptr) return Ptr; - - // If it wasn't found and if it starts with an underscore ('_') character, - // and has an asm specifier, try again without the underscore. 
- if (Name[0] == 1 && NameStr[0] == '_') { - Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr+1); - if (Ptr) return Ptr; - } - - // Darwin/PPC adds $LDBLStub suffixes to various symbols like printf. These - // are references to hidden visibility symbols that dlsym cannot resolve. - // If we have one of these, strip off $LDBLStub and try again. -#if defined(__APPLE__) && defined(__ppc__) - if (Name.size() > 9 && Name[Name.size()-9] == '$' && - memcmp(&Name[Name.size()-8], "LDBLStub", 8) == 0) { - // First try turning $LDBLStub into $LDBL128. If that fails, strip it off. - // This mirrors logic in libSystemStubs.a. - std::string Prefix = std::string(Name.begin(), Name.end()-9); - if (void *Ptr = getPointerToNamedFunction(Prefix+"$LDBL128", false)) - return Ptr; - if (void *Ptr = getPointerToNamedFunction(Prefix, false)) - return Ptr; - } -#endif - } - - /// If a LazyFunctionCreator is installed, use it to get/create the function. - if (LazyFunctionCreator) - if (void *RP = LazyFunctionCreator(Name)) - return RP; - - if (AbortOnFailure) { - report_fatal_error("Program used external function '"+Name+ - "' which could not be resolved!"); - } - return 0; -} diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/lib/ExecutionEngine/MCJIT/MCJIT.cpp index 5f93a8d..cbb23d3 100644 --- a/lib/ExecutionEngine/MCJIT/MCJIT.cpp +++ b/lib/ExecutionEngine/MCJIT/MCJIT.cpp @@ -215,3 +215,23 @@ GenericValue MCJIT::runFunction(Function *F, llvm_unreachable("Full-featured argument passing not supported yet!"); } + +void *MCJIT::getPointerToNamedFunction(const std::string &Name, + bool AbortOnFailure){ + if (!isSymbolSearchingDisabled()) { + void *ptr = MemMgr->getPointerToNamedFunction(Name, false); + if (ptr) + return ptr; + } + + /// If a LazyFunctionCreator is installed, use it to get/create the function. + if (LazyFunctionCreator) + if (void *RP = LazyFunctionCreator(Name)) + return RP; + + if (AbortOnFailure) { + report_fatal_error("Program used external function '"+Name+ + "' which could not be resolved!"); + } + return 0; +} diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.h b/lib/ExecutionEngine/MCJIT/MCJIT.h index 7f4ae77..2b3df98 100644 --- a/lib/ExecutionEngine/MCJIT/MCJIT.h +++ b/lib/ExecutionEngine/MCJIT/MCJIT.h @@ -67,6 +67,7 @@ public: /// virtual void *getPointerToNamedFunction(const std::string &Name, bool AbortOnFailure = true); + /// mapSectionAddress - map a section to its target address space value. /// Map the address of a JIT section as returned from the memory manager /// to the address in the target process as the running code will see it. diff --git a/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h b/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h index ac8c155..dac8b26 100644 --- a/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h +++ b/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h @@ -33,46 +33,17 @@ public: uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment, unsigned SectionID) { - return JMM->allocateDataSection(Size, Alignment, SectionID); + return JMM->allocateSpace(Size, Alignment); } uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment, unsigned SectionID) { - return JMM->allocateCodeSection(Size, Alignment, SectionID); + return JMM->allocateSpace(Size, Alignment); } - // Allocate ActualSize bytes, or more, for the named function. Return - // a pointer to the allocated memory and update Size to reflect how much - // memory was acutally allocated. 
- uint8_t *startFunctionBody(const char *Name, uintptr_t &Size) { - // FIXME: This should really reference the MCAsmInfo to get the global - // prefix. - if (Name[0] == '_') ++Name; - Function *F = M->getFunction(Name); - // Some ObjC names have a prefixed \01 in the IR. If we failed to find - // the symbol and it's of the ObjC conventions (starts with "-" or - // "+"), try prepending a \01 and see if we can find it that way. - if (!F && (Name[0] == '-' || Name[0] == '+')) - F = M->getFunction((Twine("\1") + Name).str()); - assert(F && "No matching function in JIT IR Module!"); - return JMM->startFunctionBody(F, Size); - } - - // Mark the end of the function, including how much of the allocated - // memory was actually used. - void endFunctionBody(const char *Name, uint8_t *FunctionStart, - uint8_t *FunctionEnd) { - // FIXME: This should really reference the MCAsmInfo to get the global - // prefix. - if (Name[0] == '_') ++Name; - Function *F = M->getFunction(Name); - // Some ObjC names have a prefixed \01 in the IR. If we failed to find - // the symbol and it's of the ObjC conventions (starts with "-" or - // "+"), try prepending a \01 and see if we can find it that way. - if (!F && (Name[0] == '-' || Name[0] == '+')) - F = M->getFunction((Twine("\1") + Name).str()); - assert(F && "No matching function in JIT IR Module!"); - JMM->endFunctionBody(F, FunctionStart, FunctionEnd); + virtual void *getPointerToNamedFunction(const std::string &Name, + bool AbortOnFailure = true) { + return JMM->getPointerToNamedFunction(Name, AbortOnFailure); } }; diff --git a/lib/ExecutionEngine/Makefile b/lib/ExecutionEngine/Makefile index 9a649a5..c26e0ad 100644 --- a/lib/ExecutionEngine/Makefile +++ b/lib/ExecutionEngine/Makefile @@ -8,6 +8,17 @@ ##===----------------------------------------------------------------------===## LEVEL = ../.. LIBRARYNAME = LLVMExecutionEngine + +include $(LEVEL)/Makefile.config + PARALLEL_DIRS = Interpreter JIT MCJIT RuntimeDyld -include $(LEVEL)/Makefile.common +ifeq ($(USE_INTEL_JITEVENTS), 1) +PARALLEL_DIRS += IntelJITEvents +endif + +ifeq ($(USE_OPROFILE), 1) +PARALLEL_DIRS += OProfileJIT +endif + +include $(LLVM_SRC_ROOT)/Makefile.rules diff --git a/lib/ExecutionEngine/OProfileJIT/CMakeLists.txt b/lib/ExecutionEngine/OProfileJIT/CMakeLists.txt new file mode 100644 index 0000000..d585136 --- /dev/null +++ b/lib/ExecutionEngine/OProfileJIT/CMakeLists.txt @@ -0,0 +1,7 @@ + +include_directories( ${LLVM_OPROFILE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + +add_llvm_library(LLVMOProfileJIT + OProfileJITEventListener.cpp + OProfileWrapper.cpp + ) diff --git a/lib/ExecutionEngine/OProfileJIT/LLVMBuild.txt b/lib/ExecutionEngine/OProfileJIT/LLVMBuild.txt new file mode 100644 index 0000000..4516dfa --- /dev/null +++ b/lib/ExecutionEngine/OProfileJIT/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/ExecutionEngine/OProfileJIT/LLVMBuild.txt ----------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[common] + +[component_0] +type = Library +name = OProfileJIT +parent = ExecutionEngine diff --git a/lib/ExecutionEngine/OProfileJIT/Makefile b/lib/ExecutionEngine/OProfileJIT/Makefile new file mode 100644 index 0000000..fd3adce --- /dev/null +++ b/lib/ExecutionEngine/OProfileJIT/Makefile @@ -0,0 +1,18 @@ +##===- lib/ExecutionEngine/OProfileJIT/Makefile ------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../.. +LIBRARYNAME = LLVMOProfileJIT + +include $(LEVEL)/Makefile.config + +SOURCES += OProfileJITEventListener.cpp \ + OProfileWrapper.cpp +CPPFLAGS += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LLVM_SRC_ROOT)/Makefile.rules diff --git a/lib/ExecutionEngine/JIT/OProfileJITEventListener.cpp b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp index 9a9ed6d..e6142e3 100644 --- a/lib/ExecutionEngine/JIT/OProfileJITEventListener.cpp +++ b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp @@ -7,51 +7,55 @@ // //===----------------------------------------------------------------------===// // -// This file defines a JITEventListener object that calls into OProfile to tell -// it about JITted functions. For now, we only record function names and sizes, -// but eventually we'll also record line number information. -// -// See http://oprofile.sourceforge.net/doc/devel/jit-interface.html for the -// definition of the interface we're using. +// This file defines a JITEventListener object that uses OProfileWrapper to tell +// oprofile about JITted functions, including source line information. 
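//===----------------------------------------------------------------------===//
// Illustrative sketch (not part of this patch): exactly as with the Intel
// listener, a JIT client opts in by attaching the OProfile listener to its
// ExecutionEngine; createOProfileJITEventListener() is the factory this patch
// moves onto JITEventListener. Because OProfileWrapper (later in this patch)
// only loads libopagent.so when the oprofile daemon is detected, attaching
// the listener on a machine without oprofile effectively degrades to a no-op.
// attachOProfile is a hypothetical helper name.
//===----------------------------------------------------------------------===//
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ExecutionEngine/JITEventListener.h"

static void attachOProfile(llvm::ExecutionEngine *EE) {
  EE->RegisterJITEventListener(
      llvm::JITEventListener::createOProfileJITEventListener());
}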
// //===----------------------------------------------------------------------===// +#include "llvm/Config/config.h" +#include "llvm/ExecutionEngine/JITEventListener.h" + #define DEBUG_TYPE "oprofile-jit-event-listener" #include "llvm/Function.h" -#include "llvm/Metadata.h" -#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/OwningPtr.h" #include "llvm/Analysis/DebugInfo.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/ExecutionEngine/JITEventListener.h" +#include "llvm/ExecutionEngine/OProfileWrapper.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/Errno.h" -#include "llvm/Config/config.h" -#include <stddef.h> -using namespace llvm; +#include "EventListenerCommon.h" -#if USE_OPROFILE +#include <dirent.h> +#include <fcntl.h> -#include <opagent.h> +using namespace llvm; +using namespace llvm::jitprofiling; namespace { class OProfileJITEventListener : public JITEventListener { - op_agent_t Agent; + OProfileWrapper& Wrapper; + + void initialize(); + public: - OProfileJITEventListener(); + OProfileJITEventListener(OProfileWrapper& LibraryWrapper) + : Wrapper(LibraryWrapper) { + initialize(); + } + ~OProfileJITEventListener(); virtual void NotifyFunctionEmitted(const Function &F, - void *FnStart, size_t FnSize, - const EmittedFunctionDetails &Details); + void *FnStart, size_t FnSize, + const JITEvent_EmittedFunctionDetails &Details); + virtual void NotifyFreeingMachineCode(void *OldPtr); }; -OProfileJITEventListener::OProfileJITEventListener() - : Agent(op_open_agent()) { - if (Agent == NULL) { +void OProfileJITEventListener::initialize() { + if (!Wrapper.op_open_agent()) { const std::string err_str = sys::StrError(); DEBUG(dbgs() << "Failed to connect to OProfile agent: " << err_str << "\n"); } else { @@ -60,8 +64,8 @@ OProfileJITEventListener::OProfileJITEventListener() } OProfileJITEventListener::~OProfileJITEventListener() { - if (Agent != NULL) { - if (op_close_agent(Agent) == -1) { + if (Wrapper.isAgentAvailable()) { + if (Wrapper.op_close_agent() == -1) { const std::string err_str = sys::StrError(); DEBUG(dbgs() << "Failed to disconnect from OProfile agent: " << err_str << "\n"); @@ -71,22 +75,6 @@ OProfileJITEventListener::~OProfileJITEventListener() { } } -class FilenameCache { - // Holds the filename of each Scope, so that we can pass a null-terminated - // string into oprofile. Use an AssertingVH rather than a ValueMap because we - // shouldn't be modifying any MDNodes while this map is alive. - DenseMap<AssertingVH<MDNode>, std::string> Filenames; - - public: - const char *getFilename(MDNode *Scope) { - std::string &Filename = Filenames[Scope]; - if (Filename.empty()) { - Filename = DIScope(Scope).getFilename(); - } - return Filename.c_str(); - } -}; - static debug_line_info LineStartToOProfileFormat( const MachineFunction &MF, FilenameCache &Filenames, uintptr_t Address, DebugLoc Loc) { @@ -103,9 +91,9 @@ static debug_line_info LineStartToOProfileFormat( // Adds the just-emitted function to the symbol table. 
void OProfileJITEventListener::NotifyFunctionEmitted( const Function &F, void *FnStart, size_t FnSize, - const EmittedFunctionDetails &Details) { + const JITEvent_EmittedFunctionDetails &Details) { assert(F.hasName() && FnStart != 0 && "Bad symbol to add"); - if (op_write_native_code(Agent, F.getName().data(), + if (Wrapper.op_write_native_code(F.getName().data(), reinterpret_cast<uint64_t>(FnStart), FnStart, FnSize) == -1) { DEBUG(dbgs() << "Failed to tell OProfile about native function " @@ -151,8 +139,8 @@ void OProfileJITEventListener::NotifyFunctionEmitted( // line info's address to include the start of the function. LineInfo[0].vma = reinterpret_cast<uintptr_t>(FnStart); - if (op_write_debug_line_info(Agent, FnStart, - LineInfo.size(), &*LineInfo.begin()) == -1) { + if (Wrapper.op_write_debug_line_info(FnStart, LineInfo.size(), + &*LineInfo.begin()) == -1) { DEBUG(dbgs() << "Failed to tell OProfile about line numbers for native function " << F.getName() << " at [" @@ -164,7 +152,7 @@ void OProfileJITEventListener::NotifyFunctionEmitted( // Removes the being-deleted function from the symbol table. void OProfileJITEventListener::NotifyFreeingMachineCode(void *FnStart) { assert(FnStart && "Invalid function pointer"); - if (op_unload_native_code(Agent, reinterpret_cast<uint64_t>(FnStart)) == -1) { + if (Wrapper.op_unload_native_code(reinterpret_cast<uint64_t>(FnStart)) == -1) { DEBUG(dbgs() << "Failed to tell OProfile about unload of native function at " << FnStart << "\n"); @@ -174,19 +162,16 @@ void OProfileJITEventListener::NotifyFreeingMachineCode(void *FnStart) { } // anonymous namespace. namespace llvm { -JITEventListener *createOProfileJITEventListener() { - return new OProfileJITEventListener; -} +JITEventListener *JITEventListener::createOProfileJITEventListener() { + static OwningPtr<OProfileWrapper> JITProfilingWrapper(new OProfileWrapper); + return new OProfileJITEventListener(*JITProfilingWrapper); } -#else // USE_OPROFILE - -namespace llvm { -// By defining this to return NULL, we can let clients call it unconditionally, -// even if they haven't configured with the OProfile libraries. -JITEventListener *createOProfileJITEventListener() { - return NULL; +// for testing +JITEventListener *JITEventListener::createOProfileJITEventListener( + OProfileWrapper* TestImpl) { + return new OProfileJITEventListener(*TestImpl); } -} // namespace llvm -#endif // USE_OPROFILE +} // namespace llvm + diff --git a/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp b/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp new file mode 100644 index 0000000..d67f537 --- /dev/null +++ b/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp @@ -0,0 +1,263 @@ +//===-- OProfileWrapper.cpp - OProfile JIT API Wrapper implementation -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the interface in OProfileWrapper.h. It is responsible +// for loading the opagent dynamic library when the first call to an op_ +// function occurs. 
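//===----------------------------------------------------------------------===//
// Illustrative sketch (not part of this patch): the wrapper implemented below
// relies on ordinary lazy binding: it keeps function pointers for the opagent
// entry points and fills them in from the dynamically loaded library the
// first time an op_ call is made. A stripped-down version of that idea, using
// hypothetical names (libexample.so, example_entry), looks like this:
//===----------------------------------------------------------------------===//
#include "llvm/Support/DynamicLibrary.h"
#include <stdint.h>
#include <string>

typedef int (*example_fn_t)(void);
static example_fn_t ExampleFn = 0;
static bool ExampleTried = false;

static bool lazyBindExample() {
  if (ExampleTried)
    return ExampleFn != 0;
  ExampleTried = true;
  std::string Err;
  // LoadLibraryPermanently returns true on error; on success the library's
  // symbols become visible to SearchForAddressOfSymbol.
  if (llvm::sys::DynamicLibrary::LoadLibraryPermanently("libexample.so", &Err))
    return false;
  ExampleFn = (example_fn_t)(intptr_t)
      llvm::sys::DynamicLibrary::SearchForAddressOfSymbol("example_entry");
  return ExampleFn != 0;
}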
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/OProfileWrapper.h" + +#define DEBUG_TYPE "oprofile-wrapper" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/DynamicLibrary.h" +#include "llvm/Support/Mutex.h" +#include "llvm/Support/MutexGuard.h" +#include "llvm/ADT/SmallString.h" + +#include <sstream> +#include <cstring> +#include <stddef.h> +#include <dirent.h> +#include <sys/stat.h> +#include <fcntl.h> + +namespace { + +// Global mutex to ensure a single thread initializes oprofile agent. +llvm::sys::Mutex OProfileInitializationMutex; + +} // anonymous namespace + +namespace llvm { + +OProfileWrapper::OProfileWrapper() +: Agent(0), + OpenAgentFunc(0), + CloseAgentFunc(0), + WriteNativeCodeFunc(0), + WriteDebugLineInfoFunc(0), + UnloadNativeCodeFunc(0), + MajorVersionFunc(0), + MinorVersionFunc(0), + IsOProfileRunningFunc(0), + Initialized(false) { +} + +bool OProfileWrapper::initialize() { + using namespace llvm; + using namespace llvm::sys; + + MutexGuard Guard(OProfileInitializationMutex); + + if (Initialized) + return OpenAgentFunc != 0; + + Initialized = true; + + // If the oprofile daemon is not running, don't load the opagent library + if (!isOProfileRunning()) { + DEBUG(dbgs() << "OProfile daemon is not detected.\n"); + return false; + } + + std::string error; + if(!DynamicLibrary::LoadLibraryPermanently("libopagent.so", &error)) { + DEBUG(dbgs() + << "OProfile connector library libopagent.so could not be loaded: " + << error << "\n"); + } + + // Get the addresses of the opagent functions + OpenAgentFunc = (op_open_agent_ptr_t)(intptr_t) + DynamicLibrary::SearchForAddressOfSymbol("op_open_agent"); + CloseAgentFunc = (op_close_agent_ptr_t)(intptr_t) + DynamicLibrary::SearchForAddressOfSymbol("op_close_agent"); + WriteNativeCodeFunc = (op_write_native_code_ptr_t)(intptr_t) + DynamicLibrary::SearchForAddressOfSymbol("op_write_native_code"); + WriteDebugLineInfoFunc = (op_write_debug_line_info_ptr_t)(intptr_t) + DynamicLibrary::SearchForAddressOfSymbol("op_write_debug_line_info"); + UnloadNativeCodeFunc = (op_unload_native_code_ptr_t)(intptr_t) + DynamicLibrary::SearchForAddressOfSymbol("op_unload_native_code"); + MajorVersionFunc = (op_major_version_ptr_t)(intptr_t) + DynamicLibrary::SearchForAddressOfSymbol("op_major_version"); + MinorVersionFunc = (op_major_version_ptr_t)(intptr_t) + DynamicLibrary::SearchForAddressOfSymbol("op_minor_version"); + + // With missing functions, we can do nothing + if (!OpenAgentFunc + || !CloseAgentFunc + || !WriteNativeCodeFunc + || !WriteDebugLineInfoFunc + || !UnloadNativeCodeFunc) { + OpenAgentFunc = 0; + CloseAgentFunc = 0; + WriteNativeCodeFunc = 0; + WriteDebugLineInfoFunc = 0; + UnloadNativeCodeFunc = 0; + return false; + } + + return true; +} + +bool OProfileWrapper::isOProfileRunning() { + if (IsOProfileRunningFunc != 0) + return IsOProfileRunningFunc(); + return checkForOProfileProcEntry(); +} + +bool OProfileWrapper::checkForOProfileProcEntry() { + DIR* ProcDir; + + ProcDir = opendir("/proc"); + if (!ProcDir) + return false; + + // Walk the /proc tree looking for the oprofile daemon + struct dirent* Entry; + while (0 != (Entry = readdir(ProcDir))) { + if (Entry->d_type == DT_DIR) { + // Build a path from the current entry name + SmallString<256> CmdLineFName; + raw_svector_ostream(CmdLineFName) << "/proc/" << Entry->d_name + << "/cmdline"; + + // Open the cmdline file + int CmdLineFD = open(CmdLineFName.c_str(), S_IRUSR); 
+ if (CmdLineFD != -1) { + char ExeName[PATH_MAX+1]; + char* BaseName = 0; + + // Read the cmdline file + ssize_t NumRead = read(CmdLineFD, ExeName, PATH_MAX+1); + close(CmdLineFD); + ssize_t Idx = 0; + + // Find the terminator for the first string + while (Idx < NumRead-1 && ExeName[Idx] != 0) { + Idx++; + } + + // Go back to the last non-null character + Idx--; + + // Find the last path separator in the first string + while (Idx > 0) { + if (ExeName[Idx] == '/') { + BaseName = ExeName + Idx + 1; + break; + } + Idx--; + } + + // Test this to see if it is the oprofile daemon + if (BaseName != 0 && !strcmp("oprofiled", BaseName)) { + // If it is, we're done + closedir(ProcDir); + return true; + } + } + } + } + + // We've looked through all the files and didn't find the daemon + closedir(ProcDir); + return false; +} + +bool OProfileWrapper::op_open_agent() { + if (!Initialized) + initialize(); + + if (OpenAgentFunc != 0) { + Agent = OpenAgentFunc(); + return Agent != 0; + } + + return false; +} + +int OProfileWrapper::op_close_agent() { + if (!Initialized) + initialize(); + + int ret = -1; + if (Agent && CloseAgentFunc) { + ret = CloseAgentFunc(Agent); + if (ret == 0) { + Agent = 0; + } + } + return ret; +} + +bool OProfileWrapper::isAgentAvailable() { + return Agent != 0; +} + +int OProfileWrapper::op_write_native_code(const char* Name, + uint64_t Addr, + void const* Code, + const unsigned int Size) { + if (!Initialized) + initialize(); + + if (Agent && WriteNativeCodeFunc) + return WriteNativeCodeFunc(Agent, Name, Addr, Code, Size); + + return -1; +} + +int OProfileWrapper::op_write_debug_line_info( + void const* Code, + size_t NumEntries, + struct debug_line_info const* Info) { + if (!Initialized) + initialize(); + + if (Agent && WriteDebugLineInfoFunc) + return WriteDebugLineInfoFunc(Agent, Code, NumEntries, Info); + + return -1; +} + +int OProfileWrapper::op_major_version() { + if (!Initialized) + initialize(); + + if (Agent && MajorVersionFunc) + return MajorVersionFunc(); + + return -1; +} + +int OProfileWrapper::op_minor_version() { + if (!Initialized) + initialize(); + + if (Agent && MinorVersionFunc) + return MinorVersionFunc(); + + return -1; +} + +int OProfileWrapper::op_unload_native_code(uint64_t Addr) { + if (!Initialized) + initialize(); + + if (Agent && UnloadNativeCodeFunc) + return UnloadNativeCodeFunc(Agent, Addr); + + return -1; +} + +} // namespace llvm diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp index 2896c2d..ff4a2c8 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp @@ -26,45 +26,290 @@ RuntimeDyldImpl::~RuntimeDyldImpl() {} namespace llvm { -void RuntimeDyldImpl::extractFunction(StringRef Name, uint8_t *StartAddress, - uint8_t *EndAddress) { - // FIXME: DEPRECATED in favor of by-section allocation. - // Allocate memory for the function via the memory manager. - uintptr_t Size = EndAddress - StartAddress + 1; - uintptr_t AllocSize = Size; - uint8_t *Mem = MemMgr->startFunctionBody(Name.data(), AllocSize); - assert(Size >= (uint64_t)(EndAddress - StartAddress + 1) && - "Memory manager failed to allocate enough memory!"); - // Copy the function payload into the memory block. - memcpy(Mem, StartAddress, Size); - MemMgr->endFunctionBody(Name.data(), Mem, Mem + Size); - // Remember where we put it. 
- unsigned SectionID = Sections.size(); - Sections.push_back(sys::MemoryBlock(Mem, Size)); - // Default the assigned address for this symbol to wherever this - // allocated it. - SymbolTable[Name] = SymbolLoc(SectionID, 0); - DEBUG(dbgs() << " allocated to [" << Mem << ", " << Mem + Size << "]\n"); -} // Resolve the relocations for all symbols we currently know about. void RuntimeDyldImpl::resolveRelocations() { + // First, resolve relocations assotiated with external symbols. + resolveSymbols(); + // Just iterate over the sections we have and resolve all the relocations // in them. Gross overkill, but it gets the job done. for (int i = 0, e = Sections.size(); i != e; ++i) { - reassignSectionAddress(i, SectionLoadAddress[i]); + reassignSectionAddress(i, Sections[i].LoadAddress); } } void RuntimeDyldImpl::mapSectionAddress(void *LocalAddress, uint64_t TargetAddress) { - assert(SectionLocalMemToID.count(LocalAddress) && - "Attempting to remap address of unknown section!"); - unsigned SectionID = SectionLocalMemToID[LocalAddress]; - reassignSectionAddress(SectionID, TargetAddress); + for (unsigned i = 0, e = Sections.size(); i != e; ++i) { + if (Sections[i].Address == LocalAddress) { + reassignSectionAddress(i, TargetAddress); + return; + } + } + llvm_unreachable("Attempting to remap address of unknown section!"); +} + +bool RuntimeDyldImpl::loadObject(const MemoryBuffer *InputBuffer) { + // FIXME: ObjectFile don't modify MemoryBuffer. + // It should use const MemoryBuffer as parameter. + ObjectFile *obj = ObjectFile:: + createObjectFile(const_cast<MemoryBuffer*>(InputBuffer)); + + Arch = (Triple::ArchType)obj->getArch(); + + LocalSymbolMap LocalSymbols; // Functions and data symbols from the + // object file. + ObjSectionToIDMap LocalSections; // Used sections from the object file + + error_code err; + + + // Parse symbols + DEBUG(dbgs() << "Parse symbols:\n"); + for (symbol_iterator it = obj->begin_symbols(), itEnd = obj->end_symbols(); + it != itEnd; it.increment(err)) { + if (err) break; + object::SymbolRef::Type SymType; + StringRef Name; + if ((bool)(err = it->getType(SymType))) break; + if ((bool)(err = it->getName(Name))) break; + + if (SymType == object::SymbolRef::ST_Function || + SymType == object::SymbolRef::ST_Data) { + uint64_t FileOffset; + uint32_t flags; + StringRef sData; + section_iterator sIt = obj->end_sections(); + if ((bool)(err = it->getFileOffset(FileOffset))) break; + if ((bool)(err = it->getFlags(flags))) break; + if ((bool)(err = it->getSection(sIt))) break; + if (sIt == obj->end_sections()) continue; + if ((bool)(err = sIt->getContents(sData))) break; + const uint8_t* SymPtr = (const uint8_t*)InputBuffer->getBufferStart() + + (uintptr_t)FileOffset; + uintptr_t SectOffset = (uintptr_t)(SymPtr - (const uint8_t*)sData.begin()); + unsigned SectionID = + findOrEmitSection(*sIt, + SymType == object::SymbolRef::ST_Function, + LocalSections); + bool isGlobal = flags & SymbolRef::SF_Global; + LocalSymbols[Name.data()] = SymbolLoc(SectionID, SectOffset); + DEBUG(dbgs() << "\tFileOffset: " << format("%p", (uintptr_t)FileOffset) + << " flags: " << flags + << " SID: " << SectionID + << " Offset: " << format("%p", SectOffset)); + if (isGlobal) + SymbolTable[Name] = SymbolLoc(SectionID, SectOffset); + } + DEBUG(dbgs() << "\tType: " << SymType << " Name: " << Name << "\n"); + } + if (err) { + report_fatal_error(err.message()); + } + + // Parse and proccess relocations + DEBUG(dbgs() << "Parse relocations:\n"); + for (section_iterator sIt = obj->begin_sections(), + sItEnd = 
obj->end_sections(); sIt != sItEnd; sIt.increment(err)) { + if (err) break; + bool isFirstRelocation = true; + unsigned SectionID = 0; + StubMap Stubs; + + for (relocation_iterator it = sIt->begin_relocations(), + itEnd = sIt->end_relocations(); it != itEnd; it.increment(err)) { + if (err) break; + + // If it's first relocation in this section, find its SectionID + if (isFirstRelocation) { + SectionID = findOrEmitSection(*sIt, true, LocalSections); + DEBUG(dbgs() << "\tSectionID: " << SectionID << "\n"); + isFirstRelocation = false; + } + + ObjRelocationInfo RI; + RI.SectionID = SectionID; + if ((bool)(err = it->getAdditionalInfo(RI.AdditionalInfo))) break; + if ((bool)(err = it->getOffset(RI.Offset))) break; + if ((bool)(err = it->getSymbol(RI.Symbol))) break; + if ((bool)(err = it->getType(RI.Type))) break; + + DEBUG(dbgs() << "\t\tAddend: " << RI.AdditionalInfo + << " Offset: " << format("%p", (uintptr_t)RI.Offset) + << " Type: " << (uint32_t)(RI.Type & 0xffffffffL) + << "\n"); + processRelocationRef(RI, *obj, LocalSections, LocalSymbols, Stubs); + } + if (err) { + report_fatal_error(err.message()); + } + } + return false; +} + +unsigned RuntimeDyldImpl::emitSection(const SectionRef &Section, + bool IsCode) { + + unsigned StubBufSize = 0, + StubSize = getMaxStubSize(); + error_code err; + if (StubSize > 0) { + for (relocation_iterator it = Section.begin_relocations(), + itEnd = Section.end_relocations(); it != itEnd; it.increment(err)) + StubBufSize += StubSize; + } + StringRef data; + uint64_t Alignment64; + if ((bool)(err = Section.getContents(data))) report_fatal_error(err.message()); + if ((bool)(err = Section.getAlignment(Alignment64))) + report_fatal_error(err.message()); + + unsigned Alignment = (unsigned)Alignment64 & 0xffffffffL; + unsigned DataSize = data.size(); + unsigned Allocate = DataSize + StubBufSize; + unsigned SectionID = Sections.size(); + const char *pData = data.data(); + uint8_t *Addr = IsCode + ? 
MemMgr->allocateCodeSection(Allocate, Alignment, SectionID) + : MemMgr->allocateDataSection(Allocate, Alignment, SectionID); + + memcpy(Addr, pData, DataSize); + DEBUG(dbgs() << "emitSection SectionID: " << SectionID + << " obj addr: " << format("%p", pData) + << " new addr: " << format("%p", Addr) + << " DataSize: " << DataSize + << " StubBufSize: " << StubBufSize + << " Allocate: " << Allocate + << "\n"); + Sections.push_back(SectionEntry(Addr, Allocate, DataSize,(uintptr_t)pData)); + return SectionID; +} + +unsigned RuntimeDyldImpl:: +findOrEmitSection(const SectionRef &Section, bool IsCode, + ObjSectionToIDMap &LocalSections) { + + unsigned SectionID = 0; + ObjSectionToIDMap::iterator sIDIt = LocalSections.find(Section); + if (sIDIt != LocalSections.end()) + SectionID = sIDIt->second; + else { + SectionID = emitSection(Section, IsCode); + LocalSections[Section] = SectionID; + } + return SectionID; +} + +void RuntimeDyldImpl::AddRelocation(const RelocationValueRef &Value, + unsigned SectionID, uintptr_t Offset, + uint32_t RelType) { + DEBUG(dbgs() << "AddRelocation SymNamePtr: " << format("%p", Value.SymbolName) + << " SID: " << Value.SectionID + << " Addend: " << format("%p", Value.Addend) + << " Offset: " << format("%p", Offset) + << " RelType: " << format("%x", RelType) + << "\n"); + + if (Value.SymbolName == 0) { + Relocations[Value.SectionID].push_back(RelocationEntry( + SectionID, + Offset, + RelType, + Value.Addend)); + } else + SymbolRelocations[Value.SymbolName].push_back(RelocationEntry( + SectionID, + Offset, + RelType, + Value.Addend)); +} + +uint8_t *RuntimeDyldImpl::createStubFunction(uint8_t *Addr) { + // TODO: There is only ARM far stub now. We should add the Thumb stub, + // and stubs for branches Thumb - ARM and ARM - Thumb. + if (Arch == Triple::arm) { + uint32_t *StubAddr = (uint32_t*)Addr; + *StubAddr = 0xe51ff004; // ldr pc,<label> + return (uint8_t*)++StubAddr; + } + else + return Addr; } +// Assign an address to a symbol name and resolve all the relocations +// associated with it. +void RuntimeDyldImpl::reassignSectionAddress(unsigned SectionID, + uint64_t Addr) { + // The address to use for relocation resolution is not + // the address of the local section buffer. We must be doing + // a remote execution environment of some sort. Re-apply any + // relocations referencing this section with the given address. + // + // Addr is a uint64_t because we can't assume the pointer width + // of the target is the same as that of the host. Just use a generic + // "big enough" type. + Sections[SectionID].LoadAddress = Addr; + DEBUG(dbgs() << "Resolving relocations Section #" << SectionID + << "\t" << format("%p", (uint8_t *)Addr) + << "\n"); + resolveRelocationList(Relocations[SectionID], Addr); +} + +void RuntimeDyldImpl::resolveRelocationEntry(const RelocationEntry &RE, + uint64_t Value) { + uint8_t *Target = Sections[RE.SectionID].Address + RE.Offset; + DEBUG(dbgs() << "\tSectionID: " << RE.SectionID + << " + " << RE.Offset << " (" << format("%p", Target) << ")" + << " Data: " << RE.Data + << " Addend: " << RE.Addend + << "\n"); + + resolveRelocation(Target, Sections[RE.SectionID].LoadAddress + RE.Offset, + Value, RE.Data, RE.Addend); +} + +void RuntimeDyldImpl::resolveRelocationList(const RelocationList &Relocs, + uint64_t Value) { + for (unsigned i = 0, e = Relocs.size(); i != e; ++i) { + resolveRelocationEntry(Relocs[i], Value); + } +} + +// resolveSymbols - Resolve any relocations to the specified symbols if +// we know where it lives. 
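
For reference, the 8-byte ARM stub that createStubFunction() emits above is an `ldr pc, [pc, #-4]` instruction followed by a literal word; because `pc` reads as the instruction address plus 8, the load pulls in the word that immediately follows, so patching that literal retargets the branch. A minimal standalone sketch, assuming a little-endian ARM target (the function and parameter names are illustrative, not part of the patch):

    #include <cstdint>
    #include <cstring>

    // Lay out the 8-byte far-branch stub: one ARM instruction plus the
    // 32-bit destination it loads into pc.
    static uint8_t *writeFarStub(uint8_t *Stub, uint32_t Destination) {
      const uint32_t LdrPcLiteral = 0xe51ff004;   // ldr pc, [pc, #-4]
      std::memcpy(Stub, &LdrPcLiteral, 4);        // the branch instruction
      std::memcpy(Stub + 4, &Destination, 4);     // the word it branches through
      return Stub + 4;  // like createStubFunction(), return the literal's slot
    }
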
+void RuntimeDyldImpl::resolveSymbols() { + StringMap<RelocationList>::iterator it = SymbolRelocations.begin(), + itEnd = SymbolRelocations.end(); + for (; it != itEnd; it++) { + StringRef Name = it->first(); + RelocationList &Relocs = it->second; + StringMap<SymbolLoc>::const_iterator Loc = SymbolTable.find(Name); + if (Loc == SymbolTable.end()) { + // This is an external symbol, try to get it address from + // MemoryManager. + uint8_t *Addr = (uint8_t*) MemMgr->getPointerToNamedFunction(Name.data(), + true); + DEBUG(dbgs() << "Resolving relocations Name: " << Name + << "\t" << format("%p", Addr) + << "\n"); + resolveRelocationList(Relocs, (uintptr_t)Addr); + } else { + // Change the relocation to be section relative rather than symbol + // relative and move it to the resolved relocation list. + DEBUG(dbgs() << "Resolving symbol '" << Name << "'\n"); + for (int i = 0, e = Relocs.size(); i != e; ++i) { + RelocationEntry Entry = Relocs[i]; + Entry.Addend += Loc->second.second; + Relocations[Loc->second.first].push_back(Entry); + } + Relocs.clear(); + } + } +} + + //===----------------------------------------------------------------------===// // RuntimeDyld class implementation RuntimeDyld::RuntimeDyld(RTDyldMemoryManager *mm) { diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index e15b200..9351b6c 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -25,222 +25,58 @@ using namespace llvm::object; namespace llvm { -namespace { -// FIXME: this function should probably not live here... -// -// Returns the name and address of an unrelocated symbol in an ELF section -void getSymbolInfo(symbol_iterator Sym, uint64_t &Addr, StringRef &Name) { - //FIXME: error checking here required to catch corrupt ELF objects... - error_code Err = Sym->getName(Name); - - uint64_t AddrInSection; - Err = Sym->getAddress(AddrInSection); - - SectionRef empty_section; - section_iterator Section(empty_section); - Err = Sym->getSection(Section); - - StringRef SectionContents; - Section->getContents(SectionContents); - - Addr = reinterpret_cast<uint64_t>(SectionContents.data()) + AddrInSection; -} - -} - -bool RuntimeDyldELF::loadObject(MemoryBuffer *InputBuffer) { - if (!isCompatibleFormat(InputBuffer)) - return true; - - OwningPtr<ObjectFile> Obj(ObjectFile::createELFObjectFile(InputBuffer)); - - Arch = Obj->getArch(); - - // Map address in the Object file image to function names - IntervalMap<uint64_t, StringRef>::Allocator A; - IntervalMap<uint64_t, StringRef> FuncMap(A); - - // This is a bit of a hack. The ObjectFile we've just loaded reports - // section addresses as 0 and doesn't provide access to the section - // offset (from which we could calculate the address. Instead, - // we're storing the address when it comes up in the ST_Debug case - // below. 
- // - StringMap<uint64_t> DebugSymbolMap; - - symbol_iterator SymEnd = Obj->end_symbols(); - error_code Err; - for (symbol_iterator Sym = Obj->begin_symbols(); - Sym != SymEnd; Sym.increment(Err)) { - SymbolRef::Type Type; - Sym->getType(Type); - if (Type == SymbolRef::ST_Function) { - StringRef Name; - uint64_t Addr; - getSymbolInfo(Sym, Addr, Name); - - uint64_t Size; - Err = Sym->getSize(Size); - - uint8_t *Start; - uint8_t *End; - Start = reinterpret_cast<uint8_t*>(Addr); - End = reinterpret_cast<uint8_t*>(Addr + Size - 1); - - extractFunction(Name, Start, End); - FuncMap.insert(Addr, Addr + Size - 1, Name); - } else if (Type == SymbolRef::ST_Debug) { - // This case helps us find section addresses - StringRef Name; - uint64_t Addr; - getSymbolInfo(Sym, Addr, Name); - DebugSymbolMap[Name] = Addr; - } - } - - // Iterate through the relocations for this object - section_iterator SecEnd = Obj->end_sections(); - for (section_iterator Sec = Obj->begin_sections(); - Sec != SecEnd; Sec.increment(Err)) { - StringRef SecName; - uint64_t SecAddr; - Sec->getName(SecName); - // Ignore sections that aren't in our map - if (DebugSymbolMap.find(SecName) == DebugSymbolMap.end()) { - continue; - } - SecAddr = DebugSymbolMap[SecName]; - relocation_iterator RelEnd = Sec->end_relocations(); - for (relocation_iterator Rel = Sec->begin_relocations(); - Rel != RelEnd; Rel.increment(Err)) { - uint64_t RelOffset; - uint64_t RelType; - int64_t RelAddend; - SymbolRef RelSym; - StringRef SymName; - uint64_t SymAddr; - uint64_t SymOffset; - - Rel->getAddress(RelOffset); - Rel->getType(RelType); - Rel->getAdditionalInfo(RelAddend); - Rel->getSymbol(RelSym); - RelSym.getName(SymName); - RelSym.getAddress(SymAddr); - RelSym.getFileOffset(SymOffset); - - // If this relocation is inside a function, we want to store the - // function name and a function-relative offset - IntervalMap<uint64_t, StringRef>::iterator ContainingFunc - = FuncMap.find(SecAddr + RelOffset); - if (ContainingFunc.valid()) { - // Re-base the relocation to make it relative to the target function - RelOffset = (SecAddr + RelOffset) - ContainingFunc.start(); - Relocations[SymName].push_back(RelocationEntry(ContainingFunc.value(), - RelOffset, - RelType, - RelAddend, - true)); - } else { - Relocations[SymName].push_back(RelocationEntry(SecName, - RelOffset, - RelType, - RelAddend, - false)); - } - } - } - return false; -} - -void RuntimeDyldELF::resolveRelocations() { - // FIXME: deprecated. should be changed to use the by-section - // allocation and relocation scheme. - - // Just iterate over the symbols in our symbol table and assign their - // addresses. 
- StringMap<SymbolLoc>::iterator i = SymbolTable.begin(); - StringMap<SymbolLoc>::iterator e = SymbolTable.end(); - for (;i != e; ++i) { - assert (i->getValue().second == 0 && "non-zero offset in by-function sym!"); - reassignSymbolAddress(i->getKey(), - (uint8_t*)Sections[i->getValue().first].base()); - } -} - -void RuntimeDyldELF::resolveX86_64Relocation(StringRef Name, - uint8_t *Addr, - const RelocationEntry &RE) { - uint8_t *TargetAddr; - if (RE.IsFunctionRelative) { - StringMap<SymbolLoc>::const_iterator Loc = SymbolTable.find(RE.Target); - assert(Loc != SymbolTable.end() && "Function for relocation not found"); - TargetAddr = - reinterpret_cast<uint8_t*>(Sections[Loc->second.first].base()) + - Loc->second.second + RE.Offset; - } else { - // FIXME: Get the address of the target section and add that to RE.Offset - llvm_unreachable("Non-function relocation not implemented yet!"); - } - - switch (RE.Type) { - default: llvm_unreachable("Relocation type not implemented yet!"); +void RuntimeDyldELF::resolveX86_64Relocation(uint8_t *LocalAddress, + uint64_t FinalAddress, + uint64_t Value, + uint32_t Type, + int64_t Addend) { + switch (Type) { + default: + llvm_unreachable("Relocation type not implemented yet!"); + break; case ELF::R_X86_64_64: { - uint8_t **Target = reinterpret_cast<uint8_t**>(TargetAddr); - *Target = Addr + RE.Addend; + uint64_t *Target = (uint64_t*)(LocalAddress); + *Target = Value + Addend; break; } case ELF::R_X86_64_32: case ELF::R_X86_64_32S: { - uint64_t Value = reinterpret_cast<uint64_t>(Addr) + RE.Addend; + Value += Addend; // FIXME: Handle the possibility of this assertion failing - assert((RE.Type == ELF::R_X86_64_32 && !(Value & 0xFFFFFFFF00000000ULL)) || - (RE.Type == ELF::R_X86_64_32S && + assert((Type == ELF::R_X86_64_32 && !(Value & 0xFFFFFFFF00000000ULL)) || + (Type == ELF::R_X86_64_32S && (Value & 0xFFFFFFFF00000000ULL) == 0xFFFFFFFF00000000ULL)); uint32_t TruncatedAddr = (Value & 0xFFFFFFFF); - uint32_t *Target = reinterpret_cast<uint32_t*>(TargetAddr); + uint32_t *Target = reinterpret_cast<uint32_t*>(LocalAddress); *Target = TruncatedAddr; break; } case ELF::R_X86_64_PC32: { - uint32_t *Placeholder = reinterpret_cast<uint32_t*>(TargetAddr); - uint64_t RealOffset = *Placeholder + - reinterpret_cast<uint64_t>(Addr) + - RE.Addend - reinterpret_cast<uint64_t>(TargetAddr); - assert((RealOffset & 0xFFFFFFFF) == RealOffset); - uint32_t TruncOffset = (RealOffset & 0xFFFFFFFF); + uint32_t *Placeholder = reinterpret_cast<uint32_t*>(LocalAddress); + int64_t RealOffset = *Placeholder + Value + Addend - FinalAddress; + assert(RealOffset <= 214783647 && RealOffset >= -214783648); + int32_t TruncOffset = (RealOffset & 0xFFFFFFFF); *Placeholder = TruncOffset; break; } } } -void RuntimeDyldELF::resolveX86Relocation(StringRef Name, - uint8_t *Addr, - const RelocationEntry &RE) { - uint8_t *TargetAddr; - if (RE.IsFunctionRelative) { - StringMap<SymbolLoc>::const_iterator Loc = SymbolTable.find(RE.Target); - assert(Loc != SymbolTable.end() && "Function for relocation not found"); - TargetAddr = - reinterpret_cast<uint8_t*>(Sections[Loc->second.first].base()) + - Loc->second.second + RE.Offset; - } else { - // FIXME: Get the address of the target section and add that to RE.Offset - llvm_unreachable("Non-function relocation not implemented yet!"); - } - - switch (RE.Type) { +void RuntimeDyldELF::resolveX86Relocation(uint8_t *LocalAddress, + uint32_t FinalAddress, + uint32_t Value, + uint32_t Type, + int32_t Addend) { + switch (Type) { case ELF::R_386_32: { - uint8_t **Target 
= reinterpret_cast<uint8_t**>(TargetAddr); - *Target = Addr + RE.Addend; + uint32_t *Target = (uint32_t*)(LocalAddress); + *Target = Value + Addend; break; } case ELF::R_386_PC32: { - uint32_t *Placeholder = reinterpret_cast<uint32_t*>(TargetAddr); - uint32_t RealOffset = *Placeholder + reinterpret_cast<uintptr_t>(Addr) + - RE.Addend - reinterpret_cast<uintptr_t>(TargetAddr); + uint32_t *Placeholder = reinterpret_cast<uint32_t*>(LocalAddress); + uint32_t RealOffset = *Placeholder + Value + Addend - FinalAddress; *Placeholder = RealOffset; break; } @@ -248,57 +84,173 @@ void RuntimeDyldELF::resolveX86Relocation(StringRef Name, // There are other relocation types, but it appears these are the // only ones currently used by the LLVM ELF object writer llvm_unreachable("Relocation type not implemented yet!"); + break; } } -void RuntimeDyldELF::resolveArmRelocation(StringRef Name, - uint8_t *Addr, - const RelocationEntry &RE) { +void RuntimeDyldELF::resolveARMRelocation(uint8_t *LocalAddress, + uint32_t FinalAddress, + uint32_t Value, + uint32_t Type, + int32_t Addend) { + // TODO: Add Thumb relocations. + uint32_t* TargetPtr = (uint32_t*)LocalAddress; + Value += Addend; + + DEBUG(dbgs() << "resolveARMRelocation, LocalAddress: " << LocalAddress + << " FinalAddress: " << format("%p",FinalAddress) + << " Value: " << format("%x",Value) + << " Type: " << format("%x",Type) + << " Addend: " << format("%x",Addend) + << "\n"); + + switch(Type) { + default: + llvm_unreachable("Not implemented relocation type!"); + + // Just write 32bit value to relocation address + case ELF::R_ARM_ABS32 : + *TargetPtr = Value; + break; + + // Write first 16 bit of 32 bit value to the mov instruction. + // Last 4 bit should be shifted. + case ELF::R_ARM_MOVW_ABS_NC : + Value = Value & 0xFFFF; + *TargetPtr |= Value & 0xFFF; + *TargetPtr |= ((Value >> 12) & 0xF) << 16; + break; + + // Write last 16 bit of 32 bit value to the mov instruction. + // Last 4 bit should be shifted. + case ELF::R_ARM_MOVT_ABS : + Value = (Value >> 16) & 0xFFFF; + *TargetPtr |= Value & 0xFFF; + *TargetPtr |= ((Value >> 12) & 0xF) << 16; + break; + + // Write 24 bit relative value to the branch instruction. + case ELF::R_ARM_PC24 : // Fall through. + case ELF::R_ARM_CALL : // Fall through. + case ELF::R_ARM_JUMP24 : + int32_t RelValue = static_cast<int32_t>(Value - FinalAddress - 8); + RelValue = (RelValue & 0x03FFFFFC) >> 2; + *TargetPtr &= 0xFF000000; + *TargetPtr |= RelValue; + break; + } } -void RuntimeDyldELF::resolveRelocation(StringRef Name, - uint8_t *Addr, - const RelocationEntry &RE) { +void RuntimeDyldELF::resolveRelocation(uint8_t *LocalAddress, + uint64_t FinalAddress, + uint64_t Value, + uint32_t Type, + int64_t Addend) { switch (Arch) { case Triple::x86_64: - resolveX86_64Relocation(Name, Addr, RE); + resolveX86_64Relocation(LocalAddress, FinalAddress, Value, Type, Addend); break; case Triple::x86: - resolveX86Relocation(Name, Addr, RE); + resolveX86Relocation(LocalAddress, (uint32_t)(FinalAddress & 0xffffffffL), + (uint32_t)(Value & 0xffffffffL), Type, + (uint32_t)(Addend & 0xffffffffL)); break; - case Triple::arm: - resolveArmRelocation(Name, Addr, RE); + case Triple::arm: // Fall through. + case Triple::thumb: + resolveARMRelocation(LocalAddress, (uint32_t)(FinalAddress & 0xffffffffL), + (uint32_t)(Value & 0xffffffffL), Type, + (uint32_t)(Addend & 0xffffffffL)); break; default: llvm_unreachable("Unsupported CPU type!"); } } -void RuntimeDyldELF::reassignSymbolAddress(StringRef Name, uint8_t *Addr) { - // FIXME: deprecated. 
switch to reassignSectionAddress() instead. - // - // Actually moving the symbol address requires by-section mapping. - assert(Sections[SymbolTable.lookup(Name).first].base() == (void*)Addr && - "Unable to relocate section in by-function JIT allocation model!"); - - RelocationList &Relocs = Relocations[Name]; - for (unsigned i = 0, e = Relocs.size(); i != e; ++i) { - RelocationEntry &RE = Relocs[i]; - resolveRelocation(Name, Addr, RE); +void RuntimeDyldELF:: +processRelocationRef(const ObjRelocationInfo &Rel, const ObjectFile &Obj, + ObjSectionToIDMap &ObjSectionToID, + LocalSymbolMap &Symbols, StubMap &Stubs) { + + uint32_t RelType = (uint32_t)(Rel.Type & 0xffffffffL); + intptr_t Addend = (intptr_t)Rel.AdditionalInfo; + RelocationValueRef Value; + StringRef TargetName; + const SymbolRef &Symbol = Rel.Symbol; + Symbol.getName(TargetName); + DEBUG(dbgs() << "\t\tRelType: " << RelType + << " Addend: " << Addend + << " TargetName: " << TargetName + << "\n"); + // First look the symbol in object file symbols. + LocalSymbolMap::iterator it = Symbols.find(TargetName.data()); + if (it != Symbols.end()) { + Value.SectionID = it->second.first; + Value.Addend = it->second.second; + } else { + // Second look the symbol in global symbol table. + StringMap<SymbolLoc>::iterator itS = SymbolTable.find(TargetName.data()); + if (itS != SymbolTable.end()) { + Value.SectionID = itS->second.first; + Value.Addend = itS->second.second; + } else { + SymbolRef::Type SymType; + Symbol.getType(SymType); + switch (SymType) { + case SymbolRef::ST_Debug: { + // TODO: Now ELF SymbolRef::ST_Debug = STT_SECTION, it's not obviously + // and can be changed by another developers. Maybe best way is add + // a new symbol type ST_Section to SymbolRef and use it. + section_iterator sIt = Obj.end_sections(); + Symbol.getSection(sIt); + if (sIt == Obj.end_sections()) + llvm_unreachable("Symbol section not found, bad object file format!"); + DEBUG(dbgs() << "\t\tThis is section symbol\n"); + Value.SectionID = findOrEmitSection((*sIt), true, ObjSectionToID); + Value.Addend = Addend; + break; + } + case SymbolRef::ST_Unknown: { + Value.SymbolName = TargetName.data(); + Value.Addend = Addend; + break; + } + default: + llvm_unreachable("Unresolved symbol type!"); + break; + } + } } -} - -// Assign an address to a symbol name and resolve all the relocations -// associated with it. -void RuntimeDyldELF::reassignSectionAddress(unsigned SectionID, uint64_t Addr) { - // The address to use for relocation resolution is not - // the address of the local section buffer. We must be doing - // a remote execution environment of some sort. Re-apply any - // relocations referencing this section with the given address. - // - // Addr is a uint64_t because we can't assume the pointer width - // of the target is the same as that of the host. Just use a generic - // "big enough" type. - assert(0); + DEBUG(dbgs() << "\t\tRel.SectionID: " << Rel.SectionID + << " Rel.Offset: " << Rel.Offset + << "\n"); + if (Arch == Triple::arm && + (RelType == ELF::R_ARM_PC24 || + RelType == ELF::R_ARM_CALL || + RelType == ELF::R_ARM_JUMP24)) { + // This is an ARM branch relocation, need to use a stub function. + DEBUG(dbgs() << "\t\tThis is an ARM branch relocation."); + SectionEntry &Section = Sections[Rel.SectionID]; + uint8_t *Target = Section.Address + Rel.Offset; + + // Look up for existing stub. 
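
The ARM branch relocations special-cased here (R_ARM_PC24, R_ARM_CALL, R_ARM_JUMP24) encode a signed 24-bit word offset from PC+8, which only reaches about +/-32 MB; that limited range is why out-of-range targets are routed through the far stub sketched earlier. A small sketch of the in-range encoding, matching what resolveARMRelocation does above and assuming 4-byte-aligned addresses in ARM mode (the helper name is illustrative):

    #include <cassert>
    #include <cstdint>

    // Patch the low 24 bits of an ARM B/BL instruction with the word offset
    // from the branch (PC+8) to Target. The offset is counted in words.
    static void encodeARMBranch(uint32_t *Insn, uint32_t BranchAddr,
                                uint32_t Target) {
      int32_t Offset = static_cast<int32_t>(Target - BranchAddr - 8);
      assert(Offset >= -(1 << 25) && Offset < (1 << 25) &&
             "branch target out of range; a far stub is required");
      uint32_t Imm24 = (static_cast<uint32_t>(Offset) >> 2) & 0x00FFFFFF;
      *Insn = (*Insn & 0xFF000000) | Imm24;  // keep cond/opcode bits, set offset
    }
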
+ StubMap::const_iterator stubIt = Stubs.find(Value); + if (stubIt != Stubs.end()) { + resolveRelocation(Target, Section.LoadAddress, (uint64_t)Section.Address + + stubIt->second, RelType, 0); + DEBUG(dbgs() << " Stub function found\n"); + } else { + // Create a new stub function. + DEBUG(dbgs() << " Create a new stub function\n"); + Stubs[Value] = Section.StubOffset; + uint8_t *StubTargetAddr = createStubFunction(Section.Address + + Section.StubOffset); + AddRelocation(Value, Rel.SectionID, + StubTargetAddr - Section.Address, ELF::R_ARM_ABS32); + resolveRelocation(Target, Section.LoadAddress, (uint64_t)Section.Address + + Section.StubOffset, RelType, 0); + Section.StubOffset += getMaxStubSize(); + } + } else + AddRelocation(Value, Rel.SectionID, Rel.Offset, RelType); } bool RuntimeDyldELF::isCompatibleFormat(const MemoryBuffer *InputBuffer) const { diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h index e0f7d54..36566da 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h @@ -21,158 +21,42 @@ using namespace llvm; namespace llvm { class RuntimeDyldELF : public RuntimeDyldImpl { - // For each symbol, keep a list of relocations based on it. Anytime - // its address is reassigned (the JIT re-compiled the function, e.g.), - // the relocations get re-resolved. - struct RelocationEntry { - // Function or section this relocation is contained in. - std::string Target; - // Offset into the target function or section for the relocation. - uint32_t Offset; - // Relocation type - uint32_t Type; - // Addend encoded in the instruction itself, if any. - int32_t Addend; - // Has the relocation been recalcuated as an offset within a function? - bool IsFunctionRelative; - // Has this relocation been resolved previously? - bool isResolved; - - RelocationEntry(StringRef t, - uint32_t offset, - uint32_t type, - int32_t addend, - bool isFunctionRelative) - : Target(t) - , Offset(offset) - , Type(type) - , Addend(addend) - , IsFunctionRelative(isFunctionRelative) - , isResolved(false) { } - }; - typedef SmallVector<RelocationEntry, 4> RelocationList; - StringMap<RelocationList> Relocations; - unsigned Arch; - - void resolveRelocations(); - - void resolveX86_64Relocation(StringRef Name, - uint8_t *Addr, - const RelocationEntry &RE); - - void resolveX86Relocation(StringRef Name, - uint8_t *Addr, - const RelocationEntry &RE); - - void resolveArmRelocation(StringRef Name, - uint8_t *Addr, - const RelocationEntry &RE); - - void resolveRelocation(StringRef Name, - uint8_t *Addr, - const RelocationEntry &RE); - -public: - RuntimeDyldELF(RTDyldMemoryManager *mm) : RuntimeDyldImpl(mm) {} - - bool loadObject(MemoryBuffer *InputBuffer); - - void reassignSymbolAddress(StringRef Name, uint8_t *Addr); - void reassignSectionAddress(unsigned SectionID, uint64_t Addr); - - bool isCompatibleFormat(const MemoryBuffer *InputBuffer) const; -}; - -} // end namespace llvm - -#endif - -//===-- RuntimeDyldELF.h - Run-time dynamic linker for MC-JIT ---*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// ELF support for MC-JIT runtime dynamic linker. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_RUNTIME_DYLD_ELF_H -#define LLVM_RUNTIME_DYLD_ELF_H - -#include "RuntimeDyldImpl.h" - -using namespace llvm; - - -namespace llvm { -class RuntimeDyldELF : public RuntimeDyldImpl { - // For each symbol, keep a list of relocations based on it. Anytime - // its address is reassigned (the JIT re-compiled the function, e.g.), - // the relocations get re-resolved. - struct RelocationEntry { - // Function or section this relocation is contained in. - std::string Target; - // Offset into the target function or section for the relocation. - uint32_t Offset; - // Relocation type - uint32_t Type; - // Addend encoded in the instruction itself, if any. - int32_t Addend; - // Has the relocation been recalcuated as an offset within a function? - bool IsFunctionRelative; - // Has this relocation been resolved previously? - bool isResolved; - - RelocationEntry(StringRef t, - uint32_t offset, - uint32_t type, - int32_t addend, - bool isFunctionRelative) - : Target(t) - , Offset(offset) - , Type(type) - , Addend(addend) - , IsFunctionRelative(isFunctionRelative) - , isResolved(false) { } - }; - typedef SmallVector<RelocationEntry, 4> RelocationList; - StringMap<RelocationList> Relocations; - unsigned Arch; - - void resolveRelocations(); - - void resolveX86_64Relocation(StringRef Name, - uint8_t *Addr, - const RelocationEntry &RE); - - void resolveX86Relocation(StringRef Name, - uint8_t *Addr, - const RelocationEntry &RE); - - void resolveArmRelocation(StringRef Name, - uint8_t *Addr, - const RelocationEntry &RE); - - void resolveRelocation(StringRef Name, - uint8_t *Addr, - const RelocationEntry &RE); +protected: + void resolveX86_64Relocation(uint8_t *LocalAddress, + uint64_t FinalAddress, + uint64_t Value, + uint32_t Type, + int64_t Addend); + + void resolveX86Relocation(uint8_t *LocalAddress, + uint32_t FinalAddress, + uint32_t Value, + uint32_t Type, + int32_t Addend); + + void resolveARMRelocation(uint8_t *LocalAddress, + uint32_t FinalAddress, + uint32_t Value, + uint32_t Type, + int32_t Addend); + + virtual void resolveRelocation(uint8_t *LocalAddress, + uint64_t FinalAddress, + uint64_t Value, + uint32_t Type, + int64_t Addend); + + virtual void processRelocationRef(const ObjRelocationInfo &Rel, + const ObjectFile &Obj, + ObjSectionToIDMap &ObjSectionToID, + LocalSymbolMap &Symbols, StubMap &Stubs); public: RuntimeDyldELF(RTDyldMemoryManager *mm) : RuntimeDyldImpl(mm) {} - bool loadObject(MemoryBuffer *InputBuffer); - - void reassignSymbolAddress(StringRef Name, uint8_t *Addr); - void reassignSectionAddress(unsigned SectionID, uint64_t Addr); - bool isCompatibleFormat(const MemoryBuffer *InputBuffer) const; }; } // end namespace llvm -#endif - +#endif diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h index 28e99be..d6430a9 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h @@ -15,45 +15,125 @@ #define LLVM_RUNTIME_DYLD_IMPL_H #include "llvm/ExecutionEngine/RuntimeDyld.h" +#include "llvm/Object/ObjectFile.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/Twine.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ExecutionEngine/ExecutionEngine.h" #include "llvm/Support/Memory.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/system_error.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/Debug.h" #include 
"llvm/Support/ErrorHandling.h" +#include "llvm/ADT/Triple.h" +#include <map> +#include "llvm/Support/Format.h" using namespace llvm; +using namespace llvm::object; namespace llvm { + +class SectionEntry { +public: + uint8_t* Address; + size_t Size; + uint64_t LoadAddress; // For each section, the address it will be + // considered to live at for relocations. The same + // as the pointer to the above memory block for + // hosted JITs. + uintptr_t StubOffset; // It's used for architecturies with stub + // functions for far relocations like ARM. + uintptr_t ObjAddress; // Section address in object file. It's use for + // calculate MachO relocation addend + SectionEntry(uint8_t* address, size_t size, uintptr_t stubOffset, + uintptr_t objAddress) + : Address(address), Size(size), LoadAddress((uintptr_t)address), + StubOffset(stubOffset), ObjAddress(objAddress) {} +}; + +class RelocationEntry { +public: + unsigned SectionID; // Section the relocation is contained in. + uintptr_t Offset; // Offset into the section for the relocation. + uint32_t Data; // Relocatino data. Including type of relocation + // and another flags and parameners from + intptr_t Addend; // Addend encoded in the instruction itself, if any, + // plus the offset into the source section for + // the symbol once the relocation is resolvable. + RelocationEntry(unsigned id, uint64_t offset, uint32_t data, int64_t addend) + : SectionID(id), Offset(offset), Data(data), Addend(addend) {} +}; + +// Raw relocation data from object file +class ObjRelocationInfo { +public: + unsigned SectionID; + uint64_t Offset; + SymbolRef Symbol; + uint64_t Type; + int64_t AdditionalInfo; +}; + +class RelocationValueRef { +public: + unsigned SectionID; + intptr_t Addend; + const char *SymbolName; + RelocationValueRef(): SectionID(0), Addend(0), SymbolName(0) {} + + inline bool operator==(const RelocationValueRef &Other) const { + return std::memcmp(this, &Other, sizeof(RelocationValueRef)) == 0; + } + inline bool operator <(const RelocationValueRef &Other) const { + return std::memcmp(this, &Other, sizeof(RelocationValueRef)) < 0; + } +}; + class RuntimeDyldImpl { protected: - unsigned CPUType; - unsigned CPUSubtype; - // The MemoryManager to load objects into. RTDyldMemoryManager *MemMgr; - // For each section, we have a MemoryBlock of it's data. - // Indexed by SectionID. - SmallVector<sys::MemoryBlock, 32> Sections; - // For each section, the address it will be considered to live at for - // relocations. The same as the pointer to the above memory block for hosted - // JITs. Indexed by SectionID. - SmallVector<uint64_t, 32> SectionLoadAddress; + // A list of emmitted sections. + typedef SmallVector<SectionEntry, 64> SectionList; + SectionList Sections; - // Keep a map of starting local address to the SectionID which references it. - // Lookup function for when we assign virtual addresses. - DenseMap<void *, unsigned> SectionLocalMemToID; + // Keep a map of sections from object file to the SectionID which + // references it. + typedef std::map<SectionRef, unsigned> ObjSectionToIDMap; // Master symbol table. As modules are loaded and external symbols are // resolved, their addresses are stored here as a SectionID/Offset pair. - typedef std::pair<unsigned, uint64_t> SymbolLoc; + typedef std::pair<unsigned, uintptr_t> SymbolLoc; StringMap<SymbolLoc> SymbolTable; + typedef DenseMap<const char*, SymbolLoc> LocalSymbolMap; + + // For each symbol, keep a list of relocations based on it. 
+  // For each symbol, keep a list of relocations based on it. Anytime
+  // its address is reassigned (the JIT re-compiled the function, e.g.),
+  // the relocations get re-resolved.
+  // The symbol (or section) the relocation is sourced from is the key
+  // under which the list is stored.
+  typedef SmallVector<RelocationEntry, 64> RelocationList;
+  // Relocations to sections already loaded. Indexed by SectionID which is the
+  // source of the address. The target where the address will be written is
+  // SectionID/Offset in the relocation itself.
+  DenseMap<unsigned, RelocationList> Relocations;
+  // Relocations to external symbols that are not yet resolved.
+  // Indexed by symbol name.
+  StringMap<RelocationList> SymbolRelocations;
+
+  typedef std::map<RelocationValueRef, uintptr_t> StubMap;
+
+  Triple::ArchType Arch;
+
+  inline unsigned getMaxStubSize() {
+    if (Arch == Triple::arm || Arch == Triple::thumb)
+      return 8; // 32-bit instruction and 32-bit address
+    else
+      return 0;
+  }
 
   bool HasError;
   std::string ErrorStr;
@@ -66,17 +146,62 @@ protected:
   }
 
   uint8_t *getSectionAddress(unsigned SectionID) {
-    return (uint8_t*)Sections[SectionID].base();
+    return (uint8_t*)Sections[SectionID].Address;
   }
 
-  void extractFunction(StringRef Name, uint8_t *StartAddress,
-                       uint8_t *EndAddress);
+  /// \brief Emits section data from the object file to the MemoryManager.
+  /// \param IsCode If true, allocateCodeSection() is used for the allocation;
+  ///        otherwise allocateDataSection() is used.
+  /// \return SectionID.
+  unsigned emitSection(const SectionRef &Section, bool IsCode);
+
+  /// \brief Find Section in LocalSections. If the section is not found, emit
+  ///        it and store it in LocalSections.
+  /// \param IsCode If true, allocateCodeSection() is used for the allocation;
+  ///        otherwise allocateDataSection() is used.
+  /// \return SectionID.
+  unsigned findOrEmitSection(const SectionRef &Section, bool IsCode,
+                             ObjSectionToIDMap &LocalSections);
+
+  /// \brief If Value.SymbolName is NULL, store the relocation in Relocations;
+  ///        otherwise store it in SymbolRelocations.
+  void AddRelocation(const RelocationValueRef &Value, unsigned SectionID,
+                     uintptr_t Offset, uint32_t RelType);
+
+  /// \brief Emits a far-jump stub instruction at Addr.
+  /// \return Pointer to the memory slot where the target address should be
+  ///         written.
+  uint8_t* createStubFunction(uint8_t *Addr);
+
+  /// \brief Resolves the relocations in Relocs using the address in Value.
+  void resolveRelocationList(const RelocationList &Relocs, uint64_t Value);
+  void resolveRelocationEntry(const RelocationEntry &RE, uint64_t Value);
+
+  /// \brief An object-format-specific relocation resolver.
+  /// \param LocalAddress The address, in host memory, at which to apply the
+  ///        relocation
+  /// \param FinalAddress The address the relocated word will have in the
+  ///        target process
+  /// \param Value Target symbol address to apply the relocation action
+  /// \param Type Object-file-specific relocation type
+  /// \param Addend A constant addend used to compute the value to be stored
+  ///        into the relocatable field
+  virtual void resolveRelocation(uint8_t *LocalAddress,
+                                 uint64_t FinalAddress,
+                                 uint64_t Value,
+                                 uint32_t Type,
+                                 int64_t Addend) = 0;
+
+  /// \brief Parses an object file relocation and stores it in Relocations or
+  ///        SymbolRelocations. The handling depends on the object file format.
+ virtual void processRelocationRef(const ObjRelocationInfo &Rel, + const ObjectFile &Obj, + ObjSectionToIDMap &ObjSectionToID, + LocalSymbolMap &Symbols, StubMap &Stubs) = 0; + + void resolveSymbols(); public: RuntimeDyldImpl(RTDyldMemoryManager *mm) : MemMgr(mm), HasError(false) {} virtual ~RuntimeDyldImpl(); - virtual bool loadObject(MemoryBuffer *InputBuffer) = 0; + bool loadObject(const MemoryBuffer *InputBuffer); void *getSymbolAddress(StringRef Name) { // FIXME: Just look up as a function for now. Overly simple of course. @@ -87,9 +212,9 @@ public: return getSectionAddress(Loc.first) + Loc.second; } - virtual void resolveRelocations(); + void resolveRelocations(); - virtual void reassignSectionAddress(unsigned SectionID, uint64_t Addr) = 0; + void reassignSectionAddress(unsigned SectionID, uint64_t Addr); void mapSectionAddress(void *LocalAddress, uint64_t TargetAddress); @@ -103,6 +228,7 @@ public: StringRef getErrorString() { return ErrorStr; } virtual bool isCompatibleFormat(const MemoryBuffer *InputBuffer) const = 0; + }; } // end namespace llvm diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp index c11b2c3..24437e0 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp @@ -21,30 +21,64 @@ using namespace llvm::object; namespace llvm { -bool RuntimeDyldMachO:: -resolveRelocation(uint8_t *Address, uint64_t Value, bool isPCRel, - unsigned Type, unsigned Size, int64_t Addend) { +void RuntimeDyldMachO::resolveRelocation(uint8_t *LocalAddress, + uint64_t FinalAddress, + uint64_t Value, + uint32_t Type, + int64_t Addend) { + bool isPCRel = (Type >> 24) & 1; + unsigned MachoType = (Type >> 28) & 0xf; + unsigned Size = 1 << ((Type >> 25) & 3); + + DEBUG(dbgs() << "resolveRelocation LocalAddress: " << format("%p", LocalAddress) + << " FinalAddress: " << format("%p", FinalAddress) + << " Value: " << format("%p", Value) + << " Addend: " << Addend + << " isPCRel: " << isPCRel + << " MachoType: " << MachoType + << " Size: " << Size + << "\n"); + // This just dispatches to the proper target specific routine. - switch (CPUType) { + switch (Arch) { default: llvm_unreachable("Unsupported CPU type!"); - case mach::CTM_x86_64: - return resolveX86_64Relocation((uintptr_t)Address, (uintptr_t)Value, - isPCRel, Type, Size, Addend); - case mach::CTM_ARM: - return resolveARMRelocation((uintptr_t)Address, (uintptr_t)Value, - isPCRel, Type, Size, Addend); + case Triple::x86_64: // Fall through. + case Triple::x86: + resolveX86_64Relocation(LocalAddress, + FinalAddress, + (uintptr_t)Value, + isPCRel, + MachoType, + Size, + Addend); + break; + case Triple::arm: // Fall through. + case Triple::thumb: + resolveARMRelocation(LocalAddress, + FinalAddress, + (uintptr_t)Value, + isPCRel, + MachoType, + Size, + Addend); + break; } } bool RuntimeDyldMachO:: -resolveX86_64Relocation(uintptr_t Address, uintptr_t Value, bool isPCRel, - unsigned Type, unsigned Size, int64_t Addend) { +resolveX86_64Relocation(uint8_t *LocalAddress, + uint64_t FinalAddress, + uint64_t Value, + bool isPCRel, + unsigned Type, + unsigned Size, + int64_t Addend) { // If the relocation is PC-relative, the value to be encoded is the // pointer difference. if (isPCRel) // FIXME: It seems this value needs to be adjusted by 4 for an effective PC // address. Is that expected? Only for branches, perhaps? 
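
The packed Type value handed to the MachO resolver above is the second word of a Mach-O relocation entry, so the PC-relative flag, the log2 length, and the relocation kind are all recovered by shifting, exactly as resolveRelocation does before dispatching to the per-architecture routines. A small sketch of that unpacking (the struct and field names here are illustrative):

    #include <cstdint>

    struct MachORelocInfo {
      bool     IsPCRel;  // bit 24 of the packed word
      unsigned Size;     // 1, 2, 4 or 8 bytes (bits 25-26 hold log2 of the size)
      unsigned Kind;     // relocation type (bits 28-31)
    };

    static MachORelocInfo unpackRelocWord(uint32_t Word) {
      MachORelocInfo R;
      R.IsPCRel = (Word >> 24) & 1;
      R.Size    = 1u << ((Word >> 25) & 3);
      R.Kind    = (Word >> 28) & 0xf;
      return R;
    }
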
- Value -= Address + 4; + Value -= FinalAddress + 4; switch(Type) { default: @@ -58,7 +92,7 @@ resolveX86_64Relocation(uintptr_t Address, uintptr_t Value, bool isPCRel, Value += Addend; // Mask in the target value a byte at a time (we don't have an alignment // guarantee for the target address, so this is safest). - uint8_t *p = (uint8_t*)Address; + uint8_t *p = (uint8_t*)LocalAddress; for (unsigned i = 0; i < Size; ++i) { *p++ = (uint8_t)Value; Value >>= 8; @@ -74,12 +108,17 @@ resolveX86_64Relocation(uintptr_t Address, uintptr_t Value, bool isPCRel, } bool RuntimeDyldMachO:: -resolveARMRelocation(uintptr_t Address, uintptr_t Value, bool isPCRel, - unsigned Type, unsigned Size, int64_t Addend) { +resolveARMRelocation(uint8_t *LocalAddress, + uint64_t FinalAddress, + uint64_t Value, + bool isPCRel, + unsigned Type, + unsigned Size, + int64_t Addend) { // If the relocation is PC-relative, the value to be encoded is the // pointer difference. if (isPCRel) { - Value -= Address; + Value -= FinalAddress; // ARM PCRel relocations have an effective-PC offset of two instructions // (four bytes in Thumb mode, 8 bytes in ARM mode). // FIXME: For now, assume ARM mode. @@ -92,7 +131,7 @@ resolveARMRelocation(uintptr_t Address, uintptr_t Value, bool isPCRel, case macho::RIT_Vanilla: { // Mask in the target value a byte at a time (we don't have an alignment // guarantee for the target address, so this is safest). - uint8_t *p = (uint8_t*)Address; + uint8_t *p = (uint8_t*)LocalAddress; for (unsigned i = 0; i < Size; ++i) { *p++ = (uint8_t)Value; Value >>= 8; @@ -102,7 +141,7 @@ resolveARMRelocation(uintptr_t Address, uintptr_t Value, bool isPCRel, case macho::RIT_ARM_Branch24Bit: { // Mask the value into the target address. We know instructions are // 32-bit aligned, so we can do it all at once. - uint32_t *p = (uint32_t*)Address; + uint32_t *p = (uint32_t*)LocalAddress; // The low two bits of the value are not encoded. Value >>= 2; // Mask the value to 24 bits. @@ -128,463 +167,83 @@ resolveARMRelocation(uintptr_t Address, uintptr_t Value, bool isPCRel, return false; } -bool RuntimeDyldMachO:: -loadSegment32(const MachOObject *Obj, - const MachOObject::LoadCommandInfo *SegmentLCI, - const InMemoryStruct<macho::SymtabLoadCommand> &SymtabLC) { - // FIXME: This should really be combined w/ loadSegment64. Templatized - // function on the 32/64 datatypes maybe? - InMemoryStruct<macho::SegmentLoadCommand> SegmentLC; - Obj->ReadSegmentLoadCommand(*SegmentLCI, SegmentLC); - if (!SegmentLC) - return Error("unable to load segment load command"); - - - SmallVector<unsigned, 16> SectionMap; - for (unsigned SectNum = 0; SectNum != SegmentLC->NumSections; ++SectNum) { - InMemoryStruct<macho::Section> Sect; - Obj->ReadSection(*SegmentLCI, SectNum, Sect); - if (!Sect) - return Error("unable to load section: '" + Twine(SectNum) + "'"); - - // Allocate memory via the MM for the section. - uint8_t *Buffer; - uint32_t SectionID = Sections.size(); - if (Sect->Flags == 0x80000400) - Buffer = MemMgr->allocateCodeSection(Sect->Size, Sect->Align, SectionID); - else - Buffer = MemMgr->allocateDataSection(Sect->Size, Sect->Align, SectionID); - - DEBUG(dbgs() << "Loading " - << ((Sect->Flags == 0x80000400) ? "text" : "data") - << " (ID #" << SectionID << ")" - << " '" << Sect->SegmentName << "," - << Sect->Name << "' of size " << Sect->Size - << " to address " << Buffer << ".\n"); - - // Copy the payload from the object file into the allocated buffer. 
- uint8_t *Base = (uint8_t*)Obj->getData(SegmentLC->FileOffset, - SegmentLC->FileSize).data(); - memcpy(Buffer, Base + Sect->Address, Sect->Size); - - // Remember what got allocated for this SectionID. - Sections.push_back(sys::MemoryBlock(Buffer, Sect->Size)); - SectionLocalMemToID[Buffer] = SectionID; - - // By default, the load address of a section is its memory buffer. - SectionLoadAddress.push_back((uint64_t)Buffer); - - // Keep a map of object file section numbers to corresponding SectionIDs - // while processing the file. - SectionMap.push_back(SectionID); - } - - // Process the symbol table. - SmallVector<StringRef, 64> SymbolNames; - processSymbols32(Obj, SectionMap, SymbolNames, SymtabLC); - - // Process the relocations for each section we're loading. - Relocations.grow(Relocations.size() + SegmentLC->NumSections); - for (unsigned SectNum = 0; SectNum != SegmentLC->NumSections; ++SectNum) { - InMemoryStruct<macho::Section> Sect; - Obj->ReadSection(*SegmentLCI, SectNum, Sect); - if (!Sect) - return Error("unable to load section: '" + Twine(SectNum) + "'"); - for (unsigned j = 0; j != Sect->NumRelocationTableEntries; ++j) { - InMemoryStruct<macho::RelocationEntry> RE; - Obj->ReadRelocationEntry(Sect->RelocationTableOffset, j, RE); - if (RE->Word0 & macho::RF_Scattered) - return Error("NOT YET IMPLEMENTED: scattered relocations."); - // Word0 of the relocation is the offset into the section where the - // relocation should be applied. We need to translate that into an - // offset into a function since that's our atom. - uint32_t Offset = RE->Word0; - bool isExtern = (RE->Word1 >> 27) & 1; - - // FIXME: Get the relocation addend from the target address. - // FIXME: VERY imporant for internal relocations. - - // Figure out the source symbol of the relocation. If isExtern is true, - // this relocation references the symbol table, otherwise it references - // a section in the same object, numbered from 1 through NumSections - // (SectionBases is [0, NumSections-1]). - uint32_t SourceNum = RE->Word1 & 0xffffff; // 24-bit value - if (!isExtern) { - assert(SourceNum > 0 && "Invalid relocation section number!"); - unsigned SectionID = SectionMap[SourceNum - 1]; - unsigned TargetID = SectionMap[SectNum]; - DEBUG(dbgs() << "Internal relocation at Section #" - << TargetID << " + " << Offset - << " from Section #" - << SectionID << " (Word1: " - << format("0x%x", RE->Word1) << ")\n"); - - // Store the relocation information. It will get resolved when - // the section addresses are assigned. - Relocations[SectionID].push_back(RelocationEntry(TargetID, - Offset, - RE->Word1, - 0 /*Addend*/)); - } else { - StringRef SourceName = SymbolNames[SourceNum]; - - // Now store the relocation information. Associate it with the source - // symbol. Just add it to the unresolved list and let the general - // path post-load resolve it if we know where the symbol is. 
- UnresolvedRelocations[SourceName].push_back(RelocationEntry(SectNum, - Offset, - RE->Word1, - 0 /*Addend*/)); - DEBUG(dbgs() << "Relocation at Section #" << SectNum << " + " << Offset - << " from '" << SourceName << "(Word1: " - << format("0x%x", RE->Word1) << ")\n"); - } +void RuntimeDyldMachO:: +processRelocationRef(const ObjRelocationInfo &Rel, const ObjectFile &Obj, + ObjSectionToIDMap &ObjSectionToID, + LocalSymbolMap &Symbols, StubMap &Stubs) { + + uint32_t RelType = (uint32_t) (Rel.Type & 0xffffffffL); + RelocationValueRef Value; + SectionEntry &Section = Sections[Rel.SectionID]; + uint8_t *Target = Section.Address + Rel.Offset; + + bool isExtern = (RelType >> 27) & 1; + if (isExtern) { + StringRef TargetName; + const SymbolRef &Symbol = Rel.Symbol; + Symbol.getName(TargetName); + // First look the symbol in object file symbols. + LocalSymbolMap::iterator it = Symbols.find(TargetName.data()); + if (it != Symbols.end()) { + Value.SectionID = it->second.first; + Value.Addend = it->second.second; + } else { + // Second look the symbol in global symbol table. + StringMap<SymbolLoc>::iterator itS = SymbolTable.find(TargetName.data()); + if (itS != SymbolTable.end()) { + Value.SectionID = itS->second.first; + Value.Addend = itS->second.second; + } else + Value.SymbolName = TargetName.data(); } - } - - // Resolve the addresses of any symbols that were defined in this segment. - for (int i = 0, e = SymbolNames.size(); i != e; ++i) - resolveSymbol(SymbolNames[i]); - - return false; -} - - -bool RuntimeDyldMachO:: -loadSegment64(const MachOObject *Obj, - const MachOObject::LoadCommandInfo *SegmentLCI, - const InMemoryStruct<macho::SymtabLoadCommand> &SymtabLC) { - InMemoryStruct<macho::Segment64LoadCommand> Segment64LC; - Obj->ReadSegment64LoadCommand(*SegmentLCI, Segment64LC); - if (!Segment64LC) - return Error("unable to load segment load command"); - - - SmallVector<unsigned, 16> SectionMap; - for (unsigned SectNum = 0; SectNum != Segment64LC->NumSections; ++SectNum) { - InMemoryStruct<macho::Section64> Sect; - Obj->ReadSection64(*SegmentLCI, SectNum, Sect); - if (!Sect) - return Error("unable to load section: '" + Twine(SectNum) + "'"); - - // Allocate memory via the MM for the section. - uint8_t *Buffer; - uint32_t SectionID = Sections.size(); - unsigned Align = 1 << Sect->Align; // .o file has log2 alignment. - if (Sect->Flags == 0x80000400) - Buffer = MemMgr->allocateCodeSection(Sect->Size, Align, SectionID); - else - Buffer = MemMgr->allocateDataSection(Sect->Size, Align, SectionID); - - DEBUG(dbgs() << "Loading " - << ((Sect->Flags == 0x80000400) ? "text" : "data") - << " (ID #" << SectionID << ")" - << " '" << Sect->SegmentName << "," - << Sect->Name << "' of size " << Sect->Size - << " (align " << Align << ")" - << " to address " << Buffer << ".\n"); - - // Copy the payload from the object file into the allocated buffer. - uint8_t *Base = (uint8_t*)Obj->getData(Segment64LC->FileOffset, - Segment64LC->FileSize).data(); - memcpy(Buffer, Base + Sect->Address, Sect->Size); - - // Remember what got allocated for this SectionID. - Sections.push_back(sys::MemoryBlock(Buffer, Sect->Size)); - SectionLocalMemToID[Buffer] = SectionID; - - // By default, the load address of a section is its memory buffer. - SectionLoadAddress.push_back((uint64_t)Buffer); - - // Keep a map of object file section numbers to corresponding SectionIDs - // while processing the file. - SectionMap.push_back(SectionID); - } - - // Process the symbol table. 
- SmallVector<StringRef, 64> SymbolNames; - processSymbols64(Obj, SectionMap, SymbolNames, SymtabLC); - - // Process the relocations for each section we're loading. - Relocations.grow(Relocations.size() + Segment64LC->NumSections); - for (unsigned SectNum = 0; SectNum != Segment64LC->NumSections; ++SectNum) { - InMemoryStruct<macho::Section64> Sect; - Obj->ReadSection64(*SegmentLCI, SectNum, Sect); - if (!Sect) - return Error("unable to load section: '" + Twine(SectNum) + "'"); - for (unsigned j = 0; j != Sect->NumRelocationTableEntries; ++j) { - InMemoryStruct<macho::RelocationEntry> RE; - Obj->ReadRelocationEntry(Sect->RelocationTableOffset, j, RE); - if (RE->Word0 & macho::RF_Scattered) - return Error("NOT YET IMPLEMENTED: scattered relocations."); - // Word0 of the relocation is the offset into the section where the - // relocation should be applied. We need to translate that into an - // offset into a function since that's our atom. - uint32_t Offset = RE->Word0; - bool isExtern = (RE->Word1 >> 27) & 1; - - // FIXME: Get the relocation addend from the target address. - // FIXME: VERY imporant for internal relocations. - - // Figure out the source symbol of the relocation. If isExtern is true, - // this relocation references the symbol table, otherwise it references - // a section in the same object, numbered from 1 through NumSections - // (SectionBases is [0, NumSections-1]). - uint32_t SourceNum = RE->Word1 & 0xffffff; // 24-bit value - if (!isExtern) { - assert(SourceNum > 0 && "Invalid relocation section number!"); - unsigned SectionID = SectionMap[SourceNum - 1]; - unsigned TargetID = SectionMap[SectNum]; - DEBUG(dbgs() << "Internal relocation at Section #" - << TargetID << " + " << Offset - << " from Section #" - << SectionID << " (Word1: " - << format("0x%x", RE->Word1) << ")\n"); - - // Store the relocation information. It will get resolved when - // the section addresses are assigned. - Relocations[SectionID].push_back(RelocationEntry(TargetID, - Offset, - RE->Word1, - 0 /*Addend*/)); - } else { - StringRef SourceName = SymbolNames[SourceNum]; - - // Now store the relocation information. Associate it with the source - // symbol. Just add it to the unresolved list and let the general - // path post-load resolve it if we know where the symbol is. - UnresolvedRelocations[SourceName].push_back(RelocationEntry(SectNum, - Offset, - RE->Word1, - 0 /*Addend*/)); - DEBUG(dbgs() << "Relocation at Section #" << SectNum << " + " << Offset - << " from '" << SourceName << "(Word1: " - << format("0x%x", RE->Word1) << ")\n"); - } + } else { + error_code err; + uint8_t sIdx = static_cast<uint8_t>(RelType & 0xFF); + section_iterator sIt = Obj.begin_sections(), + sItEnd = Obj.end_sections(); + for (uint8_t i = 1; i < sIdx; i++) { + error_code err; + sIt.increment(err); + if (sIt == sItEnd) + break; } - } - - // Resolve the addresses of any symbols that were defined in this segment. - for (int i = 0, e = SymbolNames.size(); i != e; ++i) - resolveSymbol(SymbolNames[i]); - - return false; -} - -bool RuntimeDyldMachO:: -processSymbols32(const MachOObject *Obj, - SmallVectorImpl<unsigned> &SectionMap, - SmallVectorImpl<StringRef> &SymbolNames, - const InMemoryStruct<macho::SymtabLoadCommand> &SymtabLC) { - // FIXME: Combine w/ processSymbols64. Factor 64/32 datatype and such. 
- for (unsigned i = 0; i != SymtabLC->NumSymbolTableEntries; ++i) { - InMemoryStruct<macho::SymbolTableEntry> STE; - Obj->ReadSymbolTableEntry(SymtabLC->SymbolTableOffset, i, STE); - if (!STE) - return Error("unable to read symbol: '" + Twine(i) + "'"); - // Get the symbol name. - StringRef Name = Obj->getStringAtIndex(STE->StringIndex); - SymbolNames.push_back(Name); - - // FIXME: Check the symbol type and flags. - if (STE->Type != 0xF) // external, defined in this segment. - continue; - // Flags in the upper nibble we don't care about. - if ((STE->Flags & 0xf) != 0x0) - continue; - - // Remember the symbol. - uint32_t SectionID = SectionMap[STE->SectionIndex - 1]; - SymbolTable[Name] = SymbolLoc(SectionID, STE->Value); - - DEBUG(dbgs() << "Symbol: '" << Name << "' @ " - << (getSectionAddress(SectionID) + STE->Value) - << "\n"); - } - return false; -} - -bool RuntimeDyldMachO:: -processSymbols64(const MachOObject *Obj, - SmallVectorImpl<unsigned> &SectionMap, - SmallVectorImpl<StringRef> &SymbolNames, - const InMemoryStruct<macho::SymtabLoadCommand> &SymtabLC) { - for (unsigned i = 0; i != SymtabLC->NumSymbolTableEntries; ++i) { - InMemoryStruct<macho::Symbol64TableEntry> STE; - Obj->ReadSymbol64TableEntry(SymtabLC->SymbolTableOffset, i, STE); - if (!STE) - return Error("unable to read symbol: '" + Twine(i) + "'"); - // Get the symbol name. - StringRef Name = Obj->getStringAtIndex(STE->StringIndex); - SymbolNames.push_back(Name); - - // FIXME: Check the symbol type and flags. - if (STE->Type != 0xF) // external, defined in this segment. - continue; - // Flags in the upper nibble we don't care about. - if ((STE->Flags & 0xf) != 0x0) - continue; - - // Remember the symbol. - uint32_t SectionID = SectionMap[STE->SectionIndex - 1]; - SymbolTable[Name] = SymbolLoc(SectionID, STE->Value); - - DEBUG(dbgs() << "Symbol: '" << Name << "' @ " - << (getSectionAddress(SectionID) + STE->Value) - << "\n"); - } - return false; -} - -// resolveSymbol - Resolve any relocations to the specified symbol if -// we know where it lives. -void RuntimeDyldMachO::resolveSymbol(StringRef Name) { - StringMap<SymbolLoc>::const_iterator Loc = SymbolTable.find(Name); - if (Loc == SymbolTable.end()) - return; - - RelocationList &Relocs = UnresolvedRelocations[Name]; - DEBUG(dbgs() << "Resolving symbol '" << Name << "'\n"); - for (int i = 0, e = Relocs.size(); i != e; ++i) { - // Change the relocation to be section relative rather than symbol - // relative and move it to the resolved relocation list. - RelocationEntry Entry = Relocs[i]; - Entry.Addend += Loc->second.second; - Relocations[Loc->second.first].push_back(Entry); - } - // FIXME: Keep a worklist of the relocations we've added so that we can - // resolve more selectively later. - Relocs.clear(); -} - -bool RuntimeDyldMachO::loadObject(MemoryBuffer *InputBuffer) { - // If the linker is in an error state, don't do anything. - if (hasError()) - return true; - // Load the Mach-O wrapper object. - std::string ErrorStr; - OwningPtr<MachOObject> Obj( - MachOObject::LoadFromBuffer(InputBuffer, &ErrorStr)); - if (!Obj) - return Error("unable to load object: '" + ErrorStr + "'"); - - // Get the CPU type information from the header. - const macho::Header &Header = Obj->getHeader(); - - // FIXME: Error checking that the loaded object is compatible with - // the system we're running on. - CPUType = Header.CPUType; - CPUSubtype = Header.CPUSubtype; - - // Validate that the load commands match what we expect. 
- const MachOObject::LoadCommandInfo *SegmentLCI = 0, *SymtabLCI = 0, - *DysymtabLCI = 0; - for (unsigned i = 0; i != Header.NumLoadCommands; ++i) { - const MachOObject::LoadCommandInfo &LCI = Obj->getLoadCommandInfo(i); - switch (LCI.Command.Type) { - case macho::LCT_Segment: - case macho::LCT_Segment64: - if (SegmentLCI) - return Error("unexpected input object (multiple segments)"); - SegmentLCI = &LCI; - break; - case macho::LCT_Symtab: - if (SymtabLCI) - return Error("unexpected input object (multiple symbol tables)"); - SymtabLCI = &LCI; - break; - case macho::LCT_Dysymtab: - if (DysymtabLCI) - return Error("unexpected input object (multiple symbol tables)"); - DysymtabLCI = &LCI; - break; - default: - return Error("unexpected input object (unexpected load command"); + assert(sIt != sItEnd && "No section containing relocation!"); + Value.SectionID = findOrEmitSection(*sIt, true, ObjSectionToID); + Value.Addend = *(const intptr_t *)Target; + if (Value.Addend) { + // The MachO addend is offset from the current section, we need set it + // as offset from destination section + Value.Addend += Section.ObjAddress - Sections[Value.SectionID].ObjAddress; } } - if (!SymtabLCI) - return Error("no symbol table found in object"); - if (!SegmentLCI) - return Error("no segments found in object"); - - // Read and register the symbol table data. - InMemoryStruct<macho::SymtabLoadCommand> SymtabLC; - Obj->ReadSymtabLoadCommand(*SymtabLCI, SymtabLC); - if (!SymtabLC) - return Error("unable to load symbol table load command"); - Obj->RegisterStringTable(*SymtabLC); - - // Read the dynamic link-edit information, if present (not present in static - // objects). - if (DysymtabLCI) { - InMemoryStruct<macho::DysymtabLoadCommand> DysymtabLC; - Obj->ReadDysymtabLoadCommand(*DysymtabLCI, DysymtabLC); - if (!DysymtabLC) - return Error("unable to load dynamic link-exit load command"); - - // FIXME: We don't support anything interesting yet. -// if (DysymtabLC->LocalSymbolsIndex != 0) -// return Error("NOT YET IMPLEMENTED: local symbol entries"); -// if (DysymtabLC->ExternalSymbolsIndex != 0) -// return Error("NOT YET IMPLEMENTED: non-external symbol entries"); -// if (DysymtabLC->UndefinedSymbolsIndex != SymtabLC->NumSymbolTableEntries) -// return Error("NOT YET IMPLEMENTED: undefined symbol entries"); - } - - // Load the segment load command. - if (SegmentLCI->Command.Type == macho::LCT_Segment) { - if (loadSegment32(Obj.get(), SegmentLCI, SymtabLC)) - return true; - } else { - if (loadSegment64(Obj.get(), SegmentLCI, SymtabLC)) - return true; - } - - // Assign the addresses of the sections from the object so that any - // relocations to them get set properly. - // FIXME: This is done directly from the client at the moment. We should - // default the values to the local storage, at least when the target arch - // is the same as the host arch. - - return false; + if (Arch == Triple::arm && RelType == macho::RIT_ARM_Branch24Bit) { + // This is an ARM branch relocation, need to use a stub function. + + // Look up for existing stub. + StubMap::const_iterator stubIt = Stubs.find(Value); + if (stubIt != Stubs.end()) + resolveRelocation(Target, (uint64_t)Target, + (uint64_t)Section.Address + stubIt->second, + RelType, 0); + else { + // Create a new stub function. 
+ Stubs[Value] = Section.StubOffset; + uint8_t *StubTargetAddr = createStubFunction(Section.Address + + Section.StubOffset); + AddRelocation(Value, Rel.SectionID, StubTargetAddr - Section.Address, + macho::RIT_Vanilla); + resolveRelocation(Target, (uint64_t)Target, + (uint64_t)Section.Address + Section.StubOffset, + RelType, 0); + Section.StubOffset += getMaxStubSize(); + } + } else + AddRelocation(Value, Rel.SectionID, Rel.Offset, RelType); } -// Assign an address to a symbol name and resolve all the relocations -// associated with it. -void RuntimeDyldMachO::reassignSectionAddress(unsigned SectionID, - uint64_t Addr) { - // The address to use for relocation resolution is not - // the address of the local section buffer. We must be doing - // a remote execution environment of some sort. Re-apply any - // relocations referencing this section with the given address. - // - // Addr is a uint64_t because we can't assume the pointer width - // of the target is the same as that of the host. Just use a generic - // "big enough" type. - - SectionLoadAddress[SectionID] = Addr; - - RelocationList &Relocs = Relocations[SectionID]; - for (unsigned i = 0, e = Relocs.size(); i != e; ++i) { - RelocationEntry &RE = Relocs[i]; - uint8_t *Target = (uint8_t*)Sections[RE.SectionID].base() + RE.Offset; - bool isPCRel = (RE.Data >> 24) & 1; - unsigned Type = (RE.Data >> 28) & 0xf; - unsigned Size = 1 << ((RE.Data >> 25) & 3); - - DEBUG(dbgs() << "Resolving relocation at Section #" << RE.SectionID - << " + " << RE.Offset << " (" << format("%p", Target) << ")" - << " from Section #" << SectionID << " (" << format("%p", Addr) << ")" - << "(" << (isPCRel ? "pcrel" : "absolute") - << ", type: " << Type << ", Size: " << Size << ", Addend: " - << RE.Addend << ").\n"); - - resolveRelocation(Target, Addr, isPCRel, Type, Size, RE.Addend); - } -} -bool RuntimeDyldMachO::isKnownFormat(const MemoryBuffer *InputBuffer) { +bool RuntimeDyldMachO::isCompatibleFormat(const MemoryBuffer *InputBuffer) const { StringRef Magic = InputBuffer->getBuffer().slice(0, 4); if (Magic == "\xFE\xED\xFA\xCE") return true; if (Magic == "\xCE\xFA\xED\xFE") return true; diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h index 5798981..36b39dd 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h @@ -25,68 +25,37 @@ using namespace llvm::object; namespace llvm { class RuntimeDyldMachO : public RuntimeDyldImpl { - - // For each symbol, keep a list of relocations based on it. Anytime - // its address is reassigned (the JIT re-compiled the function, e.g.), - // the relocations get re-resolved. - // The symbol (or section) the relocation is sourced from is the Key - // in the relocation list where it's stored. - struct RelocationEntry { - unsigned SectionID; // Section the relocation is contained in. - uint64_t Offset; // Offset into the section for the relocation. - uint32_t Data; // Second word of the raw macho relocation entry. - int64_t Addend; // Addend encoded in the instruction itself, if any, - // plus the offset into the source section for - // the symbol once the relocation is resolvable. - - RelocationEntry(unsigned id, uint64_t offset, uint32_t data, int64_t addend) - : SectionID(id), Offset(offset), Data(data), Addend(addend) {} - }; - typedef SmallVector<RelocationEntry, 4> RelocationList; - // Relocations to sections already loaded. Indexed by SectionID which is the - // source of the address. 
The target where the address will be writen is - // SectionID/Offset in the relocation itself. - IndexedMap<RelocationList> Relocations; - // Relocations to symbols that are not yet resolved. Must be external - // relocations by definition. Indexed by symbol name. - StringMap<RelocationList> UnresolvedRelocations; - - bool resolveRelocation(uint8_t *Address, uint64_t Value, bool isPCRel, - unsigned Type, unsigned Size, int64_t Addend); - bool resolveX86_64Relocation(uintptr_t Address, uintptr_t Value, bool isPCRel, - unsigned Type, unsigned Size, int64_t Addend); - bool resolveARMRelocation(uintptr_t Address, uintptr_t Value, bool isPCRel, - unsigned Type, unsigned Size, int64_t Addend); - - bool loadSegment32(const MachOObject *Obj, - const MachOObject::LoadCommandInfo *SegmentLCI, - const InMemoryStruct<macho::SymtabLoadCommand> &SymtabLC); - bool loadSegment64(const MachOObject *Obj, - const MachOObject::LoadCommandInfo *SegmentLCI, - const InMemoryStruct<macho::SymtabLoadCommand> &SymtabLC); - bool processSymbols32(const MachOObject *Obj, - SmallVectorImpl<unsigned> &SectionMap, - SmallVectorImpl<StringRef> &SymbolNames, - const InMemoryStruct<macho::SymtabLoadCommand> &SymtabLC); - bool processSymbols64(const MachOObject *Obj, - SmallVectorImpl<unsigned> &SectionMap, - SmallVectorImpl<StringRef> &SymbolNames, - const InMemoryStruct<macho::SymtabLoadCommand> &SymtabLC); - - void resolveSymbol(StringRef Name); +protected: + bool resolveX86_64Relocation(uint8_t *LocalAddress, + uint64_t FinalAddress, + uint64_t Value, + bool isPCRel, + unsigned Type, + unsigned Size, + int64_t Addend); + bool resolveARMRelocation(uint8_t *LocalAddress, + uint64_t FinalAddress, + uint64_t Value, + bool isPCRel, + unsigned Type, + unsigned Size, + int64_t Addend); + + virtual void processRelocationRef(const ObjRelocationInfo &Rel, + const ObjectFile &Obj, + ObjSectionToIDMap &ObjSectionToID, + LocalSymbolMap &Symbols, StubMap &Stubs); public: + virtual void resolveRelocation(uint8_t *LocalAddress, + uint64_t FinalAddress, + uint64_t Value, + uint32_t Type, + int64_t Addend); + RuntimeDyldMachO(RTDyldMemoryManager *mm) : RuntimeDyldImpl(mm) {} - bool loadObject(MemoryBuffer *InputBuffer); - - void reassignSectionAddress(unsigned SectionID, uint64_t Addr); - - static bool isKnownFormat(const MemoryBuffer *InputBuffer); - - bool isCompatibleFormat(const MemoryBuffer *InputBuffer) const { - return isKnownFormat(InputBuffer); - } + bool isCompatibleFormat(const MemoryBuffer *InputBuffer) const; }; } // end namespace llvm diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp index bd5956f..ab5ddaf 100644 --- a/lib/MC/MCParser/AsmParser.cpp +++ b/lib/MC/MCParser/AsmParser.cpp @@ -1528,6 +1528,11 @@ bool AsmParser::HandleMacroEntry(StringRef Name, SMLoc NameLoc, } Lex(); } + // If there weren't any arguments, erase the token vector so everything + // else knows that. Leaving around the vestigal empty token list confuses + // things. + if (MacroArguments.size() == 1 && MacroArguments.back().empty()) + MacroArguments.clear(); // Macro instantiation is lexical, unfortunately. We construct a new buffer // to hold the macro body with substitutions. @@ -1624,6 +1629,8 @@ bool AsmParser::ParseAssignment(StringRef Name, bool allow_redef) { return Error(EqualLoc, "Recursive use of '" + Name + "'"); else if (Sym->isUndefined() && !Sym->isUsed() && !Sym->isVariable()) ; // Allow redefinitions of undefined symbols only used in directives. 
+ else if (Sym->isVariable() && !Sym->isUsed() && allow_redef) + ; // Allow redefinitions of variables that haven't yet been used. else if (!Sym->isUndefined() && (!Sym->isVariable() || !allow_redef)) return Error(EqualLoc, "redefinition of '" + Name + "'"); else if (!Sym->isVariable()) diff --git a/lib/MC/MCSymbol.cpp b/lib/MC/MCSymbol.cpp index c2fad167..e013e77 100644 --- a/lib/MC/MCSymbol.cpp +++ b/lib/MC/MCSymbol.cpp @@ -54,17 +54,14 @@ const MCSymbol &MCSymbol::AliasedSymbol() const { void MCSymbol::setVariableValue(const MCExpr *Value) { assert(!IsUsed && "Cannot set a variable that has already been used."); assert(Value && "Invalid variable value!"); - assert((isUndefined() || (isAbsolute() && isa<MCConstantExpr>(Value))) && - "Invalid redefinition!"); this->Value = Value; // Variables should always be marked as in the same "section" as the value. const MCSection *Section = Value->FindAssociatedSection(); - if (Section) { + if (Section) setSection(*Section); - } else { + else setUndefined(); - } } void MCSymbol::print(raw_ostream &OS) const { diff --git a/lib/MC/WinCOFFObjectWriter.cpp b/lib/MC/WinCOFFObjectWriter.cpp index 7144e68..f706cac 100644 --- a/lib/MC/WinCOFFObjectWriter.cpp +++ b/lib/MC/WinCOFFObjectWriter.cpp @@ -783,9 +783,22 @@ void WinCOFFObjectWriter::WriteObject(MCAssembler &Asm, } if (Sec->Relocations.size() > 0) { - Sec->Header.NumberOfRelocations = Sec->Relocations.size(); + bool RelocationsOverflow = Sec->Relocations.size() >= 0xffff; + + if (RelocationsOverflow) { + // Signal overflow by setting NumberOfSections to max value. Actual + // size is found in reloc #0. Microsoft tools understand this. + Sec->Header.NumberOfRelocations = 0xffff; + } else { + Sec->Header.NumberOfRelocations = Sec->Relocations.size(); + } Sec->Header.PointerToRelocations = offset; + if (RelocationsOverflow) { + // Reloc #0 will contain actual count, so make room for it. + offset += COFF::RelocationSize; + } + offset += COFF::RelocationSize * Sec->Relocations.size(); for (relocations::iterator cr = Sec->Relocations.begin(), @@ -820,8 +833,12 @@ void WinCOFFObjectWriter::WriteObject(MCAssembler &Asm, MCAssembler::const_iterator j, je; for (i = Sections.begin(), ie = Sections.end(); i != ie; i++) - if ((*i)->Number != -1) + if ((*i)->Number != -1) { + if ((*i)->Relocations.size() >= 0xffff) { + (*i)->Header.Characteristics |= COFF::IMAGE_SCN_LNK_NRELOC_OVFL; + } WriteSectionHeader((*i)->Header); + } for (i = Sections.begin(), ie = Sections.end(), j = Asm.begin(), je = Asm.end(); @@ -841,6 +858,16 @@ void WinCOFFObjectWriter::WriteObject(MCAssembler &Asm, assert(OS.tell() == (*i)->Header.PointerToRelocations && "Section::PointerToRelocations is insane!"); + if ((*i)->Relocations.size() >= 0xffff) { + // In case of overflow, write actual relocation count as first + // relocation. Including the synthetic reloc itself (+ 1). + COFF::relocation r; + r.VirtualAddress = (*i)->Relocations.size() + 1; + r.SymbolTableIndex = 0; + r.Type = 0; + WriteRelocation(r); + } + for (relocations::const_iterator k = (*i)->Relocations.begin(), ke = (*i)->Relocations.end(); k != ke; k++) { diff --git a/lib/Object/Archive.cpp b/lib/Object/Archive.cpp index b67377c..c5f15ba 100644 --- a/lib/Object/Archive.cpp +++ b/lib/Object/Archive.cpp @@ -174,7 +174,7 @@ error_code Archive::Child::getAsBinary(OwningPtr<Binary> &Result) const { } Archive::Archive(MemoryBuffer *source, error_code &ec) - : Binary(Binary::isArchive, source) { + : Binary(Binary::ID_Archive, source) { // Check for sufficient magic. 
if (!source || source->getBufferSize() < (8 + sizeof(ArchiveMemberHeader) + 2) // Smallest archive. diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp index a3fdd5b..b8ba905 100644 --- a/lib/Object/COFFObjectFile.cpp +++ b/lib/Object/COFFObjectFile.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Object/COFF.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Triple.h" @@ -300,24 +301,7 @@ error_code COFFObjectFile::getSectionNext(DataRefImpl Sec, error_code COFFObjectFile::getSectionName(DataRefImpl Sec, StringRef &Result) const { const coff_section *sec = toSec(Sec); - StringRef name; - if (sec->Name[7] == 0) - // Null terminated, let ::strlen figure out the length. - name = sec->Name; - else - // Not null terminated, use all 8 bytes. - name = StringRef(sec->Name, 8); - - // Check for string table entry. First byte is '/'. - if (name[0] == '/') { - uint32_t Offset; - name.substr(1).getAsInteger(10, Offset); - if (error_code ec = getString(Offset, name)) - return ec; - } - - Result = name; - return object_error::success; + return getSectionName(sec, Result); } error_code COFFObjectFile::getSectionAddress(DataRefImpl Sec, @@ -337,16 +321,10 @@ error_code COFFObjectFile::getSectionSize(DataRefImpl Sec, error_code COFFObjectFile::getSectionContents(DataRefImpl Sec, StringRef &Result) const { const coff_section *sec = toSec(Sec); - // The only thing that we need to verify is that the contents is contained - // within the file bounds. We don't need to make sure it doesn't cover other - // data, as there's nothing that says that is not allowed. - uintptr_t con_start = uintptr_t(base()) + sec->PointerToRawData; - uintptr_t con_end = con_start + sec->SizeOfRawData; - if (con_end > uintptr_t(Data->getBufferEnd())) - return object_error::parse_failed; - Result = StringRef(reinterpret_cast<const char*>(con_start), - sec->SizeOfRawData); - return object_error::success; + ArrayRef<uint8_t> Res; + error_code EC = getSectionContents(sec, Res); + Result = StringRef(reinterpret_cast<const char*>(Res.data()), Res.size()); + return EC; } error_code COFFObjectFile::getSectionAlignment(DataRefImpl Sec, @@ -421,7 +399,7 @@ relocation_iterator COFFObjectFile::getSectionRelEnd(DataRefImpl Sec) const { } COFFObjectFile::COFFObjectFile(MemoryBuffer *Object, error_code &ec) - : ObjectFile(Binary::isCOFF, Object, ec) + : ObjectFile(Binary::ID_COFF, Object, ec) , Header(0) , SectionTable(0) , SymbolTable(0) @@ -630,6 +608,43 @@ error_code COFFObjectFile::getSymbolName(const coff_symbol *symbol, return object_error::success; } +error_code COFFObjectFile::getSectionName(const coff_section *Sec, + StringRef &Res) const { + StringRef Name; + if (Sec->Name[7] == 0) + // Null terminated, let ::strlen figure out the length. + Name = Sec->Name; + else + // Not null terminated, use all 8 bytes. + Name = StringRef(Sec->Name, 8); + + // Check for string table entry. First byte is '/'. + if (Name[0] == '/') { + uint32_t Offset; + if (Name.substr(1).getAsInteger(10, Offset)) + return object_error::parse_failed; + if (error_code ec = getString(Offset, Name)) + return ec; + } + + Res = Name; + return object_error::success; +} + +error_code COFFObjectFile::getSectionContents(const coff_section *Sec, + ArrayRef<uint8_t> &Res) const { + // The only thing that we need to verify is that the contents is contained + // within the file bounds. 
We don't need to make sure it doesn't cover other + // data, as there's nothing that says that is not allowed. + uintptr_t ConStart = uintptr_t(base()) + Sec->PointerToRawData; + uintptr_t ConEnd = ConStart + Sec->SizeOfRawData; + if (ConEnd > uintptr_t(Data->getBufferEnd())) + return object_error::parse_failed; + Res = ArrayRef<uint8_t>(reinterpret_cast<const unsigned char*>(ConStart), + Sec->SizeOfRawData); + return object_error::success; +} + const coff_relocation *COFFObjectFile::toRel(DataRefImpl Rel) const { return reinterpret_cast<const coff_relocation*>(Rel.p); } diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp index 655c40a..819409e 100644 --- a/lib/Object/MachOObjectFile.cpp +++ b/lib/Object/MachOObjectFile.cpp @@ -30,7 +30,7 @@ namespace object { MachOObjectFile::MachOObjectFile(MemoryBuffer *Object, MachOObject *MOO, error_code &ec) - : ObjectFile(Binary::isMachO, Object, ec), + : ObjectFile(Binary::ID_MachO, Object, ec), MachOObj(MOO), RegisteredStringTable(std::numeric_limits<uint32_t>::max()) { DataRefImpl DRI; diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp index 031bbb8..9b81fe7 100644 --- a/lib/Support/APInt.cpp +++ b/lib/Support/APInt.cpp @@ -457,16 +457,6 @@ APInt APInt::XorSlowCase(const APInt& RHS) const { return APInt(val, getBitWidth()).clearUnusedBits(); } -bool APInt::operator !() const { - if (isSingleWord()) - return !VAL; - - for (unsigned i = 0; i < getNumWords(); ++i) - if (pVal[i]) - return false; - return true; -} - APInt APInt::operator*(const APInt& RHS) const { assert(BitWidth == RHS.BitWidth && "Bit widths must be the same"); if (isSingleWord()) @@ -494,12 +484,6 @@ APInt APInt::operator-(const APInt& RHS) const { return Result.clearUnusedBits(); } -bool APInt::operator[](unsigned bitPosition) const { - assert(bitPosition < getBitWidth() && "Bit position out of bounds!"); - return (maskBit(bitPosition) & - (isSingleWord() ? VAL : pVal[whichWord(bitPosition)])) != 0; -} - bool APInt::EqualSlowCase(const APInt& RHS) const { // Get some facts about the number of bits used in the two operands. 
unsigned n1 = getActiveBits(); @@ -722,20 +706,9 @@ unsigned APInt::countLeadingZerosSlowCase() const { return Count; } -static unsigned countLeadingOnes_64(uint64_t V, unsigned skip) { - unsigned Count = 0; - if (skip) - V <<= skip; - while (V && (V & (1ULL << 63))) { - Count++; - V <<= 1; - } - return Count; -} - unsigned APInt::countLeadingOnes() const { if (isSingleWord()) - return countLeadingOnes_64(VAL, APINT_BITS_PER_WORD - BitWidth); + return CountLeadingOnes_64(VAL << (APINT_BITS_PER_WORD - BitWidth)); unsigned highWordBits = BitWidth % APINT_BITS_PER_WORD; unsigned shift; @@ -746,13 +719,13 @@ unsigned APInt::countLeadingOnes() const { shift = APINT_BITS_PER_WORD - highWordBits; } int i = getNumWords() - 1; - unsigned Count = countLeadingOnes_64(pVal[i], shift); + unsigned Count = CountLeadingOnes_64(pVal[i] << shift); if (Count == highWordBits) { for (i--; i >= 0; --i) { if (pVal[i] == -1ULL) Count += APINT_BITS_PER_WORD; else { - Count += countLeadingOnes_64(pVal[i], 0); + Count += CountLeadingOnes_64(pVal[i]); break; } } diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp index d1ec4b0..e6fdf16 100644 --- a/lib/Support/CommandLine.cpp +++ b/lib/Support/CommandLine.cpp @@ -1191,7 +1191,7 @@ printOptionNoValue(const Option &O, size_t GlobalWidth) const { static int OptNameCompare(const void *LHS, const void *RHS) { typedef std::pair<const char *, Option*> pair_ty; - return strcmp(((pair_ty*)LHS)->first, ((pair_ty*)RHS)->first); + return strcmp(((const pair_ty*)LHS)->first, ((const pair_ty*)RHS)->first); } // Copy Options into a vector so we can sort them as we like. diff --git a/lib/Support/GraphWriter.cpp b/lib/Support/GraphWriter.cpp index 0dba28a..32126ec 100644 --- a/lib/Support/GraphWriter.cpp +++ b/lib/Support/GraphWriter.cpp @@ -11,12 +11,16 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Support/CommandLine.h" #include "llvm/Support/GraphWriter.h" #include "llvm/Support/Path.h" #include "llvm/Support/Program.h" #include "llvm/Config/config.h" using namespace llvm; +static cl::opt<bool> ViewBackground("view-background", cl::Hidden, + cl::desc("Execute graph viewer in the background. Creates tmp file litter.")); + std::string llvm::DOT::EscapeString(const std::string &Label) { std::string Str(Label); for (unsigned i = 0; i != Str.length(); ++i) @@ -49,10 +53,28 @@ std::string llvm::DOT::EscapeString(const std::string &Label) { return Str; } - +// Execute the graph viewer. Return true if successful. +static bool LLVM_ATTRIBUTE_UNUSED +ExecGraphViewer(const sys::Path &ExecPath, std::vector<const char*> &args, + const sys::Path &Filename, bool wait, std::string &ErrMsg) { + if (wait) { + if (sys::Program::ExecuteAndWait(ExecPath, &args[0],0,0,0,0,&ErrMsg)) { + errs() << "Error: " << ErrMsg << "\n"; + return false; + } + Filename.eraseFromDisk(); + errs() << " done. \n"; + } + else { + sys::Program::ExecuteNoWait(ExecPath, &args[0],0,0,0,&ErrMsg); + errs() << "Remember to erase graph file: " << Filename.str() << "\n"; + } + return true; +} void llvm::DisplayGraph(const sys::Path &Filename, bool wait, GraphProgram::Name program) { + wait &= !ViewBackground; std::string ErrMsg; #if HAVE_GRAPHVIZ sys::Path Graphviz(LLVM_PATH_GRAPHVIZ); @@ -61,14 +83,10 @@ void llvm::DisplayGraph(const sys::Path &Filename, bool wait, args.push_back(Graphviz.c_str()); args.push_back(Filename.c_str()); args.push_back(0); - + errs() << "Running 'Graphviz' program... 
"; - if (sys::Program::ExecuteAndWait(Graphviz, &args[0],0,0,0,0,&ErrMsg)) { - errs() << "Error: " << ErrMsg << "\n"; + if (!ExecGraphViewer(Graphviz, args, Filename, wait, ErrMsg)) return; - } - Filename.eraseFromDisk(); - errs() << " done. \n"; #elif HAVE_XDOT_PY std::vector<const char*> args; @@ -83,17 +101,12 @@ void llvm::DisplayGraph(const sys::Path &Filename, bool wait, case GraphProgram::CIRCO: args.push_back("-f"); args.push_back("circo");break; default: errs() << "Unknown graph layout name; using default.\n"; } - + args.push_back(0); errs() << "Running 'xdot.py' program... "; - if (sys::Program::ExecuteAndWait(sys::Path(LLVM_PATH_XDOT_PY), - &args[0],0,0,0,0,&ErrMsg)) { - errs() << "Error: " << ErrMsg << "\n"; + if (!ExecGraphViewer(sys::Path(LLVM_PATH_XDOT_PY), args, Filename, wait, ErrMsg)) return; - } - Filename.eraseFromDisk(); - errs() << " done. \n"; #elif (HAVE_GV && (HAVE_DOT || HAVE_FDP || HAVE_NEATO || \ HAVE_TWOPI || HAVE_CIRCO)) @@ -150,14 +163,11 @@ void llvm::DisplayGraph(const sys::Path &Filename, bool wait, args.push_back("-o"); args.push_back(PSFilename.c_str()); args.push_back(0); - + errs() << "Running '" << prog.str() << "' program... "; - if (sys::Program::ExecuteAndWait(prog, &args[0], 0, 0, 0, 0, &ErrMsg)) { - errs() << "Error: " << ErrMsg << "\n"; + if (!ExecGraphViewer(prog, args, Filename, wait, ErrMsg)) return; - } - errs() << " done. \n"; sys::Path gv(LLVM_PATH_GV); args.clear(); @@ -165,19 +175,11 @@ void llvm::DisplayGraph(const sys::Path &Filename, bool wait, args.push_back(PSFilename.c_str()); args.push_back("--spartan"); args.push_back(0); - + ErrMsg.clear(); - if (wait) { - if (sys::Program::ExecuteAndWait(gv, &args[0],0,0,0,0,&ErrMsg)) - errs() << "Error: " << ErrMsg << "\n"; - Filename.eraseFromDisk(); - PSFilename.eraseFromDisk(); - } - else { - sys::Program::ExecuteNoWait(gv, &args[0],0,0,0,&ErrMsg); - errs() << "Remember to erase graph files: " << Filename.str() << " " - << PSFilename.str() << "\n"; - } + if (!ExecGraphViewer(gv, args, PSFilename, wait, ErrMsg)) + return; + #elif HAVE_DOTTY sys::Path dotty(LLVM_PATH_DOTTY); @@ -185,16 +187,13 @@ void llvm::DisplayGraph(const sys::Path &Filename, bool wait, args.push_back(dotty.c_str()); args.push_back(Filename.c_str()); args.push_back(0); - - errs() << "Running 'dotty' program... "; - if (sys::Program::ExecuteAndWait(dotty, &args[0],0,0,0,0,&ErrMsg)) { - errs() << "Error: " << ErrMsg << "\n"; - } else { + // Dotty spawns another app and doesn't wait until it returns #if defined (__MINGW32__) || defined (_WINDOWS) - return; + wait = false; #endif - Filename.eraseFromDisk(); - } + errs() << "Running 'dotty' program... "; + if (!ExecGraphViewer(dotty, args, Filename, wait, ErrMsg)) + return; #endif } diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp index 4b15587..911a03f 100644 --- a/lib/Support/MemoryBuffer.cpp +++ b/lib/Support/MemoryBuffer.cpp @@ -336,7 +336,11 @@ error_code MemoryBuffer::getOpenFile(int FD, const char *Filename, // Error while reading. 
return error_code(errno, posix_category()); } - assert(NumRead != 0 && "fstat reported an invalid file size."); + if (NumRead == 0) { + assert(0 && "We got inaccurate FileSize value or fstat reported an " + "invalid file size."); + break; + } BytesLeft -= NumRead; BufPtr += NumRead; } diff --git a/lib/Support/SmallPtrSet.cpp b/lib/Support/SmallPtrSet.cpp index 997ce0b..68d9c29 100644 --- a/lib/Support/SmallPtrSet.cpp +++ b/lib/Support/SmallPtrSet.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Support/MathExtras.h" +#include <algorithm> #include <cstdlib> using namespace llvm; @@ -223,6 +224,56 @@ void SmallPtrSetImpl::CopyFrom(const SmallPtrSetImpl &RHS) { NumTombstones = RHS.NumTombstones; } +void SmallPtrSetImpl::swap(SmallPtrSetImpl &RHS) { + if (this == &RHS) return; + + // We can only avoid copying elements if neither set is small. + if (!this->isSmall() && !RHS.isSmall()) { + std::swap(this->CurArray, RHS.CurArray); + std::swap(this->CurArraySize, RHS.CurArraySize); + std::swap(this->NumElements, RHS.NumElements); + std::swap(this->NumTombstones, RHS.NumTombstones); + return; + } + + // FIXME: From here on we assume that both sets have the same small size. + + // If only RHS is small, copy the small elements into LHS and move the pointer + // from LHS to RHS. + if (!this->isSmall() && RHS.isSmall()) { + std::copy(RHS.SmallArray, RHS.SmallArray+RHS.CurArraySize, + this->SmallArray); + std::swap(this->NumElements, RHS.NumElements); + std::swap(this->CurArraySize, RHS.CurArraySize); + RHS.CurArray = this->CurArray; + RHS.NumTombstones = this->NumTombstones; + this->CurArray = this->SmallArray; + this->NumTombstones = 0; + return; + } + + // If only LHS is small, copy the small elements into RHS and move the pointer + // from RHS to LHS. + if (this->isSmall() && !RHS.isSmall()) { + std::copy(this->SmallArray, this->SmallArray+this->CurArraySize, + RHS.SmallArray); + std::swap(RHS.NumElements, this->NumElements); + std::swap(RHS.CurArraySize, this->CurArraySize); + this->CurArray = RHS.CurArray; + this->NumTombstones = RHS.NumTombstones; + RHS.CurArray = RHS.SmallArray; + RHS.NumTombstones = 0; + return; + } + + // Both a small, just swap the small elements. + assert(this->isSmall() && RHS.isSmall()); + assert(this->CurArraySize == RHS.CurArraySize); + std::swap_ranges(this->SmallArray, this->SmallArray+this->CurArraySize, + RHS.SmallArray); + std::swap(this->NumElements, RHS.NumElements); +} + SmallPtrSetImpl::~SmallPtrSetImpl() { if (!isSmall()) free(CurArray); diff --git a/lib/Support/StringRef.cpp b/lib/Support/StringRef.cpp index 1c28bf8..abe570f 100644 --- a/lib/Support/StringRef.cpp +++ b/lib/Support/StringRef.cpp @@ -285,8 +285,8 @@ static unsigned GetAutoSenseRadix(StringRef &Str) { /// GetAsUnsignedInteger - Workhorse method that converts a integer character /// sequence of radix up to 36 to an unsigned long long value. -static bool GetAsUnsignedInteger(StringRef Str, unsigned Radix, - unsigned long long &Result) { +bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix, + unsigned long long &Result) { // Autosense radix if not specified. 
if (Radix == 0) Radix = GetAutoSenseRadix(Str); @@ -326,17 +326,13 @@ static bool GetAsUnsignedInteger(StringRef Str, unsigned Radix, return false; } -bool StringRef::getAsInteger(unsigned Radix, unsigned long long &Result) const { - return GetAsUnsignedInteger(*this, Radix, Result); -} - - -bool StringRef::getAsInteger(unsigned Radix, long long &Result) const { +bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix, + long long &Result) { unsigned long long ULLVal; // Handle positive strings first. - if (empty() || front() != '-') { - if (GetAsUnsignedInteger(*this, Radix, ULLVal) || + if (Str.empty() || Str.front() != '-') { + if (getAsUnsignedInteger(Str, Radix, ULLVal) || // Check for value so large it overflows a signed value. (long long)ULLVal < 0) return true; @@ -345,7 +341,7 @@ bool StringRef::getAsInteger(unsigned Radix, long long &Result) const { } // Get the positive part of the value. - if (GetAsUnsignedInteger(substr(1), Radix, ULLVal) || + if (getAsUnsignedInteger(Str.substr(1), Radix, ULLVal) || // Reject values so large they'd overflow as negative signed, but allow // "-0". This negates the unsigned so that the negative isn't undefined // on signed overflow. @@ -356,24 +352,6 @@ bool StringRef::getAsInteger(unsigned Radix, long long &Result) const { return false; } -bool StringRef::getAsInteger(unsigned Radix, int &Result) const { - long long Val; - if (getAsInteger(Radix, Val) || - (int)Val != Val) - return true; - Result = Val; - return false; -} - -bool StringRef::getAsInteger(unsigned Radix, unsigned &Result) const { - unsigned long long Val; - if (getAsInteger(Radix, Val) || - (unsigned)Val != Val) - return true; - Result = Val; - return false; -} - bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const { StringRef Str = *this; diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp index 94333a3..d261c53 100644 --- a/lib/Support/Triple.cpp +++ b/lib/Support/Triple.cpp @@ -29,6 +29,7 @@ const char *Triple::getArchTypeName(ArchType Kind) { case msp430: return "msp430"; case ppc64: return "powerpc64"; case ppc: return "powerpc"; + case r600: return "r600"; case sparc: return "sparc"; case sparcv9: return "sparcv9"; case tce: return "tce"; @@ -63,6 +64,8 @@ const char *Triple::getArchTypePrefix(ArchType Kind) { case hexagon: return "hexagon"; + case r600: return "r600"; + case sparcv9: case sparc: return "sparc"; @@ -145,6 +148,7 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) { .Case("ppc32", ppc) .Case("ppc", ppc) .Case("mblaze", mblaze) + .Case("r600", r600) .Case("hexagon", hexagon) .Case("sparc", sparc) .Case("sparcv9", sparcv9) @@ -184,6 +188,7 @@ Triple::ArchType Triple::getArchTypeForDarwinArchName(StringRef Str) { // This is derived from the driver driver. 
.Cases("arm", "armv4t", "armv5", "armv6", Triple::arm) .Cases("armv7", "armv7f", "armv7k", "armv7s", "xscale", Triple::arm) + .Case("r600", Triple::r600) .Case("ptx32", Triple::ptx32) .Case("ptx64", Triple::ptx64) .Case("amdil", Triple::amdil) @@ -206,6 +211,7 @@ const char *Triple::getArchNameForAssembler() { .Cases("armv5", "armv5e", "thumbv5", "thumbv5e", "armv5") .Cases("armv6", "thumbv6", "armv6") .Cases("armv7", "thumbv7", "armv7") + .Case("r600", "r600") .Case("ptx32", "ptx32") .Case("ptx64", "ptx64") .Case("le32", "le32") @@ -234,6 +240,7 @@ static Triple::ArchType parseArch(StringRef ArchName) { .Cases("mipsel", "mipsallegrexel", Triple::mipsel) .Cases("mips64", "mips64eb", Triple::mips64) .Case("mips64el", Triple::mips64el) + .Case("r600", Triple::r600) .Case("hexagon", Triple::hexagon) .Case("sparc", Triple::sparc) .Case("sparcv9", Triple::sparcv9) @@ -641,6 +648,7 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) { case llvm::Triple::mipsel: case llvm::Triple::ppc: case llvm::Triple::ptx32: + case llvm::Triple::r600: case llvm::Triple::sparc: case llvm::Triple::tce: case llvm::Triple::thumb: @@ -689,6 +697,7 @@ Triple Triple::get32BitArchVariant() const { case Triple::mipsel: case Triple::ppc: case Triple::ptx32: + case Triple::r600: case Triple::sparc: case Triple::tce: case Triple::thumb: @@ -718,6 +727,7 @@ Triple Triple::get64BitArchVariant() const { case Triple::le32: case Triple::mblaze: case Triple::msp430: + case Triple::r600: case Triple::tce: case Triple::thumb: case Triple::xcore: diff --git a/lib/TableGen/Record.cpp b/lib/TableGen/Record.cpp index 12d1b1a..93eed24 100644 --- a/lib/TableGen/Record.cpp +++ b/lib/TableGen/Record.cpp @@ -1726,12 +1726,6 @@ void Record::setName(Init *NewName) { } // Otherwise this isn't yet registered. Name = NewName; checkName(); - // Since the Init for the name was changed, see if we can resolve - // any of it using members of the Record. - Init *ComputedName = Name->resolveReferences(*this, 0); - if (ComputedName != Name) { - setName(ComputedName); - } // DO NOT resolve record values to the name at this point because // there might be default values for arguments of this def. Those // arguments might not have been resolved yet so we don't want to @@ -1754,6 +1748,8 @@ void Record::setName(const std::string &Name) { /// references. 
void Record::resolveReferencesTo(const RecordVal *RV) { for (unsigned i = 0, e = Values.size(); i != e; ++i) { + if (RV == &Values[i]) // Skip resolve the same field as the given one + continue; if (Init *V = Values[i].getValue()) Values[i].setValue(V->resolveReferences(*this, RV)); } diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h index acb57f7..2a1e8e4 100644 --- a/lib/Target/ARM/ARM.h +++ b/lib/Target/ARM/ARM.h @@ -18,9 +18,7 @@ #include "MCTargetDesc/ARMBaseInfo.h" #include "MCTargetDesc/ARMMCTargetDesc.h" #include "llvm/Support/DataTypes.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetMachine.h" -#include <cassert> namespace llvm { diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp index 4ec19cc..ca30716 100644 --- a/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/ARMAsmPrinter.cpp @@ -13,8 +13,8 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "asm-printer" -#include "ARM.h" #include "ARMAsmPrinter.h" +#include "ARM.h" #include "ARMBuildAttrs.h" #include "ARMBaseRegisterInfo.h" #include "ARMConstantPoolValue.h" diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 75b796e..366e2fa 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -935,6 +935,8 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI); + if (TargetRegisterInfo::isPhysicalRegister(DestReg)) + MIB.addReg(DestReg, RegState::ImplicitDefine); } } else llvm_unreachable("Unknown reg class!"); @@ -953,6 +955,8 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::DefineNoRead, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::DefineNoRead, TRI); MIB = AddDReg(MIB, DestReg, ARM::dsub_7, RegState::DefineNoRead, TRI); + if (TargetRegisterInfo::isPhysicalRegister(DestReg)) + MIB.addReg(DestReg, RegState::ImplicitDefine); } else llvm_unreachable("Unknown reg class!"); break; @@ -2756,24 +2760,24 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, case ARM::VLD4q8oddPseudo_UPD: case ARM::VLD4q16oddPseudo_UPD: case ARM::VLD4q32oddPseudo_UPD: - case ARM::VLD1DUPq8Pseudo: - case ARM::VLD1DUPq16Pseudo: - case ARM::VLD1DUPq32Pseudo: - case ARM::VLD1DUPq8PseudoWB_fixed: - case ARM::VLD1DUPq16PseudoWB_fixed: - case ARM::VLD1DUPq32PseudoWB_fixed: - case ARM::VLD1DUPq8PseudoWB_register: - case ARM::VLD1DUPq16PseudoWB_register: - case ARM::VLD1DUPq32PseudoWB_register: - case ARM::VLD2DUPd8Pseudo: - case ARM::VLD2DUPd16Pseudo: - case ARM::VLD2DUPd32Pseudo: - case ARM::VLD2DUPd8PseudoWB_fixed: - case ARM::VLD2DUPd16PseudoWB_fixed: - case ARM::VLD2DUPd32PseudoWB_fixed: - case ARM::VLD2DUPd8PseudoWB_register: - case ARM::VLD2DUPd16PseudoWB_register: - case ARM::VLD2DUPd32PseudoWB_register: + case ARM::VLD1DUPq8: + case ARM::VLD1DUPq16: + case ARM::VLD1DUPq32: + case ARM::VLD1DUPq8wb_fixed: + case ARM::VLD1DUPq16wb_fixed: + case ARM::VLD1DUPq32wb_fixed: + case ARM::VLD1DUPq8wb_register: + case ARM::VLD1DUPq16wb_register: + case ARM::VLD1DUPq32wb_register: + case ARM::VLD2DUPd8: + case ARM::VLD2DUPd16: + case ARM::VLD2DUPd32: + case ARM::VLD2DUPd8wb_fixed: + case ARM::VLD2DUPd16wb_fixed: + case 
ARM::VLD2DUPd32wb_fixed: + case ARM::VLD2DUPd8wb_register: + case ARM::VLD2DUPd16wb_register: + case ARM::VLD2DUPd32wb_register: case ARM::VLD4DUPd8Pseudo: case ARM::VLD4DUPd16Pseudo: case ARM::VLD4DUPd32Pseudo: diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index d2aff9a..291369f 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -11,9 +11,9 @@ // //===----------------------------------------------------------------------===// +#include "ARMBaseRegisterInfo.h" #include "ARM.h" #include "ARMBaseInstrInfo.h" -#include "ARMBaseRegisterInfo.h" #include "ARMFrameLowering.h" #include "ARMInstrInfo.h" #include "ARMMachineFunctionInfo.h" @@ -79,6 +79,7 @@ getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(ARM::SP); Reserved.set(ARM::PC); + Reserved.set(ARM::FPSCR); if (TFI->hasFP(MF)) Reserved.set(FramePtr); if (hasBasePointer(MF)) @@ -492,8 +493,7 @@ bool ARMBaseRegisterInfo::hasBasePointer(const MachineFunction &MF) const { // When outgoing call frames are so large that we adjust the stack pointer // around the call, we can no longer use the stack pointer to reach the // emergency spill slot. - if (needsStackRealignment(MF) && (MFI->hasVarSizedObjects() || - !TFI->hasReservedCallFrame(MF))) + if (needsStackRealignment(MF) && !TFI->hasReservedCallFrame(MF)) return true; // Thumb has trouble with negative offsets from the FP. Thumb2 has a limited @@ -517,7 +517,6 @@ bool ARMBaseRegisterInfo::hasBasePointer(const MachineFunction &MF) const { } bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); // We can't realign the stack if: @@ -532,8 +531,9 @@ bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const { // register allocation with frame pointer elimination, it is too late now. if (!MRI->canReserveReg(FramePtr)) return false; - // We may also need a base pointer if there are dynamic allocas. - if (!MFI->hasVarSizedObjects()) + // We may also need a base pointer if there are dynamic allocas or stack + // pointer adjustments around calls. + if (MF.getTarget().getFrameLowering()->hasReservedCallFrame(MF)) return true; if (!EnableBasePointer) return false; diff --git a/lib/Target/ARM/ARMCallingConv.h b/lib/Target/ARM/ARMCallingConv.h index 437b4c7..2b9c55d 100644 --- a/lib/Target/ARM/ARMCallingConv.h +++ b/lib/Target/ARM/ARMCallingConv.h @@ -15,13 +15,13 @@ #ifndef ARMCALLINGCONV_H #define ARMCALLINGCONV_H -#include "llvm/CallingConv.h" -#include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/Target/TargetInstrInfo.h" +#include "ARM.h" #include "ARMBaseInstrInfo.h" #include "ARMRegisterInfo.h" #include "ARMSubtarget.h" -#include "ARM.h" +#include "llvm/CallingConv.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/Target/TargetInstrInfo.h" namespace llvm { @@ -29,7 +29,7 @@ namespace llvm { static bool f64AssignAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, CCState &State, bool CanFail) { - static const unsigned RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; + static const uint16_t RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; // Try to get the first register. 
if (unsigned Reg = State.AllocateReg(RegList, 4)) @@ -72,9 +72,9 @@ static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, static bool f64AssignAAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, CCState &State, bool CanFail) { - static const unsigned HiRegList[] = { ARM::R0, ARM::R2 }; - static const unsigned LoRegList[] = { ARM::R1, ARM::R3 }; - static const unsigned ShadowRegList[] = { ARM::R0, ARM::R1 }; + static const uint16_t HiRegList[] = { ARM::R0, ARM::R2 }; + static const uint16_t LoRegList[] = { ARM::R1, ARM::R3 }; + static const uint16_t ShadowRegList[] = { ARM::R0, ARM::R1 }; unsigned Reg = State.AllocateReg(HiRegList, ShadowRegList, 2); if (Reg == 0) { @@ -118,8 +118,8 @@ static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, static bool f64RetAssign(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, CCState &State) { - static const unsigned HiRegList[] = { ARM::R0, ARM::R2 }; - static const unsigned LoRegList[] = { ARM::R1, ARM::R3 }; + static const uint16_t HiRegList[] = { ARM::R0, ARM::R2 }; + static const uint16_t LoRegList[] = { ARM::R1, ARM::R3 }; unsigned Reg = State.AllocateReg(HiRegList, LoRegList, 2); if (Reg == 0) diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index c4ab99d..c2b7816 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -99,8 +99,8 @@ namespace { // Entries for NEON load/store information table. The table is sorted by // PseudoOpc for fast binary-search lookups. struct NEONLdStTableEntry { - unsigned PseudoOpc; - unsigned RealOpc; + uint16_t PseudoOpc; + uint16_t RealOpc; bool IsLoad; bool isUpdating; bool hasWritebackOperand; @@ -129,16 +129,6 @@ namespace { } static const NEONLdStTableEntry NEONLdStTable[] = { -{ ARM::VLD1DUPq16Pseudo, ARM::VLD1DUPq16, true, false, false, SingleSpc, 2, 4,false}, -{ ARM::VLD1DUPq16PseudoWB_fixed, ARM::VLD1DUPq16wb_fixed, true, true, true, SingleSpc, 2, 4,false}, -{ ARM::VLD1DUPq16PseudoWB_register, ARM::VLD1DUPq16wb_register, true, true, true, SingleSpc, 2, 4,false}, -{ ARM::VLD1DUPq32Pseudo, ARM::VLD1DUPq32, true, false, false, SingleSpc, 2, 2,false}, -{ ARM::VLD1DUPq32PseudoWB_fixed, ARM::VLD1DUPq32wb_fixed, true, true, false, SingleSpc, 2, 2,false}, -{ ARM::VLD1DUPq32PseudoWB_register, ARM::VLD1DUPq32wb_register, true, true, true, SingleSpc, 2, 2,false}, -{ ARM::VLD1DUPq8Pseudo, ARM::VLD1DUPq8, true, false, false, SingleSpc, 2, 8,false}, -{ ARM::VLD1DUPq8PseudoWB_fixed, ARM::VLD1DUPq8wb_fixed, true, true, false, SingleSpc, 2, 8,false}, -{ ARM::VLD1DUPq8PseudoWB_register, ARM::VLD1DUPq8wb_register, true, true, true, SingleSpc, 2, 8,false}, - { ARM::VLD1LNq16Pseudo, ARM::VLD1LNd16, true, false, false, EvenDblSpc, 1, 4 ,true}, { ARM::VLD1LNq16Pseudo_UPD, ARM::VLD1LNd16_UPD, true, true, true, EvenDblSpc, 1, 4 ,true}, { ARM::VLD1LNq32Pseudo, ARM::VLD1LNd32, true, false, false, EvenDblSpc, 1, 2 ,true}, @@ -149,16 +139,6 @@ static const NEONLdStTableEntry NEONLdStTable[] = { { ARM::VLD1d64QPseudo, ARM::VLD1d64Q, true, false, false, SingleSpc, 4, 1 ,false}, { ARM::VLD1d64TPseudo, ARM::VLD1d64T, true, false, false, SingleSpc, 3, 1 ,false}, -{ ARM::VLD2DUPd16Pseudo, ARM::VLD2DUPd16, true, false, false, SingleSpc, 2, 4,false}, -{ ARM::VLD2DUPd16PseudoWB_fixed, ARM::VLD2DUPd16wb_fixed, true, true, false, SingleSpc, 2, 4,false}, -{ ARM::VLD2DUPd16PseudoWB_register, ARM::VLD2DUPd16wb_register, true, true, true, SingleSpc, 2, 4,false}, -{ 
ARM::VLD2DUPd32Pseudo, ARM::VLD2DUPd32, true, false, false, SingleSpc, 2, 2,false}, -{ ARM::VLD2DUPd32PseudoWB_fixed, ARM::VLD2DUPd32wb_fixed, true, true, false, SingleSpc, 2, 2,false}, -{ ARM::VLD2DUPd32PseudoWB_register, ARM::VLD2DUPd32wb_register, true, true, true, SingleSpc, 2, 2,false}, -{ ARM::VLD2DUPd8Pseudo, ARM::VLD2DUPd8, true, false, false, SingleSpc, 2, 8,false}, -{ ARM::VLD2DUPd8PseudoWB_fixed, ARM::VLD2DUPd8wb_fixed, true, true, false, SingleSpc, 2, 8,false}, -{ ARM::VLD2DUPd8PseudoWB_register, ARM::VLD2DUPd8wb_register, true, true, true, SingleSpc, 2, 8,false}, - { ARM::VLD2LNd16Pseudo, ARM::VLD2LNd16, true, false, false, SingleSpc, 2, 4 ,true}, { ARM::VLD2LNd16Pseudo_UPD, ARM::VLD2LNd16_UPD, true, true, true, SingleSpc, 2, 4 ,true}, { ARM::VLD2LNd32Pseudo, ARM::VLD2LNd32, true, false, false, SingleSpc, 2, 2 ,true}, @@ -345,7 +325,7 @@ static const NEONLdStTableEntry NEONLdStTable[] = { /// LookupNEONLdSt - Search the NEONLdStTable for information about a NEON /// load or store pseudo instruction. static const NEONLdStTableEntry *LookupNEONLdSt(unsigned Opcode) { - unsigned NumEntries = array_lengthof(NEONLdStTable); + const unsigned NumEntries = array_lengthof(NEONLdStTable); #ifndef NDEBUG // Make sure the table is sorted. @@ -1090,24 +1070,6 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case ARM::VLD4q8oddPseudo_UPD: case ARM::VLD4q16oddPseudo_UPD: case ARM::VLD4q32oddPseudo_UPD: - case ARM::VLD1DUPq8Pseudo: - case ARM::VLD1DUPq16Pseudo: - case ARM::VLD1DUPq32Pseudo: - case ARM::VLD1DUPq8PseudoWB_fixed: - case ARM::VLD1DUPq16PseudoWB_fixed: - case ARM::VLD1DUPq32PseudoWB_fixed: - case ARM::VLD1DUPq8PseudoWB_register: - case ARM::VLD1DUPq16PseudoWB_register: - case ARM::VLD1DUPq32PseudoWB_register: - case ARM::VLD2DUPd8Pseudo: - case ARM::VLD2DUPd16Pseudo: - case ARM::VLD2DUPd32Pseudo: - case ARM::VLD2DUPd8PseudoWB_fixed: - case ARM::VLD2DUPd16PseudoWB_fixed: - case ARM::VLD2DUPd32PseudoWB_fixed: - case ARM::VLD2DUPd8PseudoWB_register: - case ARM::VLD2DUPd16PseudoWB_register: - case ARM::VLD2DUPd32PseudoWB_register: case ARM::VLD3DUPd8Pseudo: case ARM::VLD3DUPd16Pseudo: case ARM::VLD3DUPd32Pseudo: diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 818b202..a24eab4 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -1384,7 +1384,10 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, SrcVT == MVT::i1) { const APInt &CIVal = ConstInt->getValue(); Imm = (isZExt) ? (int)CIVal.getZExtValue() : (int)CIVal.getSExtValue(); - if (Imm < 0) { + // For INT_MIN/LONG_MIN (i.e., 0x80000000) we need to use a cmp, rather + // then a cmn, because there is no way to represent 2147483648 as a + // signed 32-bit int. + if (Imm < 0 && Imm != (int)0x80000000) { isNegativeImm = true; Imm = -Imm; } @@ -1475,7 +1478,6 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, bool ARMFastISel::SelectCmp(const Instruction *I) { const CmpInst *CI = cast<CmpInst>(I); - Type *Ty = CI->getOperand(0)->getType(); // Get the compare predicate. ARMCC::CondCodes ARMPred = getComparePred(CI->getPredicate()); @@ -1495,11 +1497,10 @@ bool ARMFastISel::SelectCmp(const Instruction *I) { unsigned DestReg = createResultReg(RC); Constant *Zero = ConstantInt::get(Type::getInt32Ty(*Context), 0); unsigned ZeroReg = TargetMaterializeConstant(Zero); - bool isFloat = (Ty->isFloatTy() || Ty->isDoubleTy()); - unsigned CondReg = isFloat ? 
ARM::FPSCR : ARM::CPSR; + // ARMEmitCmp emits a FMSTAT when necessary, so it's always safe to use CPSR. BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(MovCCOpc), DestReg) .addReg(ZeroReg).addImm(1) - .addImm(ARMPred).addReg(CondReg); + .addImm(ARMPred).addReg(ARM::CPSR); UpdateValueMap(I, DestReg); return true; @@ -1851,6 +1852,48 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args, CCState CCInfo(CC, false, *FuncInfo.MF, TM, ArgLocs, *Context); CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CCAssignFnForCall(CC, false)); + // Check that we can handle all of the arguments. If we can't, then bail out + // now before we add code to the MBB. + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + MVT ArgVT = ArgVTs[VA.getValNo()]; + + // We don't handle NEON/vector parameters yet. + if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64) + return false; + + // Now copy/store arg to correct locations. + if (VA.isRegLoc() && !VA.needsCustom()) { + continue; + } else if (VA.needsCustom()) { + // TODO: We need custom lowering for vector (v2f64) args. + if (VA.getLocVT() != MVT::f64 || + // TODO: Only handle register args for now. + !VA.isRegLoc() || !ArgLocs[++i].isRegLoc()) + return false; + } else { + switch (static_cast<EVT>(ArgVT).getSimpleVT().SimpleTy) { + default: + return false; + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + break; + case MVT::f32: + if (!Subtarget->hasVFP2()) + return false; + break; + case MVT::f64: + if (!Subtarget->hasVFP2()) + return false; + break; + } + } + } + + // At the point, we are able to handle the call's arguments in fast isel. + // Get a count of how many bytes are to be pushed on the stack. NumBytes = CCInfo.getNextStackOffset(); @@ -1866,9 +1909,8 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args, unsigned Arg = ArgRegs[VA.getValNo()]; MVT ArgVT = ArgVTs[VA.getValNo()]; - // We don't handle NEON/vector parameters yet. - if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64) - return false; + assert((!ArgVT.isVector() && ArgVT.getSizeInBits() <= 64) && + "We don't handle NEON/vector parameters yet."); // Handle arg promotion, etc. switch (VA.getLocInfo()) { @@ -1908,12 +1950,13 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args, RegArgs.push_back(VA.getLocReg()); } else if (VA.needsCustom()) { // TODO: We need custom lowering for vector (v2f64) args. - if (VA.getLocVT() != MVT::f64) return false; + assert(VA.getLocVT() == MVT::f64 && + "Custom lowering for v2f64 args not available"); CCValAssign &NextVA = ArgLocs[++i]; - // TODO: Only handle register args for now. - if(!(VA.isRegLoc() && NextVA.isRegLoc())) return false; + assert(VA.isRegLoc() && NextVA.isRegLoc() && + "We only handle register args!"); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::VMOVRRD), VA.getLocReg()) @@ -1929,9 +1972,11 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args, Addr.Base.Reg = ARM::SP; Addr.Offset = VA.getLocMemOffset(); - if (!ARMEmitStore(ArgVT, Arg, Addr)) return false; + bool EmitRet = ARMEmitStore(ArgVT, Arg, Addr); (void)EmitRet; + assert(EmitRet && "Could not emit a store for argument!"); } } + return true; } @@ -2136,7 +2181,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { // TODO: Turn this into the table of arm call ops. MachineInstrBuilder MIB; unsigned CallOpc = ARMSelectCallOp(NULL); - if(isThumb2) + if (isThumb2) // Explicitly adding the predicate here. 
MIB = AddDefaultPred(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc))) diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index 0fd6025..bd4b2a9 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -501,7 +501,7 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF, // SP can move around if there are allocas. We may also lose track of SP // when emergency spilling inside a non-reserved call frame setup. - bool hasMovingSP = MFI->hasVarSizedObjects() || !hasReservedCallFrame(MF); + bool hasMovingSP = !hasReservedCallFrame(MF); // When dynamically realigning the stack, use the frame pointer for // parameters, and the stack/base pointer for locals. diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index c99db98..ffb9acb 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -1589,9 +1589,9 @@ static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) { case ARM::VST2q16PseudoWB_fixed: return ARM::VST2q16PseudoWB_register; case ARM::VST2q32PseudoWB_fixed: return ARM::VST2q32PseudoWB_register; - case ARM::VLD2DUPd8PseudoWB_fixed: return ARM::VLD2DUPd8PseudoWB_register; - case ARM::VLD2DUPd16PseudoWB_fixed: return ARM::VLD2DUPd16PseudoWB_register; - case ARM::VLD2DUPd32PseudoWB_fixed: return ARM::VLD2DUPd32PseudoWB_register; + case ARM::VLD2DUPd8wb_fixed: return ARM::VLD2DUPd8wb_register; + case ARM::VLD2DUPd16wb_fixed: return ARM::VLD2DUPd16wb_register; + case ARM::VLD2DUPd32wb_fixed: return ARM::VLD2DUPd32wb_register; } return Opc; // If not one we handle, return it unchanged. } @@ -2891,8 +2891,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { } case ARMISD::VLD2DUP: { - unsigned Opcodes[] = { ARM::VLD2DUPd8Pseudo, ARM::VLD2DUPd16Pseudo, - ARM::VLD2DUPd32Pseudo }; + unsigned Opcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16, + ARM::VLD2DUPd32 }; return SelectVLDDup(N, false, 2, Opcodes); } @@ -2909,9 +2909,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { } case ARMISD::VLD2DUP_UPD: { - unsigned Opcodes[] = { ARM::VLD2DUPd8PseudoWB_fixed, - ARM::VLD2DUPd16PseudoWB_fixed, - ARM::VLD2DUPd32PseudoWB_fixed }; + unsigned Opcodes[] = { ARM::VLD2DUPd8wb_fixed, ARM::VLD2DUPd16wb_fixed, + ARM::VLD2DUPd32wb_fixed }; return SelectVLDDup(N, true, 2, Opcodes); } diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 477b5f4..e26dd22 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -13,10 +13,10 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "arm-isel" +#include "ARMISelLowering.h" #include "ARM.h" #include "ARMCallingConv.h" #include "ARMConstantPoolValue.h" -#include "ARMISelLowering.h" #include "ARMMachineFunctionInfo.h" #include "ARMPerfectShuffle.h" #include "ARMRegisterInfo.h" @@ -49,7 +49,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include <sstream> using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); @@ -87,7 +86,7 @@ namespace { } // The APCS parameter registers. 
-static const unsigned GPRArgRegs[] = { +static const uint16_t GPRArgRegs[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; @@ -456,6 +455,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand); } + setOperationAction(ISD::ConstantFP, MVT::f32, Custom); + if (Subtarget->hasNEON()) { addDRTypeForNEON(MVT::v2f32); addDRTypeForNEON(MVT::v8i8); @@ -3673,6 +3674,27 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { return Result; } +SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) const { + if (!ST->useNEONForSinglePrecisionFP() || !ST->hasVFP3() || ST->hasD16()) + return SDValue(); + + ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op); + assert(Op.getValueType() == MVT::f32 && + "ConstantFP custom lowering should only occur for f32."); + + APFloat FPVal = CFP->getValueAPF(); + int ImmVal = ARM_AM::getFP32Imm(FPVal); + if (ImmVal == -1) + return SDValue(); + + DebugLoc DL = Op.getDebugLoc(); + SDValue NewVal = DAG.getTargetConstant(ImmVal, MVT::i32); + SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32, NewVal); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant, + DAG.getConstant(0, MVT::i32)); +} + /// isNEONModifiedImm - Check if the specified splat value corresponds to a /// valid vector constant for a NEON instruction with a "modified immediate" /// operand (e.g., VMOV). If so, return the encoded value. @@ -5109,6 +5131,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); case ISD::CTTZ: return LowerCTTZ(Op.getNode(), DAG, Subtarget); case ISD::SETCC: return LowerVSETCC(Op, DAG); + case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); @@ -6842,33 +6865,63 @@ static SDValue PerformMULCombine(SDNode *N, if (!C) return SDValue(); - uint64_t MulAmt = C->getZExtValue(); + int64_t MulAmt = C->getSExtValue(); unsigned ShiftAmt = CountTrailingZeros_64(MulAmt); + ShiftAmt = ShiftAmt & (32 - 1); SDValue V = N->getOperand(0); DebugLoc DL = N->getDebugLoc(); SDValue Res; MulAmt >>= ShiftAmt; - if (isPowerOf2_32(MulAmt - 1)) { - // (mul x, 2^N + 1) => (add (shl x, N), x) - Res = DAG.getNode(ISD::ADD, DL, VT, - V, DAG.getNode(ISD::SHL, DL, VT, - V, DAG.getConstant(Log2_32(MulAmt-1), - MVT::i32))); - } else if (isPowerOf2_32(MulAmt + 1)) { - // (mul x, 2^N - 1) => (sub (shl x, N), x) - Res = DAG.getNode(ISD::SUB, DL, VT, - DAG.getNode(ISD::SHL, DL, VT, - V, DAG.getConstant(Log2_32(MulAmt+1), - MVT::i32)), - V); - } else - return SDValue(); + + if (MulAmt >= 0) { + if (isPowerOf2_32(MulAmt - 1)) { + // (mul x, 2^N + 1) => (add (shl x, N), x) + Res = DAG.getNode(ISD::ADD, DL, VT, + V, + DAG.getNode(ISD::SHL, DL, VT, + V, + DAG.getConstant(Log2_32(MulAmt - 1), + MVT::i32))); + } else if (isPowerOf2_32(MulAmt + 1)) { + // (mul x, 2^N - 1) => (sub (shl x, N), x) + Res = DAG.getNode(ISD::SUB, DL, VT, + DAG.getNode(ISD::SHL, DL, VT, + V, + DAG.getConstant(Log2_32(MulAmt + 1), + MVT::i32)), + V); + } else + return SDValue(); + } else { + uint64_t MulAmtAbs = -MulAmt; + if (isPowerOf2_32(MulAmtAbs + 1)) { + // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) + Res = DAG.getNode(ISD::SUB, DL, VT, + V, + DAG.getNode(ISD::SHL, DL, VT, + V, + 
DAG.getConstant(Log2_32(MulAmtAbs + 1), + MVT::i32))); + } else if (isPowerOf2_32(MulAmtAbs - 1)) { + // (mul x, -(2^N + 1)) => - (add (shl x, N), x) + Res = DAG.getNode(ISD::ADD, DL, VT, + V, + DAG.getNode(ISD::SHL, DL, VT, + V, + DAG.getConstant(Log2_32(MulAmtAbs-1), + MVT::i32))); + Res = DAG.getNode(ISD::SUB, DL, VT, + DAG.getConstant(0, MVT::i32),Res); + + } else + return SDValue(); + } if (ShiftAmt != 0) - Res = DAG.getNode(ISD::SHL, DL, VT, Res, - DAG.getConstant(ShiftAmt, MVT::i32)); + Res = DAG.getNode(ISD::SHL, DL, VT, + Res, DAG.getConstant(ShiftAmt, MVT::i32)); // Do not add new nodes to DAG combiner worklist. DCI.CombineTo(N, Res, false); diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 7f12293..a71b74e 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -15,6 +15,7 @@ #ifndef ARMISELLOWERING_H #define ARMISELLOWERING_H +#include "ARM.h" #include "ARMSubtarget.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetRegisterInfo.h" @@ -434,6 +435,8 @@ namespace llvm { SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerConstantFP(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) const; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) const; diff --git a/lib/Target/ARM/ARMInstrInfo.h b/lib/Target/ARM/ARMInstrInfo.h index 7bedf30..72af535 100644 --- a/lib/Target/ARM/ARMInstrInfo.h +++ b/lib/Target/ARM/ARMInstrInfo.h @@ -14,11 +14,11 @@ #ifndef ARMINSTRUCTIONINFO_H #define ARMINSTRUCTIONINFO_H -#include "llvm/Target/TargetInstrInfo.h" +#include "ARM.h" #include "ARMBaseInstrInfo.h" #include "ARMRegisterInfo.h" #include "ARMSubtarget.h" -#include "ARM.h" +#include "llvm/Target/TargetInstrInfo.h" namespace llvm { class ARMSubtarget; diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 0b1406e..8196582 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -637,6 +637,7 @@ def BitfieldAsmOperand : AsmOperandClass { let Name = "Bitfield"; let ParserMethod = "parseBitfield"; } + def bf_inv_mask_imm : Operand<i32>, PatLeaf<(imm), [{ return ARM::isBitFieldInvertedMask(N->getZExtValue()); @@ -4084,74 +4085,43 @@ def MVNCCi : ARMPseudoInst<(outs GPR:$Rd), [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_imm_not:$imm, imm:$cc, CCR:$ccr))*/]>, RegConstraint<"$false = $Rd">; -let isCodeGenOnly = 1 in { // Conditional instructions -multiclass AsI1_bincc_irs<bits<4> opcod, string opc, - InstrItinClass iii, InstrItinClass iir, InstrItinClass iis> { - def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm, - iii, opc, "\t$Rd, $Rn, $imm", []>, - RegConstraint<"$Rn = $Rd"> { - bits<4> Rd; - bits<4> Rn; - bits<12> imm; - let Inst{25} = 1; - let Inst{19-16} = Rn; - let Inst{15-12} = Rd; - let Inst{11-0} = imm; - } - def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, - iir, opc, "\t$Rd, $Rn, $Rm", []>, - RegConstraint<"$Rn = $Rd"> { - bits<4> Rd; - bits<4> Rn; - bits<4> Rm; - let Inst{25} = 0; - let Inst{19-16} = Rn; - let Inst{15-12} = Rd; - let Inst{11-4} = 0b00000000; - let Inst{3-0} = Rm; - } - - def rsi : AsI1<opcod, (outs GPR:$Rd), - (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, - iis, opc, "\t$Rd, $Rn, $shift", []>, - RegConstraint<"$Rn = $Rd"> { - bits<4> Rd; - bits<4> Rn; - bits<12> shift; - let Inst{25} = 0; - let 
Inst{19-16} = Rn; - let Inst{15-12} = Rd; - let Inst{11-5} = shift{11-5}; - let Inst{4} = 0; - let Inst{3-0} = shift{3-0}; - } - - def rsr : AsI1<opcod, (outs GPR:$Rd), - (ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, - iis, opc, "\t$Rd, $Rn, $shift", []>, - RegConstraint<"$Rn = $Rd"> { - bits<4> Rd; - bits<4> Rn; - bits<12> shift; - let Inst{25} = 0; - let Inst{19-16} = Rn; - let Inst{15-12} = Rd; - let Inst{11-8} = shift{11-8}; - let Inst{7} = 0; - let Inst{6-5} = shift{6-5}; - let Inst{4} = 1; - let Inst{3-0} = shift{3-0}; - } -} // AsI1_bincc_irs - -defm ANDCC : AsI1_bincc_irs<0b0000, "and", IIC_iBITi, IIC_iBITr, IIC_iBITsr>; -defm ORRCC : AsI1_bincc_irs<0b1100, "orr", IIC_iBITi, IIC_iBITr, IIC_iBITsr>; -defm EORCC : AsI1_bincc_irs<0b0001, "eor", IIC_iBITi, IIC_iBITr, IIC_iBITsr>; +multiclass AsI1_bincc_irs<Instruction iri, Instruction irr, Instruction irsi, + Instruction irsr, + InstrItinClass iii, InstrItinClass iir, + InstrItinClass iis> { + def ri : ARMPseudoExpand<(outs GPR:$Rd), + (ins GPR:$Rn, so_imm:$imm, pred:$p, cc_out:$s), + 4, iii, [], + (iri GPR:$Rd, GPR:$Rn, so_imm:$imm, pred:$p, cc_out:$s)>, + RegConstraint<"$Rn = $Rd">; + def rr : ARMPseudoExpand<(outs GPR:$Rd), + (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s), + 4, iir, [], + (irr GPR:$Rd, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>, + RegConstraint<"$Rn = $Rd">; + def rsi : ARMPseudoExpand<(outs GPR:$Rd), + (ins GPR:$Rn, so_reg_imm:$shift, pred:$p, cc_out:$s), + 4, iis, [], + (irsi GPR:$Rd, GPR:$Rn, so_reg_imm:$shift, pred:$p, cc_out:$s)>, + RegConstraint<"$Rn = $Rd">; + def rsr : ARMPseudoExpand<(outs GPRnopc:$Rd), + (ins GPRnopc:$Rn, so_reg_reg:$shift, pred:$p, cc_out:$s), + 4, iis, [], + (irsr GPR:$Rd, GPR:$Rn, so_reg_reg:$shift, pred:$p, cc_out:$s)>, + RegConstraint<"$Rn = $Rd">; +} + +defm ANDCC : AsI1_bincc_irs<ANDri, ANDrr, ANDrsi, ANDrsr, + IIC_iBITi, IIC_iBITr, IIC_iBITsr>; +defm ORRCC : AsI1_bincc_irs<ORRri, ORRrr, ORRrsi, ORRrsr, + IIC_iBITi, IIC_iBITr, IIC_iBITsr>; +defm EORCC : AsI1_bincc_irs<EORri, EORrr, EORrsi, EORrsr, + IIC_iBITi, IIC_iBITr, IIC_iBITsr>; -} // isCodeGenOnly } // neverHasSideEffects + //===----------------------------------------------------------------------===// // Atomic operations intrinsics // @@ -4605,10 +4575,16 @@ def MCR : MovRCopro<"mcr", 0 /* from ARM core register to coprocessor */, c_imm:$CRm, imm0_7:$opc2), [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn, imm:$CRm, imm:$opc2)]>; +def : ARMInstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm", + (MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, + c_imm:$CRm, 0, pred:$p)>; def MRC : MovRCopro<"mrc", 1 /* from coprocessor to ARM core register */, (outs GPR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), []>; +def : ARMInstAlias<"mrc${p} $cop, $opc1, $Rt, $CRn, $CRm", + (MRC GPR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, + c_imm:$CRm, 0, pred:$p)>; def : ARMPat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2), (MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>; @@ -4642,10 +4618,16 @@ def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */, c_imm:$CRm, imm0_7:$opc2), [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn, imm:$CRm, imm:$opc2)]>; +def : ARMInstAlias<"mcr2$ $cop, $opc1, $Rt, $CRn, $CRm", + (MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, + c_imm:$CRm, 0)>; def MRC2 : MovRCopro2<"mrc2", 1 /* from coprocessor to ARM core register */, (outs GPR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), []>; +def : 
ARMInstAlias<"mrc2$ $cop, $opc1, $Rt, $CRn, $CRm", + (MRC2 GPR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, + c_imm:$CRm, 0)>; def : ARMV5TPat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2), @@ -5252,6 +5234,20 @@ def : ARMInstAlias<"mul${s}${p} $Rn, $Rm", def : ARMInstAlias<"neg${s}${p} $Rd, $Rm", (RSBri GPR:$Rd, GPR:$Rm, 0, pred:$p, cc_out:$s)>; +// Pre-v6, 'mov r0, r0' was used as a NOP encoding. +def : InstAlias<"nop${p}", (MOVr R0, R0, pred:$p, zero_reg)>, + Requires<[IsARM, NoV6]>; + +// UMULL/SMULL are available on all arches, but the instruction definitions +// need difference constraints pre-v6. Use these aliases for the assembly +// parsing on pre-v6. +def : InstAlias<"smull${s}${p} $RdLo, $RdHi, $Rn, $Rm", + (SMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>, + Requires<[IsARM, NoV6]>; +def : InstAlias<"umull${s}${p} $RdLo, $RdHi, $Rn, $Rm", + (UMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>, + Requires<[IsARM, NoV6]>; + // 'it' blocks in ARM mode just validate the predicates. The IT itself // is discarded. def ITasm : ARMAsmPseudo<"it$mask $cc", (ins it_pred:$cc, it_mask:$mask)>; diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 8684ce1..f61eb2b 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -94,7 +94,7 @@ def VecListDPairAsmOperand : AsmOperandClass { let ParserMethod = "parseVectorList"; let RenderMethod = "addVecListOperands"; } -def VecListDPair : RegisterOperand<DPair, "printVectorListDPair"> { +def VecListDPair : RegisterOperand<DPair, "printVectorListTwo"> { let ParserMatchClass = VecListDPairAsmOperand; } // Register list of three sequential D registers. @@ -121,7 +121,7 @@ def VecListDPairSpacedAsmOperand : AsmOperandClass { let ParserMethod = "parseVectorList"; let RenderMethod = "addVecListOperands"; } -def VecListDPairSpaced : RegisterOperand<DPair, "printVectorListDPairSpaced"> { +def VecListDPairSpaced : RegisterOperand<DPair, "printVectorListTwoSpaced"> { let ParserMatchClass = VecListDPairSpacedAsmOperand; } // Register list of three D registers spaced by 2 (three Q registers). @@ -153,23 +153,24 @@ def VecListOneDAllLanes : RegisterOperand<DPR, "printVectorListOneAllLanes"> { let ParserMatchClass = VecListOneDAllLanesAsmOperand; } // Register list of two D registers, with "all lanes" subscripting. -def VecListTwoDAllLanesAsmOperand : AsmOperandClass { - let Name = "VecListTwoDAllLanes"; +def VecListDPairAllLanesAsmOperand : AsmOperandClass { + let Name = "VecListDPairAllLanes"; let ParserMethod = "parseVectorList"; let RenderMethod = "addVecListOperands"; } -def VecListTwoDAllLanes : RegisterOperand<DPR, "printVectorListTwoAllLanes"> { - let ParserMatchClass = VecListTwoDAllLanesAsmOperand; +def VecListDPairAllLanes : RegisterOperand<DPair, + "printVectorListTwoAllLanes"> { + let ParserMatchClass = VecListDPairAllLanesAsmOperand; } // Register list of two D registers spaced by 2 (two sequential Q registers). 
-def VecListTwoQAllLanesAsmOperand : AsmOperandClass { - let Name = "VecListTwoQAllLanes"; +def VecListDPairSpacedAllLanesAsmOperand : AsmOperandClass { + let Name = "VecListDPairSpacedAllLanes"; let ParserMethod = "parseVectorList"; let RenderMethod = "addVecListOperands"; } -def VecListTwoQAllLanes : RegisterOperand<DPR, +def VecListDPairSpacedAllLanes : RegisterOperand<DPair, "printVectorListTwoSpacedAllLanes"> { - let ParserMatchClass = VecListTwoQAllLanesAsmOperand; + let ParserMatchClass = VecListDPairSpacedAllLanesAsmOperand; } // Register list of three D registers, with "all lanes" subscripting. def VecListThreeDAllLanesAsmOperand : AsmOperandClass { @@ -1276,39 +1277,32 @@ class VLD1DUP<bits<4> op7_4, string Dt, ValueType Ty, PatFrag LoadOp> let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD1DupInstruction"; } -class VLD1QDUPPseudo<ValueType Ty, PatFrag LoadOp> : VLDQPseudo<IIC_VLD1dup> { - let Pattern = [(set QPR:$dst, - (Ty (NEONvdup (i32 (LoadOp addrmode6dup:$addr)))))]; -} - def VLD1DUPd8 : VLD1DUP<{0,0,0,?}, "8", v8i8, extloadi8>; def VLD1DUPd16 : VLD1DUP<{0,1,0,?}, "16", v4i16, extloadi16>; def VLD1DUPd32 : VLD1DUP<{1,0,0,?}, "32", v2i32, load>; -def VLD1DUPq8Pseudo : VLD1QDUPPseudo<v16i8, extloadi8>; -def VLD1DUPq16Pseudo : VLD1QDUPPseudo<v8i16, extloadi16>; -def VLD1DUPq32Pseudo : VLD1QDUPPseudo<v4i32, load>; - def : Pat<(v2f32 (NEONvdup (f32 (load addrmode6dup:$addr)))), (VLD1DUPd32 addrmode6:$addr)>; -def : Pat<(v4f32 (NEONvdup (f32 (load addrmode6dup:$addr)))), - (VLD1DUPq32Pseudo addrmode6:$addr)>; -let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { - -class VLD1QDUP<bits<4> op7_4, string Dt> - : NLdSt<1, 0b10, 0b1100, op7_4, (outs VecListTwoDAllLanes:$Vd), +class VLD1QDUP<bits<4> op7_4, string Dt, ValueType Ty, PatFrag LoadOp> + : NLdSt<1, 0b10, 0b1100, op7_4, (outs VecListDPairAllLanes:$Vd), (ins addrmode6dup:$Rn), IIC_VLD1dup, - "vld1", Dt, "$Vd, $Rn", "", []> { + "vld1", Dt, "$Vd, $Rn", "", + [(set VecListDPairAllLanes:$Vd, + (Ty (NEONvdup (i32 (LoadOp addrmode6dup:$Rn)))))]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD1DupInstruction"; } -def VLD1DUPq8 : VLD1QDUP<{0,0,1,0}, "8">; -def VLD1DUPq16 : VLD1QDUP<{0,1,1,?}, "16">; -def VLD1DUPq32 : VLD1QDUP<{1,0,1,?}, "32">; +def VLD1DUPq8 : VLD1QDUP<{0,0,1,0}, "8", v16i8, extloadi8>; +def VLD1DUPq16 : VLD1QDUP<{0,1,1,?}, "16", v8i16, extloadi16>; +def VLD1DUPq32 : VLD1QDUP<{1,0,1,?}, "32", v4i32, load>; +def : Pat<(v4f32 (NEONvdup (f32 (load addrmode6dup:$addr)))), + (VLD1DUPq32 addrmode6:$addr)>; + +let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in { // ...with address register writeback: multiclass VLD1DUPWB<bits<4> op7_4, string Dt> { def _fixed : NLdSt<1, 0b10, 0b1100, op7_4, @@ -1333,7 +1327,7 @@ multiclass VLD1DUPWB<bits<4> op7_4, string Dt> { } multiclass VLD1QDUPWB<bits<4> op7_4, string Dt> { def _fixed : NLdSt<1, 0b10, 0b1100, op7_4, - (outs VecListTwoDAllLanes:$Vd, GPR:$wb), + (outs VecListDPairAllLanes:$Vd, GPR:$wb), (ins addrmode6dup:$Rn), IIC_VLD1dupu, "vld1", Dt, "$Vd, $Rn!", "$Rn.addr = $wb", []> { @@ -1343,7 +1337,7 @@ multiclass VLD1QDUPWB<bits<4> op7_4, string Dt> { let AsmMatchConverter = "cvtVLDwbFixed"; } def _register : NLdSt<1, 0b10, 0b1100, op7_4, - (outs VecListTwoDAllLanes:$Vd, GPR:$wb), + (outs VecListDPairAllLanes:$Vd, GPR:$wb), (ins addrmode6dup:$Rn, rGPR:$Rm), IIC_VLD1dupu, "vld1", Dt, "$Vd, $Rn, $Rm", "$Rn.addr = $wb", []> { @@ -1361,13 +1355,6 @@ defm VLD1DUPq8wb : VLD1QDUPWB<{0,0,1,0}, "8">; defm VLD1DUPq16wb : 
VLD1QDUPWB<{0,1,1,?}, "16">; defm VLD1DUPq32wb : VLD1QDUPWB<{1,0,1,?}, "32">; -def VLD1DUPq8PseudoWB_fixed : VLDQWBfixedPseudo<IIC_VLD1dupu>; -def VLD1DUPq16PseudoWB_fixed : VLDQWBfixedPseudo<IIC_VLD1dupu>; -def VLD1DUPq32PseudoWB_fixed : VLDQWBfixedPseudo<IIC_VLD1dupu>; -def VLD1DUPq8PseudoWB_register : VLDQWBregisterPseudo<IIC_VLD1dupu>; -def VLD1DUPq16PseudoWB_register : VLDQWBregisterPseudo<IIC_VLD1dupu>; -def VLD1DUPq32PseudoWB_register : VLDQWBregisterPseudo<IIC_VLD1dupu>; - // VLD2DUP : Vector Load (single 2-element structure to all lanes) class VLD2DUP<bits<4> op7_4, string Dt, RegisterOperand VdTy> : NLdSt<1, 0b10, 0b1101, op7_4, (outs VdTy:$Vd), @@ -1378,18 +1365,14 @@ class VLD2DUP<bits<4> op7_4, string Dt, RegisterOperand VdTy> let DecoderMethod = "DecodeVLD2DupInstruction"; } -def VLD2DUPd8 : VLD2DUP<{0,0,0,?}, "8", VecListTwoDAllLanes>; -def VLD2DUPd16 : VLD2DUP<{0,1,0,?}, "16", VecListTwoDAllLanes>; -def VLD2DUPd32 : VLD2DUP<{1,0,0,?}, "32", VecListTwoDAllLanes>; +def VLD2DUPd8 : VLD2DUP<{0,0,0,?}, "8", VecListDPairAllLanes>; +def VLD2DUPd16 : VLD2DUP<{0,1,0,?}, "16", VecListDPairAllLanes>; +def VLD2DUPd32 : VLD2DUP<{1,0,0,?}, "32", VecListDPairAllLanes>; -def VLD2DUPd8Pseudo : VLDQPseudo<IIC_VLD2dup>; -def VLD2DUPd16Pseudo : VLDQPseudo<IIC_VLD2dup>; -def VLD2DUPd32Pseudo : VLDQPseudo<IIC_VLD2dup>; - -// ...with double-spaced registers (not used for codegen): -def VLD2DUPd8x2 : VLD2DUP<{0,0,1,?}, "8", VecListTwoQAllLanes>; -def VLD2DUPd16x2 : VLD2DUP<{0,1,1,?}, "16", VecListTwoQAllLanes>; -def VLD2DUPd32x2 : VLD2DUP<{1,0,1,?}, "32", VecListTwoQAllLanes>; +// ...with double-spaced registers +def VLD2DUPd8x2 : VLD2DUP<{0,0,1,?}, "8", VecListDPairSpacedAllLanes>; +def VLD2DUPd16x2 : VLD2DUP<{0,1,1,?}, "16", VecListDPairSpacedAllLanes>; +def VLD2DUPd32x2 : VLD2DUP<{1,0,1,?}, "32", VecListDPairSpacedAllLanes>; // ...with address register writeback: multiclass VLD2DUPWB<bits<4> op7_4, string Dt, RegisterOperand VdTy> { @@ -1414,20 +1397,13 @@ multiclass VLD2DUPWB<bits<4> op7_4, string Dt, RegisterOperand VdTy> { } } -defm VLD2DUPd8wb : VLD2DUPWB<{0,0,0,0}, "8", VecListTwoDAllLanes>; -defm VLD2DUPd16wb : VLD2DUPWB<{0,1,0,?}, "16", VecListTwoDAllLanes>; -defm VLD2DUPd32wb : VLD2DUPWB<{1,0,0,?}, "32", VecListTwoDAllLanes>; - -defm VLD2DUPd8x2wb : VLD2DUPWB<{0,0,1,0}, "8", VecListTwoQAllLanes>; -defm VLD2DUPd16x2wb : VLD2DUPWB<{0,1,1,?}, "16", VecListTwoQAllLanes>; -defm VLD2DUPd32x2wb : VLD2DUPWB<{1,0,1,?}, "32", VecListTwoQAllLanes>; +defm VLD2DUPd8wb : VLD2DUPWB<{0,0,0,0}, "8", VecListDPairAllLanes>; +defm VLD2DUPd16wb : VLD2DUPWB<{0,1,0,?}, "16", VecListDPairAllLanes>; +defm VLD2DUPd32wb : VLD2DUPWB<{1,0,0,?}, "32", VecListDPairAllLanes>; -def VLD2DUPd8PseudoWB_fixed : VLDQWBfixedPseudo <IIC_VLD2dupu>; -def VLD2DUPd8PseudoWB_register : VLDQWBregisterPseudo<IIC_VLD2dupu>; -def VLD2DUPd16PseudoWB_fixed : VLDQWBfixedPseudo <IIC_VLD2dupu>; -def VLD2DUPd16PseudoWB_register : VLDQWBregisterPseudo<IIC_VLD2dupu>; -def VLD2DUPd32PseudoWB_fixed : VLDQWBfixedPseudo <IIC_VLD2dupu>; -def VLD2DUPd32PseudoWB_register : VLDQWBregisterPseudo<IIC_VLD2dupu>; +defm VLD2DUPd8x2wb : VLD2DUPWB<{0,0,1,0}, "8", VecListDPairSpacedAllLanes>; +defm VLD2DUPd16x2wb : VLD2DUPWB<{0,1,1,?}, "16", VecListDPairSpacedAllLanes>; +defm VLD2DUPd32x2wb : VLD2DUPWB<{1,0,1,?}, "32", VecListDPairSpacedAllLanes>; // VLD3DUP : Vector Load (single 3-element structure to all lanes) class VLD3DUP<bits<4> op7_4, string Dt> diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index e8984e1..1f7edc1 
100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -574,7 +574,7 @@ multiclass T2I_bin_w_irs<bits<4> opcod, string opc, cc_out:$s)>; // and with the optional destination operand, too. - def : t2InstAlias<!strconcat(opc, "${s}${p}.ri", " $Rdn, $imm"), + def : t2InstAlias<!strconcat(opc, "${s}${p}.w", " $Rdn, $imm"), (!cast<Instruction>(!strconcat(baseOpc, "ri")) rGPR:$Rdn, rGPR:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>; @@ -2952,45 +2952,36 @@ def t2MOVCCror : T2I_movcc_sh<0b11, (outs rGPR:$Rd), (ins rGPR:$false, rGPR:$Rm, i32imm:$imm), IIC_iCMOVsi, "ror", ".w\t$Rd, $Rm, $imm", []>, RegConstraint<"$false = $Rd">; +} // isCodeGenOnly = 1 -multiclass T2I_bincc_irs<bits<4> opcod, string opc, +multiclass T2I_bincc_irs<Instruction iri, Instruction irr, Instruction irs, InstrItinClass iii, InstrItinClass iir, InstrItinClass iis> { // shifted imm - def ri : T2sTwoRegImm<(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), - iii, opc, ".w\t$Rd, $Rn, $imm", []>, - RegConstraint<"$Rn = $Rd"> { - let Inst{31-27} = 0b11110; - let Inst{25} = 0; - let Inst{24-21} = opcod; - let Inst{15} = 0; - } + def ri : t2PseudoExpand<(outs rGPR:$Rd), + (ins rGPR:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s), + 4, iii, [], + (iri rGPR:$Rd, rGPR:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>, + RegConstraint<"$Rn = $Rd">; // register - def rr : T2sThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), - iir, opc, ".w\t$Rd, $Rn, $Rm", []>, - RegConstraint<"$Rn = $Rd"> { - let Inst{31-27} = 0b11101; - let Inst{26-25} = 0b01; - let Inst{24-21} = opcod; - let Inst{14-12} = 0b000; // imm3 - let Inst{7-6} = 0b00; // imm2 - let Inst{5-4} = 0b00; // type - } + def rr : t2PseudoExpand<(outs rGPR:$Rd), + (ins rGPR:$Rn, rGPR:$Rm, pred:$p, cc_out:$s), + 4, iir, [], + (irr rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>, + RegConstraint<"$Rn = $Rd">; // shifted register - def rs : T2sTwoRegShiftedReg<(outs rGPR:$Rd), - (ins rGPR:$Rn, t2_so_reg:$ShiftedRm), - iis, opc, ".w\t$Rd, $Rn, $ShiftedRm", []>, - RegConstraint<"$Rn = $Rd"> { - let Inst{31-27} = 0b11101; - let Inst{26-25} = 0b01; - let Inst{24-21} = opcod; - } + def rs : t2PseudoExpand<(outs rGPR:$Rd), + (ins rGPR:$Rn, t2_so_reg:$ShiftedRm, pred:$p, cc_out:$s), + 4, iis, [], + (irs rGPR:$Rd, rGPR:$Rn, t2_so_reg:$ShiftedRm, pred:$p, cc_out:$s)>, + RegConstraint<"$Rn = $Rd">; } // T2I_bincc_irs -defm t2ANDCC : T2I_bincc_irs<0b0000, "and", IIC_iBITi, IIC_iBITr, IIC_iBITsi>; -defm t2ORRCC : T2I_bincc_irs<0b0010, "orr", IIC_iBITi, IIC_iBITr, IIC_iBITsi>; -defm t2EORCC : T2I_bincc_irs<0b0100, "eor", IIC_iBITi, IIC_iBITr, IIC_iBITsi>; - -} // isCodeGenOnly = 1 +defm t2ANDCC : T2I_bincc_irs<t2ANDri, t2ANDrr, t2ANDrs, + IIC_iBITi, IIC_iBITr, IIC_iBITsi>; +defm t2ORRCC : T2I_bincc_irs<t2ORRri, t2ORRrr, t2ORRrs, + IIC_iBITi, IIC_iBITr, IIC_iBITsi>; +defm t2EORCC : T2I_bincc_irs<t2EORri, t2EORrr, t2EORrs, + IIC_iBITi, IIC_iBITr, IIC_iBITsi>; } // neverHasSideEffects //===----------------------------------------------------------------------===// @@ -3768,20 +3759,32 @@ def t2MCR : t2MovRCopro<0b1110, "mcr", 0, c_imm:$CRm, imm0_7:$opc2), [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn, imm:$CRm, imm:$opc2)]>; +def : t2InstAlias<"mcr $cop, $opc1, $Rt, $CRn, $CRm", + (t2MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, + c_imm:$CRm, 0)>; def t2MCR2 : t2MovRCopro<0b1111, "mcr2", 0, (outs), (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn, imm:$CRm, imm:$opc2)]>; +def : 
t2InstAlias<"mcr2 $cop, $opc1, $Rt, $CRn, $CRm", + (t2MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn, + c_imm:$CRm, 0)>; /* from coprocessor to ARM core register */ def t2MRC : t2MovRCopro<0b1110, "mrc", 1, (outs GPR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), []>; +def : t2InstAlias<"mrc $cop, $opc1, $Rt, $CRn, $CRm", + (t2MRC GPR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, + c_imm:$CRm, 0)>; def t2MRC2 : t2MovRCopro<0b1111, "mrc2", 1, (outs GPR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), []>; +def : t2InstAlias<"mrc2 $cop, $opc1, $Rt, $CRn, $CRm", + (t2MRC2 GPR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, + c_imm:$CRm, 0)>; def : T2v6Pat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2), (t2MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>; diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td index aa10af7..e9d5720 100644 --- a/lib/Target/ARM/ARMInstrVFP.td +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -206,6 +206,14 @@ def : InstAlias<"vpop${p} $r", (VLDMDIA_UPD SP, pred:$p, dpr_reglist:$r)>, Requires<[HasVFP2]>; def : InstAlias<"vpop${p} $r", (VLDMSIA_UPD SP, pred:$p, spr_reglist:$r)>, Requires<[HasVFP2]>; +defm : VFPDTAnyInstAlias<"vpush${p}", "$r", + (VSTMSDB_UPD SP, pred:$p, spr_reglist:$r)>; +defm : VFPDTAnyInstAlias<"vpush${p}", "$r", + (VSTMDDB_UPD SP, pred:$p, dpr_reglist:$r)>; +defm : VFPDTAnyInstAlias<"vpop${p}", "$r", + (VLDMSIA_UPD SP, pred:$p, spr_reglist:$r)>; +defm : VFPDTAnyInstAlias<"vpop${p}", "$r", + (VLDMDIA_UPD SP, pred:$p, dpr_reglist:$r)>; // FLDMX, FSTMX - mixing S/D registers for pre-armv6 cores @@ -286,7 +294,7 @@ def : Pat<(fmul (fneg SPR:$a), SPR:$b), (VNMULS SPR:$a, SPR:$b)>, Requires<[NoHonorSignDependentRounding]>; // These are encoded as unary instructions. -let Defs = [FPSCR] in { +let Defs = [FPSCR_NZCV] in { def VCMPED : ADuI<0b11101, 0b11, 0b0100, 0b11, 0, (outs), (ins DPR:$Dd, DPR:$Dm), IIC_fpCMP64, "vcmpe", ".f64\t$Dd, $Dm", @@ -315,7 +323,7 @@ def VCMPS : ASuI<0b11101, 0b11, 0b0100, 0b01, 0, // VFP pipelines on A8. let D = VFPNeonA8Domain; } -} // Defs = [FPSCR] +} // Defs = [FPSCR_NZCV] //===----------------------------------------------------------------------===// // FP Unary Operations. @@ -335,7 +343,7 @@ def VABSS : ASuIn<0b11101, 0b11, 0b0000, 0b11, 0, let D = VFPNeonA8Domain; } -let Defs = [FPSCR] in { +let Defs = [FPSCR_NZCV] in { def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0, (outs), (ins DPR:$Dd), IIC_fpCMP64, "vcmpe", ".f64\t$Dd, #0", @@ -376,7 +384,7 @@ def VCMPZS : ASuI<0b11101, 0b11, 0b0101, 0b01, 0, // VFP pipelines on A8. 
let D = VFPNeonA8Domain; } -} // Defs = [FPSCR] +} // Defs = [FPSCR_NZCV] def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0, (outs DPR:$Dd), (ins SPR:$Sm), @@ -810,7 +818,29 @@ let Constraints = "$a = $dst" in { // FP to Fixed-Point: -def VTOSHS : AVConv1XI<0b11101, 0b11, 0b1110, 0b1010, 0, +// Single Precision register +class AVConv1XInsS_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4, bit op5, + dag oops, dag iops, InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern> { + bits<5> dst; + // if dp_operation then UInt(D:Vd) else UInt(Vd:D); + let Inst{22} = dst{0}; + let Inst{15-12} = dst{4-1}; +} + +// Double Precision register +class AVConv1XInsD_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4, bit op5, + dag oops, dag iops, InstrItinClass itin, string opc, string asm, + list<dag> pattern> + : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern> { + bits<5> dst; + // if dp_operation then UInt(D:Vd) else UInt(Vd:D); + let Inst{22} = dst{4}; + let Inst{15-12} = dst{3-0}; +} + +def VTOSHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 0, (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits), IIC_fpCVTSI, "vcvt", ".s16.f32\t$dst, $a, $fbits", []> { // Some single precision VFP instructions may be executed on both NEON and @@ -818,7 +848,7 @@ def VTOSHS : AVConv1XI<0b11101, 0b11, 0b1110, 0b1010, 0, let D = VFPNeonA8Domain; } -def VTOUHS : AVConv1XI<0b11101, 0b11, 0b1111, 0b1010, 0, +def VTOUHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1010, 0, (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits), IIC_fpCVTSI, "vcvt", ".u16.f32\t$dst, $a, $fbits", []> { // Some single precision VFP instructions may be executed on both NEON and @@ -826,7 +856,7 @@ def VTOUHS : AVConv1XI<0b11101, 0b11, 0b1111, 0b1010, 0, let D = VFPNeonA8Domain; } -def VTOSLS : AVConv1XI<0b11101, 0b11, 0b1110, 0b1010, 1, +def VTOSLS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 1, (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits), IIC_fpCVTSI, "vcvt", ".s32.f32\t$dst, $a, $fbits", []> { // Some single precision VFP instructions may be executed on both NEON and @@ -834,7 +864,7 @@ def VTOSLS : AVConv1XI<0b11101, 0b11, 0b1110, 0b1010, 1, let D = VFPNeonA8Domain; } -def VTOULS : AVConv1XI<0b11101, 0b11, 0b1111, 0b1010, 1, +def VTOULS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1010, 1, (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits), IIC_fpCVTSI, "vcvt", ".u32.f32\t$dst, $a, $fbits", []> { // Some single precision VFP instructions may be executed on both NEON and @@ -842,25 +872,25 @@ def VTOULS : AVConv1XI<0b11101, 0b11, 0b1111, 0b1010, 1, let D = VFPNeonA8Domain; } -def VTOSHD : AVConv1XI<0b11101, 0b11, 0b1110, 0b1011, 0, +def VTOSHD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1110, 0b1011, 0, (outs DPR:$dst), (ins DPR:$a, fbits16:$fbits), IIC_fpCVTDI, "vcvt", ".s16.f64\t$dst, $a, $fbits", []>; -def VTOUHD : AVConv1XI<0b11101, 0b11, 0b1111, 0b1011, 0, +def VTOUHD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1111, 0b1011, 0, (outs DPR:$dst), (ins DPR:$a, fbits16:$fbits), IIC_fpCVTDI, "vcvt", ".u16.f64\t$dst, $a, $fbits", []>; -def VTOSLD : AVConv1XI<0b11101, 0b11, 0b1110, 0b1011, 1, +def VTOSLD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1110, 0b1011, 1, (outs DPR:$dst), (ins DPR:$a, fbits32:$fbits), IIC_fpCVTDI, "vcvt", ".s32.f64\t$dst, $a, $fbits", []>; -def VTOULD : AVConv1XI<0b11101, 0b11, 0b1111, 0b1011, 1, +def VTOULD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1111, 0b1011, 1, (outs DPR:$dst), (ins DPR:$a, 
fbits32:$fbits), IIC_fpCVTDI, "vcvt", ".u32.f64\t$dst, $a, $fbits", []>; // Fixed-Point to FP: -def VSHTOS : AVConv1XI<0b11101, 0b11, 0b1010, 0b1010, 0, +def VSHTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1010, 0, (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits), IIC_fpCVTIS, "vcvt", ".f32.s16\t$dst, $a, $fbits", []> { // Some single precision VFP instructions may be executed on both NEON and @@ -868,7 +898,7 @@ def VSHTOS : AVConv1XI<0b11101, 0b11, 0b1010, 0b1010, 0, let D = VFPNeonA8Domain; } -def VUHTOS : AVConv1XI<0b11101, 0b11, 0b1011, 0b1010, 0, +def VUHTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1010, 0, (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits), IIC_fpCVTIS, "vcvt", ".f32.u16\t$dst, $a, $fbits", []> { // Some single precision VFP instructions may be executed on both NEON and @@ -876,7 +906,7 @@ def VUHTOS : AVConv1XI<0b11101, 0b11, 0b1011, 0b1010, 0, let D = VFPNeonA8Domain; } -def VSLTOS : AVConv1XI<0b11101, 0b11, 0b1010, 0b1010, 1, +def VSLTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1010, 1, (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits), IIC_fpCVTIS, "vcvt", ".f32.s32\t$dst, $a, $fbits", []> { // Some single precision VFP instructions may be executed on both NEON and @@ -884,7 +914,7 @@ def VSLTOS : AVConv1XI<0b11101, 0b11, 0b1010, 0b1010, 1, let D = VFPNeonA8Domain; } -def VULTOS : AVConv1XI<0b11101, 0b11, 0b1011, 0b1010, 1, +def VULTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1010, 1, (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits), IIC_fpCVTIS, "vcvt", ".f32.u32\t$dst, $a, $fbits", []> { // Some single precision VFP instructions may be executed on both NEON and @@ -892,19 +922,19 @@ def VULTOS : AVConv1XI<0b11101, 0b11, 0b1011, 0b1010, 1, let D = VFPNeonA8Domain; } -def VSHTOD : AVConv1XI<0b11101, 0b11, 0b1010, 0b1011, 0, +def VSHTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1010, 0b1011, 0, (outs DPR:$dst), (ins DPR:$a, fbits16:$fbits), IIC_fpCVTID, "vcvt", ".f64.s16\t$dst, $a, $fbits", []>; -def VUHTOD : AVConv1XI<0b11101, 0b11, 0b1011, 0b1011, 0, +def VUHTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1011, 0b1011, 0, (outs DPR:$dst), (ins DPR:$a, fbits16:$fbits), IIC_fpCVTID, "vcvt", ".f64.u16\t$dst, $a, $fbits", []>; -def VSLTOD : AVConv1XI<0b11101, 0b11, 0b1010, 0b1011, 1, +def VSLTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1010, 0b1011, 1, (outs DPR:$dst), (ins DPR:$a, fbits32:$fbits), IIC_fpCVTID, "vcvt", ".f64.s32\t$dst, $a, $fbits", []>; -def VULTOD : AVConv1XI<0b11101, 0b11, 0b1011, 0b1011, 1, +def VULTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1011, 0b1011, 1, (outs DPR:$dst), (ins DPR:$a, fbits32:$fbits), IIC_fpCVTID, "vcvt", ".f64.u32\t$dst, $a, $fbits", []>; @@ -1166,9 +1196,9 @@ class MovFromVFP<bits<4> opc19_16, dag oops, dag iops, string opc, string asm, // APSR is the application level alias of CPSR. This FPSCR N, Z, C, V flags // to APSR. 
-let Defs = [CPSR], Uses = [FPSCR], Rt = 0b1111 /* apsr_nzcv */ in +let Defs = [CPSR], Uses = [FPSCR_NZCV], Rt = 0b1111 /* apsr_nzcv */ in def FMSTAT : MovFromVFP<0b0001 /* fpscr */, (outs), (ins), - "vmrs", "\tapsr_nzcv, fpscr", [(arm_fmstat)]>; + "vmrs", "\tAPSR_nzcv, fpscr", [(arm_fmstat)]>; // Application level FPSCR -> GPR let hasSideEffects = 1, Uses = [FPSCR] in @@ -1182,6 +1212,10 @@ let Uses = [FPSCR] in { "vmrs", "\t$Rt, fpexc", []>; def VMRS_FPSID : MovFromVFP<0b0000 /* fpsid */, (outs GPR:$Rt), (ins), "vmrs", "\t$Rt, fpsid", []>; + def VMRS_MVFR0 : MovFromVFP<0b0111 /* mvfr0 */, (outs GPR:$Rt), (ins), + "vmrs", "\t$Rt, mvfr0", []>; + def VMRS_MVFR1 : MovFromVFP<0b0110 /* mvfr1 */, (outs GPR:$Rt), (ins), + "vmrs", "\t$Rt, mvfr1", []>; } //===----------------------------------------------------------------------===// @@ -1304,6 +1338,13 @@ def : VFP2MnemonicAlias<"fcmps", "vcmp.f32">; def : VFP2MnemonicAlias<"fcmpd", "vcmp.f64">; def : VFP2MnemonicAlias<"fdivs", "vdiv.f32">; def : VFP2MnemonicAlias<"fdivd", "vdiv.f64">; +def : VFP2MnemonicAlias<"fmrx", "vmrs">; +def : VFP2MnemonicAlias<"fmxr", "vmsr">; + +// Be friendly and accept the old form of zero-compare +def : VFP2InstAlias<"fcmpzd${p} $val", (VCMPZD DPR:$val, pred:$p)>; +def : VFP2InstAlias<"fcmpzs${p} $val", (VCMPZS SPR:$val, pred:$p)>; + def : VFP2InstAlias<"fmstat${p}", (FMSTAT pred:$p)>; def : VFP2InstAlias<"fadds${p} $Sd, $Sn, $Sm", diff --git a/lib/Target/ARM/ARMJITInfo.cpp b/lib/Target/ARM/ARMJITInfo.cpp index afbe0e4..753e578 100644 --- a/lib/Target/ARM/ARMJITInfo.cpp +++ b/lib/Target/ARM/ARMJITInfo.cpp @@ -62,7 +62,7 @@ extern "C" { // concerned, so we can't just preserve the callee saved regs. "stmdb sp!, {r0, r1, r2, r3, lr}\n" #if (defined(__VFP_FP__) && !defined(__SOFTFP__)) - "fstmfdd sp!, {d0, d1, d2, d3, d4, d5, d6, d7}\n" + "vstmdb sp!, {d0, d1, d2, d3, d4, d5, d6, d7}\n" #endif // The LR contains the address of the stub function on entry. // pass it as the argument to the C part of the callback @@ -86,7 +86,7 @@ extern "C" { // #if (defined(__VFP_FP__) && !defined(__SOFTFP__)) // Restore VFP caller-saved registers. 
- "fldmfdd sp!, {d0, d1, d2, d3, d4, d5, d6, d7}\n" + "vldmia sp!, {d0, d1, d2, d3, d4, d5, d6, d7}\n" #endif // // We need to exchange the values in slots 0 and 1 so we can diff --git a/lib/Target/ARM/ARMRegisterInfo.cpp b/lib/Target/ARM/ARMRegisterInfo.cpp index 1f83762..6f3819a 100644 --- a/lib/Target/ARM/ARMRegisterInfo.cpp +++ b/lib/Target/ARM/ARMRegisterInfo.cpp @@ -11,9 +11,9 @@ // //===----------------------------------------------------------------------===// +#include "ARMRegisterInfo.h" #include "ARM.h" #include "ARMBaseInstrInfo.h" -#include "ARMRegisterInfo.h" using namespace llvm; void ARMRegisterInfo::anchor() { } diff --git a/lib/Target/ARM/ARMRegisterInfo.h b/lib/Target/ARM/ARMRegisterInfo.h index 65ed95d..8a24842 100644 --- a/lib/Target/ARM/ARMRegisterInfo.h +++ b/lib/Target/ARM/ARMRegisterInfo.h @@ -15,13 +15,12 @@ #define ARMREGISTERINFO_H #include "ARM.h" -#include "llvm/Target/TargetRegisterInfo.h" #include "ARMBaseRegisterInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" namespace llvm { class ARMSubtarget; class ARMBaseInstrInfo; - class Type; struct ARMRegisterInfo : public ARMBaseRegisterInfo { virtual void anchor(); diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td index b16a12c..1327fb8 100644 --- a/lib/Target/ARM/ARMRegisterInfo.td +++ b/lib/Target/ARM/ARMRegisterInfo.td @@ -153,14 +153,21 @@ def Q15 : ARMReg<15, "q15", [D30, D31]>; } // Current Program Status Register. -def CPSR : ARMReg<0, "cpsr">; -def APSR : ARMReg<1, "apsr">; -def SPSR : ARMReg<2, "spsr">; -def FPSCR : ARMReg<3, "fpscr">; -def ITSTATE : ARMReg<4, "itstate">; +// We model fpscr with two registers: FPSCR models the control bits and will be +// reserved. FPSCR_NZCV models the flag bits and will be unreserved. +def CPSR : ARMReg<0, "cpsr">; +def APSR : ARMReg<1, "apsr">; +def SPSR : ARMReg<2, "spsr">; +def FPSCR : ARMReg<3, "fpscr">; +def FPSCR_NZCV : ARMReg<3, "fpscr_nzcv"> { + let Aliases = [FPSCR]; +} +def ITSTATE : ARMReg<4, "itstate">; // Special Registers - only available in privileged mode. def FPSID : ARMReg<0, "fpsid">; +def MVFR1 : ARMReg<6, "mvfr1">; +def MVFR0 : ARMReg<7, "mvfr0">; def FPEXC : ARMReg<8, "fpexc">; // Register classes. @@ -304,7 +311,8 @@ def TuplesOE2D : RegisterTuples<[dsub_0, dsub_1], // Register class representing a pair of consecutive D registers. // Use the Q registers for the even-odd pairs. -def DPair : RegisterClass<"ARM", [v2i64], 128, (interleave QPR, TuplesOE2D)> { +def DPair : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + 128, (interleave QPR, TuplesOE2D)> { // Allocate starting at non-VFP2 registers D16-D31 first. 
let AltOrders = [(rotl DPair, 16)]; let AltOrderSelect = [{ return 1; }]; diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 2045482..911eb13 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -1101,13 +1101,8 @@ public: return VectorList.Count == 4; } - bool isVecListTwoQ() const { - if (!isDoubleSpacedVectorList()) return false; - return VectorList.Count == 2; - } - bool isVecListDPairSpaced() const { - if (!isSingleSpacedVectorList()) return false; + if (isSingleSpacedVectorList()) return false; return (ARMMCRegisterClasses[ARM::DPairSpcRegClassID] .contains(VectorList.RegNum)); } @@ -1133,12 +1128,13 @@ public: return VectorList.Count == 1; } - bool isVecListTwoDAllLanes() const { + bool isVecListDPairAllLanes() const { if (!isSingleSpacedVectorAllLanes()) return false; - return VectorList.Count == 2; + return (ARMMCRegisterClasses[ARM::DPairRegClassID] + .contains(VectorList.RegNum)); } - bool isVecListTwoQAllLanes() const { + bool isVecListDPairSpacedAllLanes() const { if (!isDoubleSpacedVectorAllLanes()) return false; return VectorList.Count == 2; } @@ -2858,8 +2854,12 @@ parseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { if (!RC->contains(Reg)) return Error(RegLoc, "invalid register in register list"); // List must be monotonically increasing. - if (getARMRegisterNumbering(Reg) < getARMRegisterNumbering(OldReg)) - return Error(RegLoc, "register list not in ascending order"); + if (getARMRegisterNumbering(Reg) < getARMRegisterNumbering(OldReg)) { + if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg)) + Warning(RegLoc, "register list not in ascending order"); + else + return Error(RegLoc, "register list not in ascending order"); + } if (getARMRegisterNumbering(Reg) == getARMRegisterNumbering(OldReg)) { Warning(RegLoc, "duplicated register (" + RegTok.getString() + ") in register list"); @@ -2905,6 +2905,12 @@ parseVectorLane(VectorLaneTy &LaneKind, unsigned &Index) { Parser.Lex(); // Eat the ']'. return MatchOperand_Success; } + + // There's an optional '#' token here. Normally there wouldn't be, but + // inline assemble puts one in, and it's friendly to accept that. + if (Parser.getTok().is(AsmToken::Hash)) + Parser.Lex(); // Eat the '#' + const MCExpr *LaneIndex; SMLoc Loc = Parser.getTok().getLoc(); if (getParser().ParseExpression(LaneIndex)) { @@ -2981,12 +2987,13 @@ parseVectorList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { case NoLanes: E = Parser.getTok().getLoc(); Reg = MRI->getMatchingSuperReg(Reg, ARM::dsub_0, - &ARMMCRegisterClasses[ARM::DPairRegClassID]); - + &ARMMCRegisterClasses[ARM::DPairRegClassID]); Operands.push_back(ARMOperand::CreateVectorList(Reg, 2, false, S, E)); break; case AllLanes: E = Parser.getTok().getLoc(); + Reg = MRI->getMatchingSuperReg(Reg, ARM::dsub_0, + &ARMMCRegisterClasses[ARM::DPairRegClassID]); Operands.push_back(ARMOperand::CreateVectorListAllLanes(Reg, 2, false, S, E)); break; @@ -3152,7 +3159,7 @@ parseVectorList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { switch (LaneKind) { case NoLanes: - // Non-lane two-register operands have been converted to the + // Two-register operands have been converted to the // composite register classes. if (Count == 2) { const MCRegisterClass *RC = (Spacing == 1) ? 
@@ -3165,6 +3172,14 @@ parseVectorList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { (Spacing == 2), S, E)); break; case AllLanes: + // Two-register operands have been converted to the + // composite register classes. + if (Count == 2) { + const MCRegisterClass *RC = (Spacing == 1) ? + &ARMMCRegisterClasses[ARM::DPairRegClassID] : + &ARMMCRegisterClasses[ARM::DPairSpcRegClassID]; + FirstReg = MRI->getMatchingSuperReg(FirstReg, ARM::dsub_0, RC); + } Operands.push_back(ARMOperand::CreateVectorListAllLanes(FirstReg, Count, (Spacing == 2), S, E)); @@ -3253,7 +3268,8 @@ parseMSRMaskOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { if (isMClass()) { // See ARMv6-M 10.1.1 - unsigned FlagsVal = StringSwitch<unsigned>(Mask) + std::string Name = Mask.lower(); + unsigned FlagsVal = StringSwitch<unsigned>(Name) .Case("apsr", 0) .Case("iapsr", 1) .Case("eapsr", 2) @@ -4427,10 +4443,11 @@ bool ARMAsmParser::parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands, else if (Res == -1) // irrecoverable error return true; // If this is VMRS, check for the apsr_nzcv operand. - if (Mnemonic == "vmrs" && Parser.getTok().getString() == "apsr_nzcv") { + if (Mnemonic == "vmrs" && + Parser.getTok().getString().equals_lower("apsr_nzcv")) { S = Parser.getTok().getLoc(); Parser.Lex(); - Operands.push_back(ARMOperand::CreateToken("apsr_nzcv", S)); + Operands.push_back(ARMOperand::CreateToken("APSR_nzcv", S)); return false; } @@ -4598,7 +4615,7 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic, Mnemonic == "vrsqrts" || Mnemonic == "srs" || Mnemonic == "flds" || Mnemonic == "fmrs" || Mnemonic == "fsqrts" || Mnemonic == "fsubs" || Mnemonic == "fsts" || Mnemonic == "fcpys" || Mnemonic == "fdivs" || - Mnemonic == "fmuls" || Mnemonic == "fcmps" || + Mnemonic == "fmuls" || Mnemonic == "fcmps" || Mnemonic == "fcmpzs" || (Mnemonic == "movs" && isThumb()))) { Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 1); CarrySetting = true; diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 4101f59..ce4587b 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -849,7 +849,7 @@ extern "C" void LLVMInitializeARMDisassembler() { createThumbDisassembler); } -static const unsigned GPRDecoderTable[] = { +static const uint16_t GPRDecoderTable[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3, ARM::R4, ARM::R5, ARM::R6, ARM::R7, ARM::R8, ARM::R9, ARM::R10, ARM::R11, @@ -869,8 +869,14 @@ static DecodeStatus DecodeGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, static DecodeStatus DecodeGPRnopcRegisterClass(llvm::MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { - if (RegNo == 15) return MCDisassembler::Fail; - return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder); + DecodeStatus S = MCDisassembler::Success; + + if (RegNo == 15) + S = MCDisassembler::SoftFail; + + Check(S, DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder)); + + return S; } static DecodeStatus DecodetGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, @@ -916,7 +922,7 @@ static DecodeStatus DecoderGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder); } -static const unsigned SPRDecoderTable[] = { +static const uint16_t SPRDecoderTable[] = { ARM::S0, ARM::S1, ARM::S2, ARM::S3, ARM::S4, ARM::S5, ARM::S6, ARM::S7, ARM::S8, ARM::S9, ARM::S10, ARM::S11, @@ -937,7 +943,7 @@ static DecodeStatus DecodeSPRRegisterClass(llvm::MCInst &Inst, unsigned 
RegNo, return MCDisassembler::Success; } -static const unsigned DPRDecoderTable[] = { +static const uint16_t DPRDecoderTable[] = { ARM::D0, ARM::D1, ARM::D2, ARM::D3, ARM::D4, ARM::D5, ARM::D6, ARM::D7, ARM::D8, ARM::D9, ARM::D10, ARM::D11, @@ -973,7 +979,7 @@ DecodeDPR_VFP2RegisterClass(llvm::MCInst &Inst, unsigned RegNo, return DecodeDPRRegisterClass(Inst, RegNo, Address, Decoder); } -static const unsigned QPRDecoderTable[] = { +static const uint16_t QPRDecoderTable[] = { ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3, ARM::Q4, ARM::Q5, ARM::Q6, ARM::Q7, ARM::Q8, ARM::Q9, ARM::Q10, ARM::Q11, @@ -992,7 +998,7 @@ static DecodeStatus DecodeQPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, return MCDisassembler::Success; } -static const unsigned DPairDecoderTable[] = { +static const uint16_t DPairDecoderTable[] = { ARM::Q0, ARM::D1_D2, ARM::Q1, ARM::D3_D4, ARM::Q2, ARM::D5_D6, ARM::Q3, ARM::D7_D8, ARM::Q4, ARM::D9_D10, ARM::Q5, ARM::D11_D12, ARM::Q6, ARM::D13_D14, ARM::Q7, ARM::D15_D16, ARM::Q8, ARM::D17_D18, @@ -1011,7 +1017,7 @@ static DecodeStatus DecodeDPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo, return MCDisassembler::Success; } -static const unsigned DPairSpacedDecoderTable[] = { +static const uint16_t DPairSpacedDecoderTable[] = { ARM::D0_D2, ARM::D1_D3, ARM::D2_D4, ARM::D3_D5, ARM::D4_D6, ARM::D5_D7, ARM::D6_D8, ARM::D7_D9, ARM::D8_D10, ARM::D9_D11, ARM::D10_D12, ARM::D11_D13, @@ -2001,27 +2007,15 @@ static DecodeStatus DecodeVLDInstruction(llvm::MCInst &Inst, unsigned Insn, // First output register switch (Inst.getOpcode()) { - case ARM::VLD1q16: - case ARM::VLD1q32: - case ARM::VLD1q64: - case ARM::VLD1q8: - case ARM::VLD1q16wb_fixed: - case ARM::VLD1q16wb_register: - case ARM::VLD1q32wb_fixed: - case ARM::VLD1q32wb_register: - case ARM::VLD1q64wb_fixed: - case ARM::VLD1q64wb_register: - case ARM::VLD1q8wb_fixed: - case ARM::VLD1q8wb_register: - case ARM::VLD2d16: - case ARM::VLD2d32: - case ARM::VLD2d8: - case ARM::VLD2d16wb_fixed: - case ARM::VLD2d16wb_register: - case ARM::VLD2d32wb_fixed: - case ARM::VLD2d32wb_register: - case ARM::VLD2d8wb_fixed: - case ARM::VLD2d8wb_register: + case ARM::VLD1q16: case ARM::VLD1q32: case ARM::VLD1q64: case ARM::VLD1q8: + case ARM::VLD1q16wb_fixed: case ARM::VLD1q16wb_register: + case ARM::VLD1q32wb_fixed: case ARM::VLD1q32wb_register: + case ARM::VLD1q64wb_fixed: case ARM::VLD1q64wb_register: + case ARM::VLD1q8wb_fixed: case ARM::VLD1q8wb_register: + case ARM::VLD2d16: case ARM::VLD2d32: case ARM::VLD2d8: + case ARM::VLD2d16wb_fixed: case ARM::VLD2d16wb_register: + case ARM::VLD2d32wb_fixed: case ARM::VLD2d32wb_register: + case ARM::VLD2d8wb_fixed: case ARM::VLD2d8wb_register: if (!Check(S, DecodeDPairRegisterClass(Inst, Rd, Address, Decoder))) return MCDisassembler::Fail; break; @@ -2325,6 +2319,8 @@ static DecodeStatus DecodeVSTInstruction(llvm::MCInst &Inst, unsigned Insn, case ARM::VST2b8wb_register: case ARM::VST2b16wb_register: case ARM::VST2b32wb_register: + Inst.addOperand(MCOperand::CreateImm(0)); + break; case ARM::VST3d8_UPD: case ARM::VST3d16_UPD: case ARM::VST3d32_UPD: @@ -2366,6 +2362,23 @@ static DecodeStatus DecodeVSTInstruction(llvm::MCInst &Inst, unsigned Insn, case ARM::VST1q16wb_fixed: case ARM::VST1q32wb_fixed: case ARM::VST1q64wb_fixed: + case ARM::VST1d8Twb_fixed: + case ARM::VST1d16Twb_fixed: + case ARM::VST1d32Twb_fixed: + case ARM::VST1d64Twb_fixed: + case ARM::VST1d8Qwb_fixed: + case ARM::VST1d16Qwb_fixed: + case ARM::VST1d32Qwb_fixed: + case ARM::VST1d64Qwb_fixed: + case ARM::VST2d8wb_fixed: + case 
ARM::VST2d16wb_fixed: + case ARM::VST2d32wb_fixed: + case ARM::VST2q8wb_fixed: + case ARM::VST2q16wb_fixed: + case ARM::VST2q32wb_fixed: + case ARM::VST2b8wb_fixed: + case ARM::VST2b16wb_fixed: + case ARM::VST2b32wb_fixed: break; } @@ -2525,8 +2538,19 @@ static DecodeStatus DecodeVLD1DupInstruction(llvm::MCInst &Inst, unsigned Insn, align *= (1 << size); - if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) - return MCDisassembler::Fail; + switch (Inst.getOpcode()) { + case ARM::VLD1DUPq16: case ARM::VLD1DUPq32: case ARM::VLD1DUPq8: + case ARM::VLD1DUPq16wb_fixed: case ARM::VLD1DUPq16wb_register: + case ARM::VLD1DUPq32wb_fixed: case ARM::VLD1DUPq32wb_register: + case ARM::VLD1DUPq8wb_fixed: case ARM::VLD1DUPq8wb_register: + if (!Check(S, DecodeDPairRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + break; + } if (Rm != 0xF) { if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) return MCDisassembler::Fail; @@ -2556,18 +2580,33 @@ static DecodeStatus DecodeVLD2DupInstruction(llvm::MCInst &Inst, unsigned Insn, unsigned Rm = fieldFromInstruction32(Insn, 0, 4); unsigned align = fieldFromInstruction32(Insn, 4, 1); unsigned size = 1 << fieldFromInstruction32(Insn, 6, 2); - unsigned inc = fieldFromInstruction32(Insn, 5, 1) + 1; + unsigned pred = fieldFromInstruction32(Insn, 22, 4); align *= 2*size; - if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) - return MCDisassembler::Fail; - if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+inc)%32, Address, Decoder))) - return MCDisassembler::Fail; - if (Rm != 0xF) { - if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + switch (Inst.getOpcode()) { + case ARM::VLD2DUPd16: case ARM::VLD2DUPd32: case ARM::VLD2DUPd8: + case ARM::VLD2DUPd16wb_fixed: case ARM::VLD2DUPd16wb_register: + case ARM::VLD2DUPd32wb_fixed: case ARM::VLD2DUPd32wb_register: + case ARM::VLD2DUPd8wb_fixed: case ARM::VLD2DUPd8wb_register: + if (!Check(S, DecodeDPairRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + break; + case ARM::VLD2DUPd16x2: case ARM::VLD2DUPd32x2: case ARM::VLD2DUPd8x2: + case ARM::VLD2DUPd16x2wb_fixed: case ARM::VLD2DUPd16x2wb_register: + case ARM::VLD2DUPd32x2wb_fixed: case ARM::VLD2DUPd32x2wb_register: + case ARM::VLD2DUPd8x2wb_fixed: case ARM::VLD2DUPd8x2wb_register: + if (!Check(S, DecodeDPairSpacedRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder))) return MCDisassembler::Fail; + break; } + if (Rm != 0xF) + Inst.addOperand(MCOperand::CreateImm(0)); + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) return MCDisassembler::Fail; Inst.addOperand(MCOperand::CreateImm(align)); @@ -2579,6 +2618,9 @@ static DecodeStatus DecodeVLD2DupInstruction(llvm::MCInst &Inst, unsigned Insn, return MCDisassembler::Fail; } + if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) + return MCDisassembler::Fail; + return S; } diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp index bae4e78..2b994df 100644 --- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp +++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp @@ -1026,15 +1026,6 @@ void ARMInstPrinter::printVectorListOne(const MCInst *MI, unsigned OpNum, } void ARMInstPrinter::printVectorListTwo(const MCInst *MI, 
unsigned OpNum, - raw_ostream &O) { - // Normally, it's not safe to use register enum values directly with - // addition to get the next register, but for VFP registers, the - // sort order is guaranteed because they're all of the form D<n>. - O << "{" << getRegisterName(MI->getOperand(OpNum).getReg()) << ", " - << getRegisterName(MI->getOperand(OpNum).getReg() + 1) << "}"; -} - -void ARMInstPrinter::printVectorListDPair(const MCInst *MI, unsigned OpNum, raw_ostream &O) { unsigned Reg = MI->getOperand(OpNum).getReg(); unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0); @@ -1042,9 +1033,9 @@ void ARMInstPrinter::printVectorListDPair(const MCInst *MI, unsigned OpNum, O << "{" << getRegisterName(Reg0) << ", " << getRegisterName(Reg1) << "}"; } -void ARMInstPrinter::printVectorListDPairSpaced(const MCInst *MI, - unsigned OpNum, - raw_ostream &O) { +void ARMInstPrinter::printVectorListTwoSpaced(const MCInst *MI, + unsigned OpNum, + raw_ostream &O) { unsigned Reg = MI->getOperand(OpNum).getReg(); unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0); unsigned Reg1 = MRI.getSubReg(Reg, ARM::dsub_2); @@ -1081,11 +1072,10 @@ void ARMInstPrinter::printVectorListOneAllLanes(const MCInst *MI, void ARMInstPrinter::printVectorListTwoAllLanes(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - // Normally, it's not safe to use register enum values directly with - // addition to get the next register, but for VFP registers, the - // sort order is guaranteed because they're all of the form D<n>. - O << "{" << getRegisterName(MI->getOperand(OpNum).getReg()) << "[], " - << getRegisterName(MI->getOperand(OpNum).getReg() + 1) << "[]}"; + unsigned Reg = MI->getOperand(OpNum).getReg(); + unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0); + unsigned Reg1 = MRI.getSubReg(Reg, ARM::dsub_1); + O << "{" << getRegisterName(Reg0) << "[], " << getRegisterName(Reg1) << "[]}"; } void ARMInstPrinter::printVectorListThreeAllLanes(const MCInst *MI, @@ -1111,23 +1101,13 @@ void ARMInstPrinter::printVectorListFourAllLanes(const MCInst *MI, << getRegisterName(MI->getOperand(OpNum).getReg() + 3) << "[]}"; } -void ARMInstPrinter::printVectorListTwoSpaced(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - // Normally, it's not safe to use register enum values directly with - // addition to get the next register, but for VFP registers, the - // sort order is guaranteed because they're all of the form D<n>. - O << "{" << getRegisterName(MI->getOperand(OpNum).getReg()) << ", " - << getRegisterName(MI->getOperand(OpNum).getReg() + 2) << "}"; -} - void ARMInstPrinter::printVectorListTwoSpacedAllLanes(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - // Normally, it's not safe to use register enum values directly with - // addition to get the next register, but for VFP registers, the - // sort order is guaranteed because they're all of the form D<n>. 
- O << "{" << getRegisterName(MI->getOperand(OpNum).getReg()) << "[], " - << getRegisterName(MI->getOperand(OpNum).getReg() + 2) << "[]}"; + unsigned Reg = MI->getOperand(OpNum).getReg(); + unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0); + unsigned Reg1 = MRI.getSubReg(Reg, ARM::dsub_2); + O << "{" << getRegisterName(Reg0) << "[], " << getRegisterName(Reg1) << "[]}"; } void ARMInstPrinter::printVectorListThreeSpacedAllLanes(const MCInst *MI, diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h index 1037161..e9cd407 100644 --- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h +++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h @@ -134,9 +134,8 @@ public: void printVectorIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printVectorListOne(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printVectorListTwo(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printVectorListDPair(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printVectorListDPairSpaced(const MCInst *MI, unsigned OpNum, - raw_ostream &O); + void printVectorListTwoSpaced(const MCInst *MI, unsigned OpNum, + raw_ostream &O); void printVectorListThree(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printVectorListFour(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printVectorListOneAllLanes(const MCInst *MI, unsigned OpNum, @@ -147,8 +146,6 @@ public: raw_ostream &O); void printVectorListFourAllLanes(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printVectorListTwoSpaced(const MCInst *MI, unsigned OpNum, - raw_ostream &O); void printVectorListTwoSpacedAllLanes(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printVectorListThreeSpacedAllLanes(const MCInst *MI, unsigned OpNum, diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index d3a3d3a..25849ee 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -167,6 +167,7 @@ static unsigned getRelaxedOpcode(unsigned Op) { case ARM::tBcc: return ARM::t2Bcc; case ARM::tLDRpciASM: return ARM::t2LDRpci; case ARM::tADR: return ARM::t2ADR; + case ARM::tB: return ARM::t2B; } } @@ -181,6 +182,16 @@ bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, const MCInstFragment *DF, const MCAsmLayout &Layout) const { switch ((unsigned)Fixup.getKind()) { + case ARM::fixup_arm_thumb_br: { + // Relaxing tB to t2B. tB has a signed 12-bit displacement with the + // low bit being an implied zero. There's an implied +4 offset for the + // branch, so we adjust the other way here to determine what's + // encodable. + // + // Relax if the value is too big for a (signed) i8. + int64_t Offset = int64_t(Value) - 4; + return Offset > 2046 || Offset < -2048; + } case ARM::fixup_arm_thumb_bcc: { // Relaxing tBcc to t2Bcc. tBcc has a signed 9-bit displacement with the // low bit being an implied zero. There's an implied +4 offset for the diff --git a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h index 06eb4e5..ae11be8 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h +++ b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h @@ -187,21 +187,37 @@ inline static unsigned getARMRegisterNumbering(unsigned Reg) { case S31: case D31: return 31; // Composite registers use the regnum of the first register in the list. 
- case D1_D2: return 1; - case D3_D5: return 3; - case D5_D7: return 5; - case D7_D9: return 7; - case D9_D10: return 9; - case D11_D12: return 11; - case D13_D14: return 13; - case D15_D16: return 15; - case D17_D18: return 17; - case D19_D20: return 19; - case D21_D22: return 21; - case D23_D24: return 23; - case D25_D26: return 25; - case D27_D28: return 27; - case D29_D30: return 29; + /* Q0 */ case D0_D2: return 0; + case D1_D2: case D1_D3: return 1; + /* Q1 */ case D2_D4: return 2; + case D3_D4: case D3_D5: return 3; + /* Q2 */ case D4_D6: return 4; + case D5_D6: case D5_D7: return 5; + /* Q3 */ case D6_D8: return 6; + case D7_D8: case D7_D9: return 7; + /* Q4 */ case D8_D10: return 8; + case D9_D10: case D9_D11: return 9; + /* Q5 */ case D10_D12: return 10; + case D11_D12: case D11_D13: return 11; + /* Q6 */ case D12_D14: return 12; + case D13_D14: case D13_D15: return 13; + /* Q7 */ case D14_D16: return 14; + case D15_D16: case D15_D17: return 15; + /* Q8 */ case D16_D18: return 16; + case D17_D18: case D17_D19: return 17; + /* Q9 */ case D18_D20: return 18; + case D19_D20: case D19_D21: return 19; + /* Q10 */ case D20_D22: return 20; + case D21_D22: case D21_D23: return 21; + /* Q11 */ case D22_D24: return 22; + case D23_D24: case D23_D25: return 23; + /* Q12 */ case D24_D26: return 24; + case D25_D26: case D25_D27: return 25; + /* Q13 */ case D26_D28: return 26; + case D27_D28: case D27_D29: return 27; + /* Q14 */ case D28_D30: return 28; + case D29_D30: case D29_D31: return 29; + /* Q15 */ } } diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp index 1606b92..ed27f9f 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -151,13 +151,13 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT, Triple TheTriple(TT); if (TheTriple.isOSDarwin()) - return createMachOStreamer(Ctx, MAB, OS, Emitter, RelaxAll); + return createMachOStreamer(Ctx, MAB, OS, Emitter, false); if (TheTriple.isOSWindows()) { llvm_unreachable("ARM does not support Windows COFF format"); } - return createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll, NoExecStack); + return createELFStreamer(Ctx, MAB, OS, Emitter, false, NoExecStack); } static MCInstPrinter *createARMMCInstPrinter(const Target &T, diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp index faf73ac..9d3da14 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -34,12 +34,12 @@ class ARMMachObjectWriter : public MCMachObjectTargetWriter { MCValue Target, unsigned Log2Size, uint64_t &FixedValue); - void RecordARMMovwMovtRelocation(MachObjectWriter *Writer, - const MCAssembler &Asm, - const MCAsmLayout &Layout, - const MCFragment *Fragment, - const MCFixup &Fixup, MCValue Target, - uint64_t &FixedValue); + void RecordARMScatteredHalfRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, MCValue Target, + uint64_t &FixedValue); public: ARMMachObjectWriter(bool Is64Bit, uint32_t CPUType, @@ -102,34 +102,47 @@ static bool getARMFixupKindMachOInfo(unsigned Kind, unsigned &RelocType, Log2Size = llvm::Log2_32(4); return true; + // For movw/movt r_type relocations they always have a pair following them and + // the r_length bits are used differently. 
The encoding of the r_length is as + // follows: + // low bit of r_length: + // 0 - :lower16: for movw instructions + // 1 - :upper16: for movt instructions + // high bit of r_length: + // 0 - arm instructions + // 1 - thumb instructions case ARM::fixup_arm_movt_hi16: case ARM::fixup_arm_movt_hi16_pcrel: + RelocType = unsigned(macho::RIT_ARM_Half); + Log2Size = 1; + return true; case ARM::fixup_t2_movt_hi16: case ARM::fixup_t2_movt_hi16_pcrel: - RelocType = unsigned(macho::RIT_ARM_HalfDifference); - // Report as 'long', even though that is not quite accurate. - Log2Size = llvm::Log2_32(4); + RelocType = unsigned(macho::RIT_ARM_Half); + Log2Size = 3; return true; case ARM::fixup_arm_movw_lo16: case ARM::fixup_arm_movw_lo16_pcrel: + RelocType = unsigned(macho::RIT_ARM_Half); + Log2Size = 0; + return true; case ARM::fixup_t2_movw_lo16: case ARM::fixup_t2_movw_lo16_pcrel: RelocType = unsigned(macho::RIT_ARM_Half); - // Report as 'long', even though that is not quite accurate. - Log2Size = llvm::Log2_32(4); + Log2Size = 2; return true; } } void ARMMachObjectWriter:: -RecordARMMovwMovtRelocation(MachObjectWriter *Writer, - const MCAssembler &Asm, - const MCAsmLayout &Layout, - const MCFragment *Fragment, - const MCFixup &Fixup, - MCValue Target, - uint64_t &FixedValue) { +RecordARMScatteredHalfRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + uint64_t &FixedValue) { uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); unsigned Type = macho::RIT_ARM_Half; @@ -313,10 +326,9 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer, // scattered relocation entry. Differences always require scattered // relocations. if (Target.getSymB()) { - if (RelocType == macho::RIT_ARM_Half || - RelocType == macho::RIT_ARM_HalfDifference) - return RecordARMMovwMovtRelocation(Writer, Asm, Layout, Fragment, Fixup, - Target, FixedValue); + if (RelocType == macho::RIT_ARM_Half) + return RecordARMScatteredHalfRelocation(Writer, Asm, Layout, Fragment, + Fixup, Target, FixedValue); return RecordARMScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, Target, Log2Size, FixedValue); } @@ -391,6 +403,30 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer, (Log2Size << 25) | (IsExtern << 27) | (Type << 28)); + + // Even when it's not a scattered relocation, movw/movt always uses + // a PAIR relocation. + if (Type == macho::RIT_ARM_Half) { + // The other-half value only gets populated for the movt relocation. 
+ uint32_t Value = 0; + switch ((unsigned)Fixup.getKind()) { + default: break; + case ARM::fixup_arm_movt_hi16: + case ARM::fixup_arm_movt_hi16_pcrel: + case ARM::fixup_t2_movt_hi16: + case ARM::fixup_t2_movt_hi16_pcrel: + Value = FixedValue; + break; + } + macho::RelocationEntry MREPair; + MREPair.Word0 = Value; + MREPair.Word1 = ((0xffffff) | + (Log2Size << 25) | + (macho::RIT_Pair << 28)); + + Writer->addRelocation(Fragment->getParent(), MREPair); + } + Writer->addRelocation(Fragment->getParent(), MRE); } diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp index a89a663..edd73c2 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "Thumb1FrameLowering.h" -#include "ARMBaseInstrInfo.h" #include "ARMMachineFunctionInfo.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp index adaccdd..8cf7cac 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -13,7 +13,6 @@ #include "Thumb1InstrInfo.h" #include "ARM.h" -#include "ARMMachineFunctionInfo.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" diff --git a/lib/Target/ARM/Thumb1InstrInfo.h b/lib/Target/ARM/Thumb1InstrInfo.h index 4d97626..27fce9b 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.h +++ b/lib/Target/ARM/Thumb1InstrInfo.h @@ -14,10 +14,10 @@ #ifndef THUMB1INSTRUCTIONINFO_H #define THUMB1INSTRUCTIONINFO_H -#include "llvm/Target/TargetInstrInfo.h" #include "ARM.h" -#include "ARMInstrInfo.h" +#include "ARMBaseInstrInfo.h" #include "Thumb1RegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" namespace llvm { class ARMSubtarget; diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp index 6b8bf0e..ef77bbd 100644 --- a/lib/Target/ARM/Thumb1RegisterInfo.cpp +++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp @@ -12,12 +12,11 @@ // //===----------------------------------------------------------------------===// +#include "Thumb1RegisterInfo.h" #include "ARM.h" #include "ARMBaseInstrInfo.h" #include "ARMMachineFunctionInfo.h" #include "ARMSubtarget.h" -#include "Thumb1InstrInfo.h" -#include "Thumb1RegisterInfo.h" #include "MCTargetDesc/ARMAddressingModes.h" #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" diff --git a/lib/Target/ARM/Thumb1RegisterInfo.h b/lib/Target/ARM/Thumb1RegisterInfo.h index 9060e59..6971842 100644 --- a/lib/Target/ARM/Thumb1RegisterInfo.h +++ b/lib/Target/ARM/Thumb1RegisterInfo.h @@ -16,13 +16,12 @@ #define THUMB1REGISTERINFO_H #include "ARM.h" -#include "ARMRegisterInfo.h" +#include "ARMBaseRegisterInfo.h" #include "llvm/Target/TargetRegisterInfo.h" namespace llvm { class ARMSubtarget; class ARMBaseInstrInfo; - class Type; struct Thumb1RegisterInfo : public ARMBaseRegisterInfo { public: diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp index 6cb182a..2fe4b85 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -15,7 +15,6 @@ #include "ARM.h" #include "ARMConstantPoolValue.h" #include "ARMMachineFunctionInfo.h" -#include "Thumb2InstrInfo.h" #include "MCTargetDesc/ARMAddressingModes.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" diff 
--git a/lib/Target/ARM/Thumb2InstrInfo.h b/lib/Target/ARM/Thumb2InstrInfo.h index a754649..1ae2ef1 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.h +++ b/lib/Target/ARM/Thumb2InstrInfo.h @@ -14,10 +14,10 @@ #ifndef THUMB2INSTRUCTIONINFO_H #define THUMB2INSTRUCTIONINFO_H -#include "llvm/Target/TargetInstrInfo.h" #include "ARM.h" #include "ARMInstrInfo.h" #include "Thumb2RegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" namespace llvm { class ARMSubtarget; diff --git a/lib/Target/ARM/Thumb2RegisterInfo.cpp b/lib/Target/ARM/Thumb2RegisterInfo.cpp index 6d210fe..29a87d0 100644 --- a/lib/Target/ARM/Thumb2RegisterInfo.cpp +++ b/lib/Target/ARM/Thumb2RegisterInfo.cpp @@ -12,10 +12,10 @@ // //===----------------------------------------------------------------------===// +#include "Thumb2RegisterInfo.h" #include "ARM.h" +#include "ARMBaseInstrInfo.h" #include "ARMSubtarget.h" -#include "Thumb2InstrInfo.h" -#include "Thumb2RegisterInfo.h" #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/Function.h" diff --git a/lib/Target/ARM/Thumb2RegisterInfo.h b/lib/Target/ARM/Thumb2RegisterInfo.h index 824378a..6b397e8 100644 --- a/lib/Target/ARM/Thumb2RegisterInfo.h +++ b/lib/Target/ARM/Thumb2RegisterInfo.h @@ -16,13 +16,12 @@ #define THUMB2REGISTERINFO_H #include "ARM.h" -#include "ARMRegisterInfo.h" +#include "ARMBaseRegisterInfo.h" #include "llvm/Target/TargetRegisterInfo.h" namespace llvm { class ARMSubtarget; class ARMBaseInstrInfo; - class Type; struct Thumb2RegisterInfo : public ARMBaseRegisterInfo { public: diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp index 5ee5f42..fb9d93b 100644 --- a/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -39,9 +39,9 @@ namespace { /// ReduceTable - A static table with information on mapping from wide /// opcodes to narrow struct ReduceEntry { - unsigned WideOpc; // Wide opcode - unsigned NarrowOpc1; // Narrow opcode to transform to - unsigned NarrowOpc2; // Narrow opcode when it's two-address + uint16_t WideOpc; // Wide opcode + uint16_t NarrowOpc1; // Narrow opcode to transform to + uint16_t NarrowOpc2; // Narrow opcode when it's two-address uint8_t Imm1Limit; // Limit of immediate field (bits) uint8_t Imm2Limit; // Limit of immediate field when it's two-address unsigned LowRegs1 : 1; // Only possible if low-registers are used @@ -189,7 +189,7 @@ Thumb2SizeReduce::Thumb2SizeReduce() : MachineFunctionPass(ID) { } static bool HasImplicitCPSRDef(const MCInstrDesc &MCID) { - for (const unsigned *Regs = MCID.ImplicitDefs; *Regs; ++Regs) + for (const uint16_t *Regs = MCID.getImplicitDefs(); *Regs; ++Regs) if (*Regs == ARM::CPSR) return true; return false; diff --git a/lib/Target/CBackend/CBackend.cpp b/lib/Target/CBackend/CBackend.cpp index 80973b7..b6b209e 100644 --- a/lib/Target/CBackend/CBackend.cpp +++ b/lib/Target/CBackend/CBackend.cpp @@ -2392,17 +2392,17 @@ void CWriter::visitSwitchInst(SwitchInst &SI) { printBranchToBlock(SI.getParent(), SI.getDefaultDest(), 2); Out << ";\n"; - unsigned NumCases = SI.getNumCases(); // Skip the first item since that's the default case. 
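The Thumb2SizeReduction hunk above narrows the three opcode fields of the static ReduceTable from unsigned to uint16_t (target opcodes comfortably fit in 16 bits), and HasImplicitCPSRDef now walks the implicit-def list through getImplicitDefs(), whose entries are uint16_t as well. A standalone illustration of what the narrowing buys for a static table; the struct names are made up:

    #include <cstdint>
    #include <cstdio>

    struct WideEntry   { unsigned WideOpc, NarrowOpc1, NarrowOpc2; uint8_t Imm1Limit, Imm2Limit; };
    struct PackedEntry { uint16_t WideOpc, NarrowOpc1, NarrowOpc2; uint8_t Imm1Limit, Imm2Limit; };

    int main() {
      // Each table row shrinks roughly by half once the opcode fields are 16-bit.
      std::printf("%u -> %u bytes per entry\n",
                  (unsigned)sizeof(WideEntry), (unsigned)sizeof(PackedEntry));
    }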
- for (unsigned i = 0; i < NumCases; ++i) { - ConstantInt* CaseVal = SI.getCaseValue(i); - BasicBlock* Succ = SI.getCaseSuccessor(i); + for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end(); i != e; ++i) { + ConstantInt* CaseVal = i.getCaseValue(); + BasicBlock* Succ = i.getCaseSuccessor(); Out << " case "; writeOperand(CaseVal); Out << ":\n"; printPHICopiesForSuccessor (SI.getParent(), Succ, 2); printBranchToBlock(SI.getParent(), Succ, 2); - if (Function::iterator(Succ) == llvm::next(Function::iterator(SI.getParent()))) + if (Function::iterator(Succ) == + llvm::next(Function::iterator(SI.getParent()))) Out << " break;\n"; } diff --git a/lib/Target/CellSPU/SPUFrameLowering.cpp b/lib/Target/CellSPU/SPUFrameLowering.cpp index 916f9ba..fac806e 100644 --- a/lib/Target/CellSPU/SPUFrameLowering.cpp +++ b/lib/Target/CellSPU/SPUFrameLowering.cpp @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#include "SPU.h" #include "SPUFrameLowering.h" +#include "SPU.h" #include "SPUInstrBuilder.h" #include "SPUInstrInfo.h" #include "llvm/Function.h" diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp index 3d2b32d..55b3f72 100644 --- a/lib/Target/CellSPU/SPUISelLowering.cpp +++ b/lib/Target/CellSPU/SPUISelLowering.cpp @@ -31,14 +31,10 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include <map> using namespace llvm; -// Used in getTargetNodeName() below namespace { - std::map<unsigned, const char *> node_names; - // Byte offset of the preferred slot (counted from the MSB) int prefslotOffset(EVT VT) { int retval=0; @@ -481,40 +477,34 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM) setSchedulingPreference(Sched::RegPressure); } -const char * -SPUTargetLowering::getTargetNodeName(unsigned Opcode) const -{ - if (node_names.empty()) { - node_names[(unsigned) SPUISD::RET_FLAG] = "SPUISD::RET_FLAG"; - node_names[(unsigned) SPUISD::Hi] = "SPUISD::Hi"; - node_names[(unsigned) SPUISD::Lo] = "SPUISD::Lo"; - node_names[(unsigned) SPUISD::PCRelAddr] = "SPUISD::PCRelAddr"; - node_names[(unsigned) SPUISD::AFormAddr] = "SPUISD::AFormAddr"; - node_names[(unsigned) SPUISD::IndirectAddr] = "SPUISD::IndirectAddr"; - node_names[(unsigned) SPUISD::LDRESULT] = "SPUISD::LDRESULT"; - node_names[(unsigned) SPUISD::CALL] = "SPUISD::CALL"; - node_names[(unsigned) SPUISD::SHUFB] = "SPUISD::SHUFB"; - node_names[(unsigned) SPUISD::SHUFFLE_MASK] = "SPUISD::SHUFFLE_MASK"; - node_names[(unsigned) SPUISD::CNTB] = "SPUISD::CNTB"; - node_names[(unsigned) SPUISD::PREFSLOT2VEC] = "SPUISD::PREFSLOT2VEC"; - node_names[(unsigned) SPUISD::VEC2PREFSLOT] = "SPUISD::VEC2PREFSLOT"; - node_names[(unsigned) SPUISD::SHL_BITS] = "SPUISD::SHL_BITS"; - node_names[(unsigned) SPUISD::SHL_BYTES] = "SPUISD::SHL_BYTES"; - node_names[(unsigned) SPUISD::VEC_ROTL] = "SPUISD::VEC_ROTL"; - node_names[(unsigned) SPUISD::VEC_ROTR] = "SPUISD::VEC_ROTR"; - node_names[(unsigned) SPUISD::ROTBYTES_LEFT] = "SPUISD::ROTBYTES_LEFT"; - node_names[(unsigned) SPUISD::ROTBYTES_LEFT_BITS] = - "SPUISD::ROTBYTES_LEFT_BITS"; - node_names[(unsigned) SPUISD::SELECT_MASK] = "SPUISD::SELECT_MASK"; - node_names[(unsigned) SPUISD::SELB] = "SPUISD::SELB"; - node_names[(unsigned) SPUISD::ADD64_MARKER] = "SPUISD::ADD64_MARKER"; - node_names[(unsigned) SPUISD::SUB64_MARKER] = "SPUISD::SUB64_MARKER"; - node_names[(unsigned) SPUISD::MUL64_MARKER] = "SPUISD::MUL64_MARKER"; - } - - std::map<unsigned, const char 
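The C backend hunk replaces numeric case indexing, where index 0 silently meant the first non-default case, with SwitchInst's case iterators, so the default case never has to be skipped by hand. A condensed sketch of the same idiom against the 3.1-era headers used in this tree; the helper itself is illustrative, not part of the patch:

    #include "llvm/Instructions.h"
    using namespace llvm;

    // Count how many explicit cases of a switch branch to a given block,
    // walking the cases through SwitchInst::CaseIt instead of by index.
    static unsigned countCasesTargeting(SwitchInst &SI, BasicBlock *Dest) {
      unsigned N = 0;
      for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end(); i != e; ++i)
        if (i.getCaseSuccessor() == Dest)
          ++N;
      return N;
    }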
*>::iterator i = node_names.find(Opcode); - - return ((i != node_names.end()) ? i->second : 0); +const char *SPUTargetLowering::getTargetNodeName(unsigned Opcode) const { + switch (Opcode) { + default: return 0; + case SPUISD::RET_FLAG: return "SPUISD::RET_FLAG"; + case SPUISD::Hi: return "SPUISD::Hi"; + case SPUISD::Lo: return "SPUISD::Lo"; + case SPUISD::PCRelAddr: return "SPUISD::PCRelAddr"; + case SPUISD::AFormAddr: return "SPUISD::AFormAddr"; + case SPUISD::IndirectAddr: return "SPUISD::IndirectAddr"; + case SPUISD::LDRESULT: return "SPUISD::LDRESULT"; + case SPUISD::CALL: return "SPUISD::CALL"; + case SPUISD::SHUFB: return "SPUISD::SHUFB"; + case SPUISD::SHUFFLE_MASK: return "SPUISD::SHUFFLE_MASK"; + case SPUISD::CNTB: return "SPUISD::CNTB"; + case SPUISD::PREFSLOT2VEC: return "SPUISD::PREFSLOT2VEC"; + case SPUISD::VEC2PREFSLOT: return "SPUISD::VEC2PREFSLOT"; + case SPUISD::SHL_BITS: return "SPUISD::SHL_BITS"; + case SPUISD::SHL_BYTES: return "SPUISD::SHL_BYTES"; + case SPUISD::VEC_ROTL: return "SPUISD::VEC_ROTL"; + case SPUISD::VEC_ROTR: return "SPUISD::VEC_ROTR"; + case SPUISD::ROTBYTES_LEFT: return "SPUISD::ROTBYTES_LEFT"; + case SPUISD::ROTBYTES_LEFT_BITS: return "SPUISD::ROTBYTES_LEFT_BITS"; + case SPUISD::SELECT_MASK: return "SPUISD::SELECT_MASK"; + case SPUISD::SELB: return "SPUISD::SELB"; + case SPUISD::ADD64_MARKER: return "SPUISD::ADD64_MARKER"; + case SPUISD::SUB64_MARKER: return "SPUISD::SUB64_MARKER"; + case SPUISD::MUL64_MARKER: return "SPUISD::MUL64_MARKER"; + } } //===----------------------------------------------------------------------===// @@ -1216,7 +1206,7 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain, if (isVarArg) { // FIXME: we should be able to query the argument registers from // tablegen generated code. - static const unsigned ArgRegs[] = { + static const uint16_t ArgRegs[] = { SPU::R3, SPU::R4, SPU::R5, SPU::R6, SPU::R7, SPU::R8, SPU::R9, SPU::R10, SPU::R11, SPU::R12, SPU::R13, SPU::R14, SPU::R15, SPU::R16, SPU::R17, SPU::R18, SPU::R19, SPU::R20, SPU::R21, SPU::R22, SPU::R23, @@ -1230,7 +1220,7 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain, SPU::R73, SPU::R74, SPU::R75, SPU::R76, SPU::R77, SPU::R78, SPU::R79 }; // size of ArgRegs array - unsigned NumArgRegs = 77; + const unsigned NumArgRegs = 77; // We will spill (79-3)+1 registers to the stack SmallVector<SDValue, 79-3+1> MemOps; diff --git a/lib/Target/CellSPU/SPUISelLowering.h b/lib/Target/CellSPU/SPUISelLowering.h index e28e2a4..25c5355 100644 --- a/lib/Target/CellSPU/SPUISelLowering.h +++ b/lib/Target/CellSPU/SPUISelLowering.h @@ -15,9 +15,9 @@ #ifndef SPU_ISELLOWERING_H #define SPU_ISELLOWERING_H +#include "SPU.h" #include "llvm/Target/TargetLowering.h" #include "llvm/CodeGen/SelectionDAG.h" -#include "SPU.h" namespace llvm { namespace SPUISD { diff --git a/lib/Target/CellSPU/SPUInstrInfo.h b/lib/Target/CellSPU/SPUInstrInfo.h index f0d21ad..85e5821 100644 --- a/lib/Target/CellSPU/SPUInstrInfo.h +++ b/lib/Target/CellSPU/SPUInstrInfo.h @@ -15,8 +15,8 @@ #define SPU_INSTRUCTIONINFO_H #include "SPU.h" -#include "llvm/Target/TargetInstrInfo.h" #include "SPURegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" #define GET_INSTRINFO_HEADER #include "SPUGenInstrInfo.inc" diff --git a/lib/Target/CellSPU/SPURegisterInfo.cpp b/lib/Target/CellSPU/SPURegisterInfo.cpp index 92983e1..1b2da5f 100644 --- a/lib/Target/CellSPU/SPURegisterInfo.cpp +++ b/lib/Target/CellSPU/SPURegisterInfo.cpp @@ -12,8 +12,8 @@ //===----------------------------------------------------------------------===// 
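The SPU getTargetNodeName rewrite above drops the lazily populated file-static std::map in favour of a plain switch: no global object, no first-call population, and the compiler is free to emit a jump table. The shape of that change reduced to a toy example; the enum and names here are invented:

    #include <cstdio>

    enum Opcode { RET_FLAG, CALL, SHUFB };

    // Opcode-to-name lookup as a switch: unknown opcodes fall through to 0,
    // matching the behaviour of the old map lookup that missed.
    static const char *nodeName(unsigned Opc) {
      switch (Opc) {
      default:       return 0;
      case RET_FLAG: return "RET_FLAG";
      case CALL:     return "CALL";
      case SHUFB:    return "SHUFB";
      }
    }

    int main() { std::printf("%s\n", nodeName(CALL)); }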
#define DEBUG_TYPE "reginfo" -#include "SPU.h" #include "SPURegisterInfo.h" +#include "SPU.h" #include "SPUInstrBuilder.h" #include "SPUSubtarget.h" #include "SPUMachineFunction.h" diff --git a/lib/Target/CellSPU/SPUTargetMachine.cpp b/lib/Target/CellSPU/SPUTargetMachine.cpp index e43f5ad..21f6b25 100644 --- a/lib/Target/CellSPU/SPUTargetMachine.cpp +++ b/lib/Target/CellSPU/SPUTargetMachine.cpp @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#include "SPU.h" #include "SPUTargetMachine.h" +#include "SPU.h" #include "llvm/PassManager.h" #include "llvm/CodeGen/SchedulerRegistry.h" #include "llvm/Support/DynamicLibrary.h" diff --git a/lib/Target/CellSPU/SPUTargetMachine.h b/lib/Target/CellSPU/SPUTargetMachine.h index c179292..3e5d38c 100644 --- a/lib/Target/CellSPU/SPUTargetMachine.h +++ b/lib/Target/CellSPU/SPUTargetMachine.h @@ -23,9 +23,6 @@ #include "llvm/Target/TargetData.h" namespace llvm { -class PassManager; -class GlobalValue; -class TargetFrameLowering; /// SPUTargetMachine /// diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp index 76b5e9c..107c6cc 100644 --- a/lib/Target/CppBackend/CPPBackend.cpp +++ b/lib/Target/CppBackend/CPPBackend.cpp @@ -1090,10 +1090,10 @@ void CppWriter::printInstruction(const Instruction *I, << getOpName(SI->getDefaultDest()) << ", " << SI->getNumCases() << ", " << bbname << ");"; nl(Out); - unsigned NumCases = SI->getNumCases(); - for (unsigned i = 0; i < NumCases; ++i) { - const ConstantInt* CaseVal = SI->getCaseValue(i); - const BasicBlock *BB = SI->getCaseSuccessor(i); + for (SwitchInst::ConstCaseIt i = SI->case_begin(), e = SI->case_end(); + i != e; ++i) { + const ConstantInt* CaseVal = i.getCaseValue(); + const BasicBlock *BB = i.getCaseSuccessor(); Out << iName << "->addCase(" << getOpName(CaseVal) << ", " << getOpName(BB) << ");"; diff --git a/lib/Target/Hexagon/Hexagon.h b/lib/Target/Hexagon/Hexagon.h index bbefcaf..270c7a7 100644 --- a/lib/Target/Hexagon/Hexagon.h +++ b/lib/Target/Hexagon/Hexagon.h @@ -15,7 +15,6 @@ #ifndef TARGET_Hexagon_H #define TARGET_Hexagon_H -#include <cassert> #include "MCTargetDesc/HexagonMCTargetDesc.h" #include "llvm/Target/TargetLowering.h" diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp index 688b8e3..bf333b7 100644 --- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp +++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -32,11 +32,11 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Support/MathExtras.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/TargetRegistry.h" #include "llvm/Target/Mangler.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLoweringObjectFile.h" @@ -46,8 +46,6 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/raw_ostream.h" using namespace llvm; diff --git a/lib/Target/Hexagon/HexagonCallingConvLower.cpp b/lib/Target/Hexagon/HexagonCallingConvLower.cpp index 71787de..46c20e9 100644 --- a/lib/Target/Hexagon/HexagonCallingConvLower.cpp +++ b/lib/Target/Hexagon/HexagonCallingConvLower.cpp @@ -14,13 +14,13 @@ //===----------------------------------------------------------------------===// #include "HexagonCallingConvLower.h" +#include "Hexagon.h" 
#include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "Hexagon.h" using namespace llvm; Hexagon_CCState::Hexagon_CCState(CallingConv::ID CC, bool isVarArg, diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp index 49c6cdf..e8a6924 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -8,13 +8,13 @@ // //===----------------------------------------------------------------------===// +#include "HexagonFrameLowering.h" #include "Hexagon.h" #include "HexagonInstrInfo.h" #include "HexagonRegisterInfo.h" #include "HexagonSubtarget.h" #include "HexagonTargetMachine.h" #include "HexagonMachineFunctionInfo.h" -#include "HexagonFrameLowering.h" #include "llvm/Function.h" #include "llvm/Type.h" #include "llvm/ADT/BitVector.h" diff --git a/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/lib/Target/Hexagon/HexagonHardwareLoops.cpp index 04ea4ed..57772a5 100644 --- a/lib/Target/Hexagon/HexagonHardwareLoops.cpp +++ b/lib/Target/Hexagon/HexagonHardwareLoops.cpp @@ -27,6 +27,8 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "hwloops" +#include "Hexagon.h" +#include "HexagonTargetMachine.h" #include "llvm/Constants.h" #include "llvm/PassSupport.h" #include "llvm/ADT/DenseMap.h" @@ -43,8 +45,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include <algorithm> -#include "Hexagon.h" -#include "HexagonTargetMachine.h" using namespace llvm; diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index ed4b840..d6da0d0 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -28,17 +28,16 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/CodeGen/MachineJumpTableInfo.h" -#include "HexagonMachineFunctionInfo.h" #include "llvm/Support/CommandLine.h" +using namespace llvm; const unsigned Hexagon_MAX_RET_SIZE = 64; -using namespace llvm; static cl::opt<bool> EmitJumpTables("hexagon-emit-jump-tables", cl::init(true), cl::Hidden, @@ -159,7 +158,7 @@ static bool CC_Hexagon32(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { - static const unsigned RegList[] = { + static const uint16_t RegList[] = { Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4, Hexagon::R5 }; @@ -182,10 +181,10 @@ static bool CC_Hexagon64(unsigned ValNo, MVT ValVT, return false; } - static const unsigned RegList1[] = { + static const uint16_t RegList1[] = { Hexagon::D1, Hexagon::D2 }; - static const unsigned RegList2[] = { + static const uint16_t RegList2[] = { Hexagon::R1, Hexagon::R3 }; if (unsigned Reg = State.AllocateReg(RegList1, RegList2, 2)) { diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h index 5396486..4208bcb 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.h +++ b/lib/Target/Hexagon/HexagonISelLowering.h @@ 
-15,10 +15,10 @@ #ifndef Hexagon_ISELLOWERING_H #define Hexagon_ISELLOWERING_H +#include "Hexagon.h" #include "llvm/Target/TargetLowering.h" #include "llvm/CallingConv.h" #include "llvm/CodeGen/CallingConvLower.h" -#include "Hexagon.h" namespace llvm { namespace HexagonISD { diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp index 07872d4..3d7ace5 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#include "HexagonRegisterInfo.h" #include "HexagonInstrInfo.h" +#include "HexagonRegisterInfo.h" #include "HexagonSubtarget.h" #include "Hexagon.h" #include "llvm/ADT/STLExtras.h" diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h index eb088c3..7306870 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/lib/Target/Hexagon/HexagonInstrInfo.h @@ -14,10 +14,10 @@ #ifndef HexagonINSTRUCTIONINFO_H #define HexagonINSTRUCTIONINFO_H +#include "HexagonRegisterInfo.h" #include "MCTargetDesc/HexagonBaseInfo.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetFrameLowering.h" -#include "HexagonRegisterInfo.h" #define GET_INSTRINFO_HEADER diff --git a/lib/Target/Hexagon/HexagonPeephole.cpp b/lib/Target/Hexagon/HexagonPeephole.cpp index 06c732f..55cbc09 100644 --- a/lib/Target/Hexagon/HexagonPeephole.cpp +++ b/lib/Target/Hexagon/HexagonPeephole.cpp @@ -36,6 +36,8 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "hexagon-peephole" +#include "Hexagon.h" +#include "HexagonTargetMachine.h" #include "llvm/Constants.h" #include "llvm/PassSupport.h" #include "llvm/ADT/DenseMap.h" @@ -45,16 +47,13 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetInstrInfo.h" #include <algorithm> -#include "Hexagon.h" -#include "HexagonTargetMachine.h" - -#include "llvm/Support/CommandLine.h" using namespace llvm; diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp index c481270..2a9de92 100644 --- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp +++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#include "Hexagon.h" #include "HexagonRegisterInfo.h" +#include "Hexagon.h" #include "HexagonSubtarget.h" #include "HexagonTargetMachine.h" #include "HexagonMachineFunctionInfo.h" diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.h b/lib/Target/Hexagon/HexagonRegisterInfo.h index fc65305..6cf727b 100644 --- a/lib/Target/Hexagon/HexagonRegisterInfo.h +++ b/lib/Target/Hexagon/HexagonRegisterInfo.h @@ -16,9 +16,10 @@ #define HexagonREGISTERINFO_H #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/MC/MachineLocation.h" + #define GET_REGINFO_HEADER #include "HexagonGenRegisterInfo.inc" -#include "llvm/MC/MachineLocation.h" // // We try not to hard code the reserved registers in our code, diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp index 319eab2..b9e6894 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ 
b/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -50,7 +50,7 @@ extern "C" void LLVMInitializeHexagonTarget() { /// HexagonTargetMachine::HexagonTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - TargetOptions Options, + const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) diff --git a/lib/Target/Hexagon/HexagonTargetMachine.h b/lib/Target/Hexagon/HexagonTargetMachine.h index 70bea56..0336965 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.h +++ b/lib/Target/Hexagon/HexagonTargetMachine.h @@ -14,13 +14,13 @@ #ifndef HexagonTARGETMACHINE_H #define HexagonTARGETMACHINE_H -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetData.h" #include "HexagonInstrInfo.h" #include "HexagonSubtarget.h" #include "HexagonISelLowering.h" #include "HexagonSelectionDAGInfo.h" #include "HexagonFrameLowering.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetData.h" namespace llvm { @@ -37,8 +37,9 @@ class HexagonTargetMachine : public LLVMTargetMachine { public: HexagonTargetMachine(const Target &T, StringRef TT,StringRef CPU, - StringRef FS, TargetOptions Options, Reloc::Model RM, - CodeModel::Model CM, CodeGenOpt::Level OL); + StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); virtual const HexagonInstrInfo *getInstrInfo() const { return &InstrInfo; diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp index d3ce5a6..32cc709 100644 --- a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp +++ b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp @@ -11,6 +11,9 @@ // //===----------------------------------------------------------------------===// +#include "HexagonTargetObjectFile.h" +#include "HexagonSubtarget.h" +#include "HexagonTargetMachine.h" #include "llvm/Function.h" #include "llvm/GlobalVariable.h" #include "llvm/Target/TargetData.h" @@ -18,9 +21,6 @@ #include "llvm/MC/MCContext.h" #include "llvm/Support/ELF.h" #include "llvm/Support/CommandLine.h" -#include "HexagonSubtarget.h" -#include "HexagonTargetObjectFile.h" -#include "HexagonTargetMachine.h" using namespace llvm; diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp index 74abc56..3cfa4fd 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp @@ -1,4 +1,4 @@ -//===-- HexagonMCTargetDesc.cpp - Cell Hexagon Target Descriptions --------===// +//===-- HexagonMCTargetDesc.cpp - Hexagon Target Descriptions -------------===// // // The LLVM Compiler Infrastructure // @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This file provides Cell Hexagon specific target descriptions. +// This file provides Hexagon specific target descriptions. 
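HexagonTargetMachine now takes TargetOptions by const reference instead of by value, so constructing a target machine no longer copies the whole options struct at the call boundary. The same change in miniature; the types here are stand-ins, not LLVM's:

    #include <string>
    #include <cstdio>

    struct Options { bool EmitJumpTables; std::string ABIName; };

    class Machine {
      Options Opts;                                    // one deliberate copy, kept as a member
    public:
      explicit Machine(const Options &O) : Opts(O) {}  // callers no longer pay for an extra copy
      bool jumpTables() const { return Opts.EmitJumpTables; }
    };

    int main() {
      Options O = { true, "o32" };
      Machine M(O);
      std::printf("%d\n", M.jumpTables());
    }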
// //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h index 364841f..b18d23a 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h @@ -1,4 +1,4 @@ -//===-- SPUMCTargetDesc.h - Hexagon Target Descriptions ---------*- C++ -*-===// +//===-- HexagonMCTargetDesc.h - Hexagon Target Descriptions -----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef SPUMCTARGETDESC_H -#define SPUMCTARGETDESC_H +#ifndef HEXAGONMCTARGETDESC_H +#define HEXAGONMCTARGETDESC_H namespace llvm { class MCSubtargetInfo; diff --git a/lib/Target/Hexagon/MCTargetDesc/LLVMBuild.txt b/lib/Target/Hexagon/MCTargetDesc/LLVMBuild.txt index 1114d99..73c7e01 100644 --- a/lib/Target/Hexagon/MCTargetDesc/LLVMBuild.txt +++ b/lib/Target/Hexagon/MCTargetDesc/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Target/CellSPU/MCTargetDesc/LLVMBuild.txt ----------*- Conf -*--===; +;===- ./lib/Target/Hexagon/MCTargetDesc/LLVMBuild.txt ----------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; diff --git a/lib/Target/Hexagon/MCTargetDesc/Makefile b/lib/Target/Hexagon/MCTargetDesc/Makefile index 67be2bc..885be2d 100644 --- a/lib/Target/Hexagon/MCTargetDesc/Makefile +++ b/lib/Target/Hexagon/MCTargetDesc/Makefile @@ -1,4 +1,4 @@ -##===- lib/Target/CellSPU/TargetDesc/Makefile --------------*- Makefile -*-===## +##===- lib/Target/Hexagon/TargetDesc/Makefile --------------*- Makefile -*-===## # # The LLVM Compiler Infrastructure # diff --git a/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp b/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp index adedf93..6b958c8 100644 --- a/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp +++ b/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp @@ -34,9 +34,9 @@ extern const MCInstrDesc MBlazeInsts[]; using namespace llvm; -const unsigned UNSUPPORTED = -1; +const uint16_t UNSUPPORTED = -1; -static const unsigned mblazeBinary2Opcode[] = { +static const uint16_t mblazeBinary2Opcode[] = { MBlaze::ADD, MBlaze::RSUB, MBlaze::ADDC, MBlaze::RSUBC, //00,01,02,03 MBlaze::ADDK, MBlaze::RSUBK, MBlaze::ADDKC, MBlaze::RSUBKC, //04,05,06,07 MBlaze::ADDI, MBlaze::RSUBI, MBlaze::ADDIC, MBlaze::RSUBIC, //08,09,0A,0B diff --git a/lib/Target/MBlaze/MBlazeFrameLowering.h b/lib/Target/MBlaze/MBlazeFrameLowering.h index 8be15bf..01e6578 100644 --- a/lib/Target/MBlaze/MBlazeFrameLowering.h +++ b/lib/Target/MBlaze/MBlazeFrameLowering.h @@ -15,11 +15,10 @@ #define MBLAZE_FRAMEINFO_H #include "MBlaze.h" -#include "MBlazeSubtarget.h" #include "llvm/Target/TargetFrameLowering.h" namespace llvm { - class MBlazeSubtarget; +class MBlazeSubtarget; class MBlazeFrameLowering : public TargetFrameLowering { protected: diff --git a/lib/Target/MBlaze/MBlazeISelLowering.cpp b/lib/Target/MBlaze/MBlazeISelLowering.cpp index 23c8e13..9ef6bb6 100644 --- a/lib/Target/MBlaze/MBlazeISelLowering.cpp +++ b/lib/Target/MBlaze/MBlazeISelLowering.cpp @@ -657,7 +657,7 @@ static bool CC_MBlaze_AssignReg(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State) { - static const unsigned ArgRegs[] = { + static const uint16_t ArgRegs[] = { MBlaze::R5, MBlaze::R6, MBlaze::R7, MBlaze::R8, MBlaze::R9, MBlaze::R10 }; diff --git 
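The MBlaze disassembler table above also becomes uint16_t, and its UNSUPPORTED marker -1 simply wraps to 0xFFFF, a value no generated opcode number reaches. A toy version of the lookup; the opcode numbers are invented:

    #include <cstdint>
    #include <cstdio>

    static const uint16_t UNSUPPORTED = -1;   // wraps to 0xFFFF
    static const uint16_t Binary2Opcode[4] = { 7, 12, UNSUPPORTED, 42 };

    int main() {
      unsigned field = 2;                     // bit-field decoded from the instruction word
      uint16_t opc = Binary2Opcode[field];
      if (opc == UNSUPPORTED)
        std::printf("unsupported encoding\n");
      else
        std::printf("opcode %u\n", (unsigned)opc);
    }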
a/lib/Target/MBlaze/MBlazeISelLowering.h b/lib/Target/MBlaze/MBlazeISelLowering.h index 168694b..6a79fc1 100644 --- a/lib/Target/MBlaze/MBlazeISelLowering.h +++ b/lib/Target/MBlaze/MBlazeISelLowering.h @@ -15,11 +15,11 @@ #ifndef MBlazeISELLOWERING_H #define MBlazeISELLOWERING_H +#include "MBlaze.h" +#include "MBlazeSubtarget.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/Target/TargetLowering.h" -#include "MBlaze.h" -#include "MBlazeSubtarget.h" namespace llvm { namespace MBlazeCC { diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.h b/lib/Target/MBlaze/MBlazeInstrInfo.h index a309d2b..5252147 100644 --- a/lib/Target/MBlaze/MBlazeInstrInfo.h +++ b/lib/Target/MBlaze/MBlazeInstrInfo.h @@ -15,9 +15,9 @@ #define MBLAZEINSTRUCTIONINFO_H #include "MBlaze.h" +#include "MBlazeRegisterInfo.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetInstrInfo.h" -#include "MBlazeRegisterInfo.h" #define GET_INSTRINFO_HEADER #include "MBlazeGenInstrInfo.inc" diff --git a/lib/Target/MBlaze/MBlazeMCInstLower.h b/lib/Target/MBlaze/MBlazeMCInstLower.h index bb77ed4..7b97744 100644 --- a/lib/Target/MBlaze/MBlazeMCInstLower.h +++ b/lib/Target/MBlaze/MBlazeMCInstLower.h @@ -14,7 +14,6 @@ namespace llvm { class AsmPrinter; - class MCAsmInfo; class MCContext; class MCInst; class MCOperand; diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp index 6801a1a..46f5207 100644 --- a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp +++ b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp @@ -14,9 +14,9 @@ #define DEBUG_TYPE "mblaze-frame-info" +#include "MBlazeRegisterInfo.h" #include "MBlaze.h" #include "MBlazeSubtarget.h" -#include "MBlazeRegisterInfo.h" #include "MBlazeMachineFunction.h" #include "llvm/Constants.h" #include "llvm/Type.h" diff --git a/lib/Target/MBlaze/MBlazeTargetMachine.cpp b/lib/Target/MBlaze/MBlazeTargetMachine.cpp index 5c07424..dd7de9b 100644 --- a/lib/Target/MBlaze/MBlazeTargetMachine.cpp +++ b/lib/Target/MBlaze/MBlazeTargetMachine.cpp @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#include "MBlaze.h" #include "MBlazeTargetMachine.h" +#include "MBlaze.h" #include "llvm/PassManager.h" #include "llvm/CodeGen/Passes.h" #include "llvm/Support/FormattedStream.h" diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp index fd5de34..c03ba47 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.cpp +++ b/lib/Target/MSP430/MSP430InstrInfo.cpp @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#include "MSP430.h" #include "MSP430InstrInfo.h" +#include "MSP430.h" #include "MSP430MachineFunctionInfo.h" #include "MSP430TargetMachine.h" #include "llvm/Function.h" diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h index fe2a75c..04f339b 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.h +++ b/lib/Target/MSP430/MSP430InstrInfo.h @@ -14,8 +14,8 @@ #ifndef LLVM_TARGET_MSP430INSTRINFO_H #define LLVM_TARGET_MSP430INSTRINFO_H -#include "llvm/Target/TargetInstrInfo.h" #include "MSP430RegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" #define GET_INSTRINFO_HEADER #include "MSP430GenInstrInfo.inc" diff --git a/lib/Target/MSP430/MSP430MCInstLower.h b/lib/Target/MSP430/MSP430MCInstLower.h index 297efd2..24151e2 100644 --- a/lib/Target/MSP430/MSP430MCInstLower.h +++ b/lib/Target/MSP430/MSP430MCInstLower.h @@ -14,7 +14,6 @@ namespace llvm { class AsmPrinter; - 
class MCAsmInfo; class MCContext; class MCInst; class MCOperand; diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp index f9ddfb3..51ec71a 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.cpp +++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp @@ -13,9 +13,9 @@ #define DEBUG_TYPE "msp430-reg-info" +#include "MSP430RegisterInfo.h" #include "MSP430.h" #include "MSP430MachineFunctionInfo.h" -#include "MSP430RegisterInfo.h" #include "MSP430TargetMachine.h" #include "llvm/Function.h" #include "llvm/CodeGen/MachineFrameInfo.h" diff --git a/lib/Target/MSP430/MSP430Subtarget.h b/lib/Target/MSP430/MSP430Subtarget.h index e7bebbd..4d8792e 100644 --- a/lib/Target/MSP430/MSP430Subtarget.h +++ b/lib/Target/MSP430/MSP430Subtarget.h @@ -15,12 +15,11 @@ #define LLVM_TARGET_MSP430_SUBTARGET_H #include "llvm/Target/TargetSubtargetInfo.h" +#include <string> #define GET_SUBTARGETINFO_HEADER #include "MSP430GenSubtargetInfo.inc" -#include <string> - namespace llvm { class StringRef; diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp index af62e48..9f2eda1 100644 --- a/lib/Target/MSP430/MSP430TargetMachine.cpp +++ b/lib/Target/MSP430/MSP430TargetMachine.cpp @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#include "MSP430.h" #include "MSP430TargetMachine.h" +#include "MSP430.h" #include "llvm/PassManager.h" #include "llvm/CodeGen/Passes.h" #include "llvm/MC/MCAsmInfo.h" diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp index d69570b..9d5a2f1 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp @@ -92,25 +92,42 @@ public: if (!Value) return; // Doesn't change encoding. + // Where do we start in the object unsigned Offset = Fixup.getOffset(); - // FIXME: The below code will not work across endian models - // How many bytes/bits are we fixing up? - unsigned NumBytes = ((getFixupKindInfo(Kind).TargetSize-1)/8)+1; - uint64_t Mask = ((uint64_t)1 << getFixupKindInfo(Kind).TargetSize) - 1; + // Number of bytes we need to fixup + unsigned NumBytes = (getFixupKindInfo(Kind).TargetSize + 7) / 8; + // Used to point to big endian bytes + unsigned FullSize; + + switch ((unsigned)Kind) { + case Mips::fixup_Mips_16: + FullSize = 2; + break; + case Mips::fixup_Mips_64: + FullSize = 8; + break; + default: + FullSize = 4; + break; + } // Grab current value, if any, from bits. uint64_t CurVal = 0; - for (unsigned i = 0; i != NumBytes; ++i) - CurVal |= ((uint8_t)Data[Offset + i]) << (i * 8); + for (unsigned i = 0; i != NumBytes; ++i) { + unsigned Idx = IsLittle ? i : (FullSize - 1 - i); + CurVal |= (uint64_t)((uint8_t)Data[Offset + Idx]) << (i*8); + } + + uint64_t Mask = ((uint64_t)(-1) >> (64 - getFixupKindInfo(Kind).TargetSize)); CurVal = (CurVal & ~Mask) | ((CurVal + Value) & Mask); - // Write out the bytes back to the code/data bits. - // First the unaffected bits and then the fixup. + // Write out the fixed up bytes back to the code/data bits. for (unsigned i = 0; i != NumBytes; ++i) { - Data[Offset + i] = uint8_t((CurVal >> (i * 8)) & 0xff); + unsigned Idx = IsLittle ? 
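The rewritten Mips applyFixup is the substantive change in this stretch: instead of assuming little-endian byte order, it reads and writes the affected bytes through an index that counts from the far end of the instruction word on big-endian targets, and derives the mask from the fixup's bit width. A standalone model of that loop, not the MCAsmBackend interface itself:

    #include <cstdint>
    #include <cstdio>

    static void applyFixup(uint8_t *Data, unsigned Offset, uint64_t Value,
                           unsigned TargetSizeBits, unsigned FullSize, bool IsLittle) {
      unsigned NumBytes = (TargetSizeBits + 7) / 8;
      // Read the current bytes, least-significant first; on big-endian targets
      // that means starting from the last byte of the instruction word.
      uint64_t CurVal = 0;
      for (unsigned i = 0; i != NumBytes; ++i) {
        unsigned Idx = IsLittle ? i : (FullSize - 1 - i);
        CurVal |= (uint64_t)Data[Offset + Idx] << (i * 8);
      }
      // Add the fixup value under a mask sized to the fixup, then write back.
      uint64_t Mask = ~(uint64_t)0 >> (64 - TargetSizeBits);
      CurVal = (CurVal & ~Mask) | ((CurVal + Value) & Mask);
      for (unsigned i = 0; i != NumBytes; ++i) {
        unsigned Idx = IsLittle ? i : (FullSize - 1 - i);
        Data[Offset + Idx] = (uint8_t)((CurVal >> (i * 8)) & 0xff);
      }
    }

    int main() {
      uint8_t Word[4] = { 0x3c, 0x04, 0x00, 0x00 };      // big-endian "lui $a0, 0"
      applyFixup(Word, 0, 0x1234, 16, 4, /*IsLittle=*/false);
      std::printf("%02x %02x %02x %02x\n", Word[0], Word[1], Word[2], Word[3]);
    }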
i : (FullSize - 1 - i); + Data[Offset + Idx] = (uint8_t)((CurVal >> (i*8)) & 0xff); } -} + } unsigned getNumFixupKinds() const { return Mips::NumTargetFixupKinds; } diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp index b039678..9ebb6d2 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp @@ -49,9 +49,9 @@ public: void EmitInstruction(uint64_t Val, unsigned Size, raw_ostream &OS) const { // Output the instruction encoding in little endian byte order. - for (unsigned i = 0; i != Size; ++i) { - EmitByte(Val & 255, OS); - Val >>= 8; + for (unsigned i = 0; i < Size; ++i) { + unsigned Shift = IsLittleEndian ? i * 8 : (Size - 1 - i) * 8; + EmitByte((Val >> Shift) & 0xff, OS); } } diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h index bacecf2..bafadc8 100644 --- a/lib/Target/Mips/Mips.h +++ b/lib/Target/Mips/Mips.h @@ -21,8 +21,6 @@ namespace llvm { class MipsTargetMachine; class FunctionPass; - class MachineCodeEmitter; - class formatted_raw_ostream; FunctionPass *createMipsISelDag(MipsTargetMachine &TM); FunctionPass *createMipsDelaySlotFillerPass(MipsTargetMachine &TM); diff --git a/lib/Target/Mips/MipsAnalyzeImmediate.cpp b/lib/Target/Mips/MipsAnalyzeImmediate.cpp index 31b669a..dc8fbd0 100644 --- a/lib/Target/Mips/MipsAnalyzeImmediate.cpp +++ b/lib/Target/Mips/MipsAnalyzeImmediate.cpp @@ -26,28 +26,28 @@ void MipsAnalyzeImmediate::AddInstr(InstSeqLs &SeqLs, const Inst &I) { Iter->push_back(I); } -void MipsAnalyzeImmediate::GetInstSeqLsADDiu(int64_t Imm, unsigned RemSize, +void MipsAnalyzeImmediate::GetInstSeqLsADDiu(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs) { - GetInstSeqLs((Imm + 0x8000) & ~0xffff, RemSize, SeqLs); - AddInstr(SeqLs, Inst(ADDiu, Imm & 0xffff)); + GetInstSeqLs((Imm + 0x8000ULL) & 0xffffffffffff0000ULL, RemSize, SeqLs); + AddInstr(SeqLs, Inst(ADDiu, Imm & 0xffffULL)); } -void MipsAnalyzeImmediate::GetInstSeqLsORi(int64_t Imm, unsigned RemSize, +void MipsAnalyzeImmediate::GetInstSeqLsORi(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs) { - GetInstSeqLs(Imm & ~0xffff, RemSize, SeqLs); - AddInstr(SeqLs, Inst(ORi, Imm & 0xffff)); + GetInstSeqLs(Imm & 0xffffffffffff0000ULL, RemSize, SeqLs); + AddInstr(SeqLs, Inst(ORi, Imm & 0xffffULL)); } -void MipsAnalyzeImmediate::GetInstSeqLsSLL(int64_t Imm, unsigned RemSize, +void MipsAnalyzeImmediate::GetInstSeqLsSLL(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs) { unsigned Shamt = CountTrailingZeros_64(Imm); GetInstSeqLs(Imm >> Shamt, RemSize - Shamt, SeqLs); AddInstr(SeqLs, Inst(SLL, Shamt)); } -void MipsAnalyzeImmediate::GetInstSeqLs(int64_t Imm, unsigned RemSize, +void MipsAnalyzeImmediate::GetInstSeqLs(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs) { - int64_t MaskedImm = Imm & (((uint64_t)-1) >> (64 - Size)); + uint64_t MaskedImm = Imm & (0xffffffffffffffffULL >> (64 - Size)); // Do nothing if Imm is 0. 
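The matching code-emitter change: EmitInstruction now picks the shift per byte from the emission position, so the same loop writes bytes least-significant-first on little-endian Mips and most-significant-first on big-endian Mips. A self-contained model:

    #include <cstdint>
    #include <cstdio>

    static void emitInstruction(uint64_t Val, unsigned Size, bool IsLittleEndian) {
      for (unsigned i = 0; i < Size; ++i) {
        unsigned Shift = IsLittleEndian ? i * 8 : (Size - 1 - i) * 8;
        std::printf("%02x ", (unsigned)((Val >> Shift) & 0xff));
      }
      std::printf("\n");
    }

    int main() {
      emitInstruction(0x24040001, 4, false);  // big endian:    24 04 00 01
      emitInstruction(0x24040001, 4, true);   // little endian: 01 00 04 24
    }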
if (!MaskedImm) @@ -122,7 +122,7 @@ void MipsAnalyzeImmediate::GetShortestSeq(InstSeqLs &SeqLs, InstSeq &Insts) { } const MipsAnalyzeImmediate::InstSeq -&MipsAnalyzeImmediate::Analyze(int64_t Imm, unsigned Size, +&MipsAnalyzeImmediate::Analyze(uint64_t Imm, unsigned Size, bool LastInstrIsADDiu) { this->Size = Size; diff --git a/lib/Target/Mips/MipsAnalyzeImmediate.h b/lib/Target/Mips/MipsAnalyzeImmediate.h index 24e6e5f..a094dda 100644 --- a/lib/Target/Mips/MipsAnalyzeImmediate.h +++ b/lib/Target/Mips/MipsAnalyzeImmediate.h @@ -25,7 +25,7 @@ namespace llvm { /// Analyze - Get an instrucion sequence to load immediate Imm. The last /// instruction in the sequence must be an ADDiu if LastInstrIsADDiu is /// true; - const InstSeq &Analyze(int64_t Imm, unsigned Size, bool LastInstrIsADDiu); + const InstSeq &Analyze(uint64_t Imm, unsigned Size, bool LastInstrIsADDiu); private: typedef SmallVector<InstSeq, 5> InstSeqLs; @@ -34,18 +34,18 @@ namespace llvm { /// GetInstSeqLsADDiu - Get instrucion sequences which end with an ADDiu to /// load immediate Imm - void GetInstSeqLsADDiu(int64_t Imm, unsigned RemSize, InstSeqLs &SeqLs); + void GetInstSeqLsADDiu(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs); /// GetInstSeqLsORi - Get instrucion sequences which end with an ORi to /// load immediate Imm - void GetInstSeqLsORi(int64_t Imm, unsigned RemSize, InstSeqLs &SeqLs); + void GetInstSeqLsORi(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs); /// GetInstSeqLsSLL - Get instrucion sequences which end with a SLL to /// load immediate Imm - void GetInstSeqLsSLL(int64_t Imm, unsigned RemSize, InstSeqLs &SeqLs); + void GetInstSeqLsSLL(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs); /// GetInstSeqLs - Get instrucion sequences to load immediate Imm. - void GetInstSeqLs(int64_t Imm, unsigned RemSize, InstSeqLs &SeqLs); + void GetInstSeqLs(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs); /// ReplaceADDiuSLLWithLUi - Replace an ADDiu & SLL pair with a LUi. 
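MipsAnalyzeImmediate now works on uint64_t with explicit 64-bit masks, so the 16-bit splits are plain modular arithmetic and no step relies on sign extension of an int literal or on signed-overflow behaviour. The ADDiu split in isolation; the +0x8000 rounding is there because ADDiu sign-extends its 16-bit immediate:

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint64_t Imm = 0x12348765ULL;
      uint64_t Hi  = (Imm + 0x8000ULL) & 0xffffffffffff0000ULL;  // loaded first (recursively)
      uint64_t Lo  = Imm & 0xffffULL;                            // folded into the final ADDiu
      // Hi plus the sign-extended Lo reconstructs Imm: 0x12350000 - 0x789b == 0x12348765.
      std::printf("hi=%016llx lo=%04llx\n",
                  (unsigned long long)Hi, (unsigned long long)Lo);
    }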
void ReplaceADDiuSLLWithLUi(InstSeq &Seq); diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp index aeabc0f..f2b842a 100644 --- a/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/lib/Target/Mips/MipsAsmPrinter.cpp @@ -13,8 +13,8 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "mips-asm-printer" -#include "Mips.h" #include "MipsAsmPrinter.h" +#include "Mips.h" #include "MipsInstrInfo.h" #include "MipsMachineFunction.h" #include "MipsMCInstLower.h" @@ -34,8 +34,6 @@ #include "llvm/Instructions.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/TargetRegistry.h" diff --git a/lib/Target/Mips/MipsAsmPrinter.h b/lib/Target/Mips/MipsAsmPrinter.h index 8502db2..473da7e 100644 --- a/lib/Target/Mips/MipsAsmPrinter.h +++ b/lib/Target/Mips/MipsAsmPrinter.h @@ -22,9 +22,9 @@ namespace llvm { class MCStreamer; class MachineInstr; -class raw_ostream; class MachineBasicBlock; class Module; +class raw_ostream; class LLVM_LIBRARY_VISIBILITY MipsAsmPrinter : public AsmPrinter { diff --git a/lib/Target/Mips/MipsFrameLowering.cpp b/lib/Target/Mips/MipsFrameLowering.cpp index e83c64e..ebfbb4a 100644 --- a/lib/Target/Mips/MipsFrameLowering.cpp +++ b/lib/Target/Mips/MipsFrameLowering.cpp @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#include "MipsAnalyzeImmediate.h" #include "MipsFrameLowering.h" +#include "MipsAnalyzeImmediate.h" #include "MipsInstrInfo.h" #include "MipsMachineFunction.h" #include "MCTargetDesc/MipsBaseInfo.h" diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp index 782d203..536879e 100644 --- a/lib/Target/Mips/MipsISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp @@ -99,6 +99,8 @@ private: return CurDAG->getTargetConstant(Imm, Node->getValueType(0)); } + void ProcessFunctionAfterISel(MachineFunction &MF); + bool ReplaceUsesWithZeroReg(MachineRegisterInfo *MRI, const MachineInstr&); void InitGlobalBaseReg(MachineFunction &MF); virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op, @@ -181,10 +183,57 @@ void MipsDAGToDAGISel::InitGlobalBaseReg(MachineFunction &MF) { } } +bool MipsDAGToDAGISel::ReplaceUsesWithZeroReg(MachineRegisterInfo *MRI, + const MachineInstr& MI) { + unsigned DstReg = 0, ZeroReg = 0; + + // Check if MI is "addiu $dst, $zero, 0" or "daddiu $dst, $zero, 0". + if ((MI.getOpcode() == Mips::ADDiu) && + (MI.getOperand(1).getReg() == Mips::ZERO) && + (MI.getOperand(2).getImm() == 0)) { + DstReg = MI.getOperand(0).getReg(); + ZeroReg = Mips::ZERO; + } else if ((MI.getOpcode() == Mips::DADDiu) && + (MI.getOperand(1).getReg() == Mips::ZERO_64) && + (MI.getOperand(2).getImm() == 0)) { + DstReg = MI.getOperand(0).getReg(); + ZeroReg = Mips::ZERO_64; + } + + if (!DstReg) + return false; + + // Replace uses with ZeroReg. + for (MachineRegisterInfo::use_iterator U = MRI->use_begin(DstReg), + E = MRI->use_end(); U != E; ++U) { + MachineOperand &MO = U.getOperand(); + MachineInstr *MI = MO.getParent(); + + // Do not replace if it is a phi's operand or is tied to def operand. 
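The new ReplaceUsesWithZeroReg above retargets every use of the destination of an "addiu $dst, $zero, 0" (or daddiu) to the ZERO register, skipping PHI operands and operands tied to a def. The use-rewriting idiom it relies on, distilled into a helper of my own naming:

    #include "llvm/CodeGen/MachineInstr.h"
    #include "llvm/CodeGen/MachineRegisterInfo.h"
    using namespace llvm;

    // Retarget every eligible use of FromReg to ToReg via the register's use list.
    static void rewriteUses(MachineRegisterInfo *MRI, unsigned FromReg, unsigned ToReg) {
      for (MachineRegisterInfo::use_iterator U = MRI->use_begin(FromReg),
           E = MRI->use_end(); U != E; ++U) {
        MachineOperand &MO = U.getOperand();
        MachineInstr *UserMI = MO.getParent();
        // PHI operands and tied operands must keep the original register.
        if (UserMI->isPHI() || UserMI->isRegTiedToDefOperand(U.getOperandNo()))
          continue;
        MO.setReg(ToReg);
      }
    }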
+ if (MI->isPHI() || MI->isRegTiedToDefOperand(U.getOperandNo())) + continue; + + MO.setReg(ZeroReg); + } + + return true; +} + +void MipsDAGToDAGISel::ProcessFunctionAfterISel(MachineFunction &MF) { + InitGlobalBaseReg(MF); + + MachineRegisterInfo *MRI = &MF.getRegInfo(); + + for (MachineFunction::iterator MFI = MF.begin(), MFE = MF.end(); MFI != MFE; + ++MFI) + for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) + ReplaceUsesWithZeroReg(MRI, *I); +} + bool MipsDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { bool Ret = SelectionDAGISel::runOnMachineFunction(MF); - InitGlobalBaseReg(MF); + ProcessFunctionAfterISel(MF); return Ret; } diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index dc894d9..ecde5b6 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -18,13 +18,13 @@ #include "MipsTargetMachine.h" #include "MipsTargetObjectFile.h" #include "MipsSubtarget.h" +#include "InstPrinter/MipsInstPrinter.h" +#include "MCTargetDesc/MipsBaseInfo.h" #include "llvm/DerivedTypes.h" #include "llvm/Function.h" #include "llvm/GlobalVariable.h" #include "llvm/Intrinsics.h" #include "llvm/CallingConv.h" -#include "InstPrinter/MipsInstPrinter.h" -#include "MCTargetDesc/MipsBaseInfo.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -130,22 +130,32 @@ MipsTargetLowering(MipsTargetMachine &TM) // Mips Custom Operations setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); - setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); setOperationAction(ISD::BlockAddress, MVT::i32, Custom); - setOperationAction(ISD::BlockAddress, MVT::i64, Custom); setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); - setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); setOperationAction(ISD::JumpTable, MVT::i32, Custom); - setOperationAction(ISD::JumpTable, MVT::i64, Custom); setOperationAction(ISD::ConstantPool, MVT::i32, Custom); - setOperationAction(ISD::ConstantPool, MVT::i64, Custom); setOperationAction(ISD::SELECT, MVT::f32, Custom); setOperationAction(ISD::SELECT, MVT::f64, Custom); setOperationAction(ISD::SELECT, MVT::i32, Custom); + setOperationAction(ISD::SETCC, MVT::f32, Custom); + setOperationAction(ISD::SETCC, MVT::f64, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Custom); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); - setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); setOperationAction(ISD::VASTART, MVT::Other, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); + setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom); + setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); + + if (HasMips64) { + setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); + setOperationAction(ISD::BlockAddress, MVT::i64, Custom); + setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); + setOperationAction(ISD::JumpTable, MVT::i64, Custom); + setOperationAction(ISD::ConstantPool, MVT::i64, Custom); + setOperationAction(ISD::SELECT, MVT::i64, Custom); + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); + } setOperationAction(ISD::SDIV, MVT::i32, Expand); setOperationAction(ISD::SREM, MVT::i32, Expand); @@ -185,8 +195,6 @@ MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand); setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand); 
setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); - setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FSIN, MVT::f32, Expand); setOperationAction(ISD::FSIN, MVT::f64, Expand); setOperationAction(ISD::FCOS, MVT::f32, Expand); @@ -214,9 +222,6 @@ MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom); - setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); - setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand); setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand); @@ -246,11 +251,11 @@ MipsTargetLowering(MipsTargetMachine &TM) setTargetDAGCombine(ISD::SUBE); setTargetDAGCombine(ISD::SDIVREM); setTargetDAGCombine(ISD::UDIVREM); - setTargetDAGCombine(ISD::SETCC); + setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::OR); - setMinFunctionAlignment(2); + setMinFunctionAlignment(HasMips64 ? 3 : 2); setStackPointerRegisterToSaveRestore(IsN64 ? Mips::SP_64 : Mips::SP); computeRegisterProperties(); @@ -559,21 +564,37 @@ static SDValue CreateCMovFP(SelectionDAG& DAG, SDValue Cond, SDValue True, True.getValueType(), True, False, Cond); } -static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG& DAG, - TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget* Subtarget) { +static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG& DAG, + TargetLowering::DAGCombinerInfo &DCI, + const MipsSubtarget* Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); - SDValue Cond = CreateFPCmp(DAG, SDValue(N, 0)); + SDValue SetCC = N->getOperand(0); - if (Cond.getOpcode() != MipsISD::FPCmp) + if ((SetCC.getOpcode() != ISD::SETCC) || + !SetCC.getOperand(0).getValueType().isInteger()) return SDValue(); - SDValue True = DAG.getConstant(1, MVT::i32); - SDValue False = DAG.getConstant(0, MVT::i32); + SDValue False = N->getOperand(2); + EVT FalseTy = False.getValueType(); - return CreateCMovFP(DAG, Cond, True, False, N->getDebugLoc()); + if (!FalseTy.isInteger()) + return SDValue(); + + ConstantSDNode *CN = dyn_cast<ConstantSDNode>(False); + + if (!CN || CN->getZExtValue()) + return SDValue(); + + const DebugLoc DL = N->getDebugLoc(); + ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get(); + SDValue True = N->getOperand(1); + + SetCC = DAG.getSetCC(DL, SetCC.getValueType(), SetCC.getOperand(0), + SetCC.getOperand(1), ISD::getSetCCInverse(CC, true)); + + return DAG.getNode(ISD::SELECT, DL, FalseTy, SetCC, False, True); } static SDValue PerformANDCombine(SDNode *N, SelectionDAG& DAG, @@ -684,8 +705,8 @@ SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) case ISD::SDIVREM: case ISD::UDIVREM: return PerformDivRemCombine(N, DAG, DCI, Subtarget); - case ISD::SETCC: - return PerformSETCCCombine(N, DAG, DCI, Subtarget); + case ISD::SELECT: + return PerformSELECTCombine(N, DAG, DCI, Subtarget); case ISD::AND: return PerformANDCombine(N, DAG, DCI, Subtarget); case ISD::OR: @@ -708,6 +729,7 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); + case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::FCOPYSIGN: return 
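The DAG combine moves from SETCC to SELECT nodes: when a select's condition is an integer setcc and its false operand is the constant 0, the condition is inverted and the arms swapped, which puts the zero where the Mips conditional-move patterns want it. The underlying identity, select(c, t, 0) == select(!c, 0, t), checked in scalar form:

    #include <cstdio>

    static int selectForm1(bool c, int t) { return c  ? t : 0; }   // before the combine
    static int selectForm2(bool c, int t) { return !c ? 0 : t; }   // after the combine

    int main() {
      for (int c = 0; c <= 1; ++c)
        std::printf("%d %d\n", selectForm1(c != 0, 7), selectForm2(c != 0, 7));
    }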
LowerFCOPYSIGN(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); @@ -1475,6 +1497,18 @@ LowerSELECT(SDValue Op, SelectionDAG &DAG) const Op.getDebugLoc()); } +SDValue MipsTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { + SDValue Cond = CreateFPCmp(DAG, Op); + + assert(Cond.getOpcode() == MipsISD::FPCmp && + "Floating point operand expected."); + + SDValue True = DAG.getConstant(1, MVT::i32); + SDValue False = DAG.getConstant(0, MVT::i32); + + return CreateCMovFP(DAG, Cond, True, False, Op.getDebugLoc()); +} + SDValue MipsTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { // FIXME there isn't actually debug info here @@ -1841,13 +1875,13 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT, static const unsigned IntRegsSize=4, FloatRegsSize=2; - static const unsigned IntRegs[] = { + static const uint16_t IntRegs[] = { Mips::A0, Mips::A1, Mips::A2, Mips::A3 }; - static const unsigned F32Regs[] = { + static const uint16_t F32Regs[] = { Mips::F12, Mips::F14 }; - static const unsigned F64Regs[] = { + static const uint16_t F64Regs[] = { Mips::D6, Mips::D7 }; @@ -1926,10 +1960,10 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT, return false; // CC must always match } -static const unsigned Mips64IntRegs[8] = +static const uint16_t Mips64IntRegs[8] = {Mips::A0_64, Mips::A1_64, Mips::A2_64, Mips::A3_64, Mips::T0_64, Mips::T1_64, Mips::T2_64, Mips::T3_64}; -static const unsigned Mips64DPRegs[8] = +static const uint16_t Mips64DPRegs[8] = {Mips::D12_64, Mips::D13_64, Mips::D14_64, Mips::D15_64, Mips::D16_64, Mips::D17_64, Mips::D18_64, Mips::D19_64}; @@ -1996,7 +2030,7 @@ AnalyzeMips64CallOperands(CCState &CCInfo, static const unsigned O32IntRegsSize = 4; -static const unsigned O32IntRegs[] = { +static const uint16_t O32IntRegs[] = { Mips::A0, Mips::A1, Mips::A2, Mips::A3 }; @@ -2115,9 +2149,9 @@ PassByValArg64(SDValue& ByValChain, SDValue Chain, DebugLoc dl, if (!IsRegLoc) LocMemOffset = VA.getLocMemOffset(); else { - const unsigned *Reg = std::find(Mips64IntRegs, Mips64IntRegs + 8, + const uint16_t *Reg = std::find(Mips64IntRegs, Mips64IntRegs + 8, VA.getLocReg()); - const unsigned *RegEnd = Mips64IntRegs + 8; + const uint16_t *RegEnd = Mips64IntRegs + 8; // Copy double words to registers. for (; (Reg != RegEnd) && (ByValSize >= Offset + 8); ++Reg, Offset += 8) { @@ -2540,7 +2574,7 @@ CopyMips64ByValRegs(MachineFunction &MF, SDValue Chain, DebugLoc dl, MachineFrameInfo *MFI, bool IsRegLoc, SmallVectorImpl<SDValue> &InVals, MipsFunctionInfo *MipsFI, EVT PtrTy) { - const unsigned *Reg = Mips64IntRegs + 8; + const uint16_t *Reg = Mips64IntRegs + 8; int FOOffset; // Frame object offset from virtual frame pointer. if (IsRegLoc) { @@ -2709,7 +2743,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, if (isVarArg) { unsigned NumOfRegs = IsO32 ? 4 : 8; - const unsigned *ArgRegs = IsO32 ? O32IntRegs : Mips64IntRegs; + const uint16_t *ArgRegs = IsO32 ? O32IntRegs : Mips64IntRegs; unsigned Idx = CCInfo.getFirstUnallocated(ArgRegs, NumOfRegs); int FirstRegSlotOffset = IsO32 ? 0 : -64 ; // offset of $a0's slot. 
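The new LowerSETCC handles the floating-point comparisons that the old SETCC combine used to rewrite: it builds the FPCmp node and materializes the boolean with a conditional move between the constants 1 and 0. A scalar stand-in for the value the lowering produces; the real code works on SDNodes, not doubles:

    #include <cstdio>

    static int fpSetcc(double a, double b) {
      bool flag = (a < b);   // stands in for MipsISD::FPCmp setting the FP condition flag
      return flag ? 1 : 0;   // stands in for the CMovFP selecting between constants 1 and 0
    }

    int main() { std::printf("%d %d\n", fpSetcc(1.0, 2.0), fpSetcc(2.0, 1.0)); }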
const TargetRegisterClass *RC diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h index 621bbec..66f45cd 100644 --- a/lib/Target/Mips/MipsISelLowering.h +++ b/lib/Target/Mips/MipsISelLowering.h @@ -15,10 +15,10 @@ #ifndef MipsISELLOWERING_H #define MipsISELLOWERING_H -#include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/Target/TargetLowering.h" #include "Mips.h" #include "MipsSubtarget.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/Target/TargetLowering.h" namespace llvm { namespace MipsISD { @@ -128,6 +128,7 @@ namespace llvm { SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h index 10caf30..4be727d 100644 --- a/lib/Target/Mips/MipsInstrInfo.h +++ b/lib/Target/Mips/MipsInstrInfo.h @@ -15,9 +15,9 @@ #define MIPSINSTRUCTIONINFO_H #include "Mips.h" +#include "MipsRegisterInfo.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetInstrInfo.h" -#include "MipsRegisterInfo.h" #define GET_INSTRINFO_HEADER #include "MipsGenInstrInfo.inc" diff --git a/lib/Target/Mips/MipsMCInstLower.cpp b/lib/Target/Mips/MipsMCInstLower.cpp index be65298..0d51298 100644 --- a/lib/Target/Mips/MipsMCInstLower.cpp +++ b/lib/Target/Mips/MipsMCInstLower.cpp @@ -12,9 +12,9 @@ // //===----------------------------------------------------------------------===// +#include "MipsMCInstLower.h" #include "MipsAsmPrinter.h" #include "MipsInstrInfo.h" -#include "MipsMCInstLower.h" #include "MCTargetDesc/MipsBaseInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" diff --git a/lib/Target/Mips/MipsMCInstLower.h b/lib/Target/Mips/MipsMCInstLower.h index cbd5264..20bb338 100644 --- a/lib/Target/Mips/MipsMCInstLower.h +++ b/lib/Target/Mips/MipsMCInstLower.h @@ -14,11 +14,9 @@ #include "llvm/Support/Compiler.h" namespace llvm { - class MCAsmInfo; class MCContext; class MCInst; class MCOperand; - class MCSymbol; class MachineInstr; class MachineFunction; class Mangler; @@ -38,7 +36,7 @@ public: void LowerCPLOAD(const MachineInstr *MI, SmallVector<MCInst, 4>& MCInsts); void LowerCPRESTORE(const MachineInstr *MI, SmallVector<MCInst, 4>& MCInsts); void LowerUnalignedLoadStore(const MachineInstr *MI, - SmallVector<MCInst, 4>& MCInsts); + SmallVector<MCInst, 4>& MCInsts); void LowerSETGP01(const MachineInstr *MI, SmallVector<MCInst, 4>& MCInsts); private: MCOperand LowerSymbolOperand(const MachineOperand &MO, diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h index 57ff069..abb5404 100644 --- a/lib/Target/Mips/MipsMachineFunction.h +++ b/lib/Target/Mips/MipsMachineFunction.h @@ -14,10 +14,10 @@ #ifndef MIPS_MACHINE_FUNCTION_INFO_H #define MIPS_MACHINE_FUNCTION_INFO_H -#include <utility> #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include <utility> namespace llvm { diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp index e0ecba2..5cfda34 100644 --- a/lib/Target/Mips/MipsRegisterInfo.cpp +++ b/lib/Target/Mips/MipsRegisterInfo.cpp @@ -13,10 +13,10 
@@ #define DEBUG_TYPE "mips-reg-info" +#include "MipsRegisterInfo.h" #include "Mips.h" #include "MipsAnalyzeImmediate.h" #include "MipsSubtarget.h" -#include "MipsRegisterInfo.h" #include "MipsMachineFunction.h" #include "llvm/Constants.h" #include "llvm/Type.h" @@ -83,12 +83,12 @@ MipsRegisterInfo::getCallPreservedMask(CallingConv::ID) const BitVector MipsRegisterInfo:: getReservedRegs(const MachineFunction &MF) const { - static const unsigned ReservedCPURegs[] = { + static const uint16_t ReservedCPURegs[] = { Mips::ZERO, Mips::AT, Mips::K0, Mips::K1, Mips::SP, Mips::FP, Mips::RA }; - static const unsigned ReservedCPU64Regs[] = { + static const uint16_t ReservedCPU64Regs[] = { Mips::ZERO_64, Mips::AT_64, Mips::K0_64, Mips::K1_64, Mips::SP_64, Mips::FP_64, Mips::RA_64 }; diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp index 8806aaf..ad02231 100644 --- a/lib/Target/Mips/MipsTargetMachine.cpp +++ b/lib/Target/Mips/MipsTargetMachine.cpp @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#include "Mips.h" #include "MipsTargetMachine.h" +#include "Mips.h" #include "llvm/PassManager.h" #include "llvm/CodeGen/Passes.h" #include "llvm/Support/TargetRegistry.h" diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h index 19ae142..80c00e8 100644 --- a/lib/Target/Mips/MipsTargetMachine.h +++ b/lib/Target/Mips/MipsTargetMachine.h @@ -14,15 +14,15 @@ #ifndef MIPSTARGETMACHINE_H #define MIPSTARGETMACHINE_H -#include "MipsSubtarget.h" +#include "MipsFrameLowering.h" #include "MipsInstrInfo.h" #include "MipsISelLowering.h" -#include "MipsFrameLowering.h" +#include "MipsJITInfo.h" #include "MipsSelectionDAGInfo.h" +#include "MipsSubtarget.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetFrameLowering.h" -#include "MipsJITInfo.h" namespace llvm { class formatted_raw_ostream; diff --git a/lib/Target/PTX/MCTargetDesc/PTXBaseInfo.h b/lib/Target/PTX/MCTargetDesc/PTXBaseInfo.h index 77a298d..a3e0f32 100644 --- a/lib/Target/PTX/MCTargetDesc/PTXBaseInfo.h +++ b/lib/Target/PTX/MCTargetDesc/PTXBaseInfo.h @@ -17,9 +17,9 @@ #ifndef PTXBASEINFO_H #define PTXBASEINFO_H +#include "PTXMCTargetDesc.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "PTXMCTargetDesc.h" namespace llvm { namespace PTXStateSpace { diff --git a/lib/Target/PTX/PTX.h b/lib/Target/PTX/PTX.h index 7d46cce..ffb92cb 100644 --- a/lib/Target/PTX/PTX.h +++ b/lib/Target/PTX/PTX.h @@ -1,4 +1,3 @@ -//===-- PTX.h - Top-level interface for PTX representation ------*- C++ -*-===// // // The LLVM Compiler Infrastructure // diff --git a/lib/Target/PTX/PTXAsmPrinter.cpp b/lib/Target/PTX/PTXAsmPrinter.cpp index 58ac5f2..0b6ac7b 100644 --- a/lib/Target/PTX/PTXAsmPrinter.cpp +++ b/lib/Target/PTX/PTXAsmPrinter.cpp @@ -14,8 +14,8 @@ #define DEBUG_TYPE "ptx-asm-printer" -#include "PTX.h" #include "PTXAsmPrinter.h" +#include "PTX.h" #include "PTXMachineFunctionInfo.h" #include "PTXParamManager.h" #include "PTXRegisterInfo.h" diff --git a/lib/Target/PTX/PTXISelLowering.cpp b/lib/Target/PTX/PTXISelLowering.cpp index e5d4edc..db1c953 100644 --- a/lib/Target/PTX/PTXISelLowering.cpp +++ b/lib/Target/PTX/PTXISelLowering.cpp @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#include "PTX.h" #include "PTXISelLowering.h" +#include "PTX.h" #include "PTXMachineFunctionInfo.h" #include "PTXRegisterInfo.h" 
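The Mips reserved-register tables also become uint16_t; getReservedRegs folds them into a BitVector sized to the register file. A sketch of that idiom; the register numbers below are placeholders, not Mips encodings:

    #include "llvm/ADT/BitVector.h"
    #include <stdint.h>

    static llvm::BitVector reservedRegs(unsigned NumRegs) {
      static const uint16_t ReservedCPURegs[] = { 0, 1, 26, 27, 29, 30, 31 };
      llvm::BitVector Reserved(NumRegs);
      for (unsigned i = 0;
           i != sizeof(ReservedCPURegs) / sizeof(ReservedCPURegs[0]); ++i)
        Reserved.set(ReservedCPURegs[i]);
      return Reserved;
    }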
#include "PTXSubtarget.h" diff --git a/lib/Target/PTX/PTXISelLowering.h b/lib/Target/PTX/PTXISelLowering.h index fd20982..33220f4 100644 --- a/lib/Target/PTX/PTXISelLowering.h +++ b/lib/Target/PTX/PTXISelLowering.h @@ -18,8 +18,6 @@ #include "llvm/Target/TargetLowering.h" namespace llvm { -class PTXSubtarget; -class PTXTargetMachine; namespace PTXISD { enum NodeType { diff --git a/lib/Target/PTX/PTXInstrInfo.cpp b/lib/Target/PTX/PTXInstrInfo.cpp index 9d6cbf1..443cd54 100644 --- a/lib/Target/PTX/PTXInstrInfo.cpp +++ b/lib/Target/PTX/PTXInstrInfo.cpp @@ -13,8 +13,8 @@ #define DEBUG_TYPE "ptx-instrinfo" -#include "PTX.h" #include "PTXInstrInfo.h" +#include "PTX.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" diff --git a/lib/Target/PTX/PTXParamManager.cpp b/lib/Target/PTX/PTXParamManager.cpp index 74538e6..cc1cc71 100644 --- a/lib/Target/PTX/PTXParamManager.cpp +++ b/lib/Target/PTX/PTXParamManager.cpp @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#include "PTX.h" #include "PTXParamManager.h" +#include "PTX.h" #include "llvm/ADT/StringExtras.h" using namespace llvm; diff --git a/lib/Target/PTX/PTXParamManager.h b/lib/Target/PTX/PTXParamManager.h index 32342f7..92e7728 100644 --- a/lib/Target/PTX/PTXParamManager.h +++ b/lib/Target/PTX/PTXParamManager.h @@ -17,6 +17,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" +#include <string> namespace llvm { diff --git a/lib/Target/PTX/PTXRegisterInfo.cpp b/lib/Target/PTX/PTXRegisterInfo.cpp index 3f087cd..b6ffd38 100644 --- a/lib/Target/PTX/PTXRegisterInfo.cpp +++ b/lib/Target/PTX/PTXRegisterInfo.cpp @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#include "PTX.h" #include "PTXRegisterInfo.h" +#include "PTX.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" diff --git a/lib/Target/PTX/PTXTargetMachine.cpp b/lib/Target/PTX/PTXTargetMachine.cpp index 9305377..40835d0 100644 --- a/lib/Target/PTX/PTXTargetMachine.cpp +++ b/lib/Target/PTX/PTXTargetMachine.cpp @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#include "PTX.h" #include "PTXTargetMachine.h" +#include "PTX.h" #include "llvm/PassManager.h" #include "llvm/Analysis/Passes.h" #include "llvm/Analysis/Verifier.h" @@ -26,6 +26,7 @@ #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetData.h" @@ -37,8 +38,6 @@ #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/TargetRegistry.h" using namespace llvm; diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index 02dad45..9c6eefe 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -7,9 +7,9 @@ // //===----------------------------------------------------------------------===// -#include "llvm/MC/MCAsmBackend.h" #include "MCTargetDesc/PPCMCTargetDesc.h" #include "MCTargetDesc/PPCFixupKinds.h" +#include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCELFObjectWriter.h" 
#include "llvm/MC/MCMachObjectWriter.h" #include "llvm/MC/MCSectionMachO.h" diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h index 5dc1863..24a7178 100644 --- a/lib/Target/PowerPC/PPC.h +++ b/lib/Target/PowerPC/PPC.h @@ -25,14 +25,11 @@ namespace llvm { class PPCTargetMachine; class FunctionPass; - class formatted_raw_ostream; class JITCodeEmitter; - class Target; class MachineInstr; class AsmPrinter; class MCInst; - class TargetMachine; - + FunctionPass *createPPCBranchSelectionPass(); FunctionPass *createPPCISelDag(PPCTargetMachine &TM); FunctionPass *createPPCJITCodeEmitterPass(PPCTargetMachine &TM, diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp index 591ae02..4abb469 100644 --- a/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -20,6 +20,7 @@ #include "PPC.h" #include "PPCTargetMachine.h" #include "PPCSubtarget.h" +#include "InstPrinter/PPCInstPrinter.h" #include "MCTargetDesc/PPCPredicates.h" #include "llvm/Analysis/DebugInfo.h" #include "llvm/Constants.h" @@ -53,7 +54,6 @@ #include "llvm/Support/ELF.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/SmallString.h" -#include "InstPrinter/PPCInstPrinter.h" using namespace llvm; namespace { diff --git a/lib/Target/PowerPC/PPCCallingConv.td b/lib/Target/PowerPC/PPCCallingConv.td index 8efc9c1..9883c2e 100644 --- a/lib/Target/PowerPC/PPCCallingConv.td +++ b/lib/Target/PowerPC/PPCCallingConv.td @@ -130,3 +130,34 @@ def CC_PPC_SVR4_ByVal : CallingConv<[ CCCustom<"CC_PPC_SVR4_Custom_Dummy"> ]>; +def CSR_Darwin32 : CalleeSavedRegs<(add R13, R14, R15, R16, R17, R18, R19, R20, + R21, R22, R23, R24, R25, R26, R27, R28, + R29, R30, R31, F14, F15, F16, F17, F18, + F19, F20, F21, F22, F23, F24, F25, F26, + F27, F28, F29, F30, F31, CR2, CR3, CR4, + V20, V21, V22, V23, V24, V25, V26, V27, + V28, V29, V30, V31)>; + +def CSR_SVR432 : CalleeSavedRegs<(add R14, R15, R16, R17, R18, R19, R20, VRSAVE, + R21, R22, R23, R24, R25, R26, R27, R28, + R29, R30, R31, F14, F15, F16, F17, F18, + F19, F20, F21, F22, F23, F24, F25, F26, + F27, F28, F29, F30, F31, CR2, CR3, CR4, + V20, V21, V22, V23, V24, V25, V26, V27, + V28, V29, V30, V31)>; + +def CSR_Darwin64 : CalleeSavedRegs<(add X13, X14, X15, X16, X17, X18, X19, X20, + X21, X22, X23, X24, X25, X26, X27, X28, + X29, X30, X31, F14, F15, F16, F17, F18, + F19, F20, F21, F22, F23, F24, F25, F26, + F27, F28, F29, F30, F31, CR2, CR3, CR4, + V20, V21, V22, V23, V24, V25, V26, V27, + V28, V29, V30, V31)>; + +def CSR_SVR464 : CalleeSavedRegs<(add X14, X15, X16, X17, X18, X19, X20, VRSAVE, + X21, X22, X23, X24, X25, X26, X27, X28, + X29, X30, X31, F14, F15, F16, F17, F18, + F19, F20, F21, F22, F23, F24, F25, F26, + F27, F28, F29, F30, F31, CR2, CR3, CR4, + V20, V21, V22, V23, V24, V25, V26, V27, + V28, V29, V30, V31)>; diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index 6d612f7..b77a80b 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -38,7 +38,7 @@ using namespace llvm; /// VRRegNo - Map from a numbered VR register to its enum value. 
/// -static const unsigned short VRRegNo[] = { +static const uint16_t VRRegNo[] = { PPC::V0 , PPC::V1 , PPC::V2 , PPC::V3 , PPC::V4 , PPC::V5 , PPC::V6 , PPC::V7 , PPC::V8 , PPC::V9 , PPC::V10, PPC::V11, PPC::V12, PPC::V13, PPC::V14, PPC::V15, PPC::V16, PPC::V17, PPC::V18, PPC::V19, PPC::V20, PPC::V21, PPC::V22, PPC::V23, diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.h b/lib/Target/PowerPC/PPCHazardRecognizers.h index 95d0d64..d80a385 100644 --- a/lib/Target/PowerPC/PPCHazardRecognizers.h +++ b/lib/Target/PowerPC/PPCHazardRecognizers.h @@ -14,10 +14,10 @@ #ifndef PPCHAZRECS_H #define PPCHAZRECS_H +#include "PPCInstrInfo.h" #include "llvm/CodeGen/ScheduleHazardRecognizer.h" #include "llvm/CodeGen/ScoreboardHazardRecognizer.h" #include "llvm/CodeGen/SelectionDAGNodes.h" -#include "PPCInstrInfo.h" namespace llvm { diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index bfed7ba..85b5bc1 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -16,6 +16,11 @@ #include "PPCPerfectShuffle.h" #include "PPCTargetMachine.h" #include "MCTargetDesc/PPCPredicates.h" +#include "llvm/CallingConv.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/Intrinsics.h" #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -24,16 +29,11 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" -#include "llvm/CallingConv.h" -#include "llvm/Constants.h" -#include "llvm/Function.h" -#include "llvm/Intrinsics.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Target/TargetOptions.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/DerivedTypes.h" +#include "llvm/Target/TargetOptions.h" using namespace llvm; static bool CC_PPC_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT, @@ -1547,7 +1547,7 @@ static bool CC_PPC_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State) { - static const unsigned ArgRegs[] = { + static const uint16_t ArgRegs[] = { PPC::R3, PPC::R4, PPC::R5, PPC::R6, PPC::R7, PPC::R8, PPC::R9, PPC::R10, }; @@ -1574,7 +1574,7 @@ static bool CC_PPC_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State) { - static const unsigned ArgRegs[] = { + static const uint16_t ArgRegs[] = { PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, PPC::F8 }; @@ -1598,8 +1598,8 @@ static bool CC_PPC_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT, /// GetFPR - Get the set of FP registers that should be allocated for arguments, /// on Darwin. -static const unsigned *GetFPR() { - static const unsigned FPR[] = { +static const uint16_t *GetFPR() { + static const uint16_t FPR[] = { PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, PPC::F8, PPC::F9, PPC::F10, PPC::F11, PPC::F12, PPC::F13 }; @@ -1780,13 +1780,13 @@ PPCTargetLowering::LowerFormalArguments_SVR4( // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start. 
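Aside: this hunk and several later ones (Mips::getReservedRegs, the PPC/Sparc/X86 argument-register tables, PPCFrameLowering's VRRegNo) narrow static register-number arrays from unsigned to uint16_t. A minimal sketch of the idea, with made-up register numbers and assuming the usual 32-bit unsigned; it is not code from the patch:

#include <cstdint>

// Target register enum values are small, so a 16-bit element type holds them
// exactly; on a typical host the narrow table occupies half the static data.
static const unsigned WideArgRegs[]   = { 3, 4, 5, 6, 7, 8, 9, 10 };
static const uint16_t NarrowArgRegs[] = { 3, 4, 5, 6, 7, 8, 9, 10 };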
if (isVarArg) { - static const unsigned GPArgRegs[] = { + static const uint16_t GPArgRegs[] = { PPC::R3, PPC::R4, PPC::R5, PPC::R6, PPC::R7, PPC::R8, PPC::R9, PPC::R10, }; const unsigned NumGPArgRegs = array_lengthof(GPArgRegs); - static const unsigned FPArgRegs[] = { + static const uint16_t FPArgRegs[] = { PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, PPC::F8 }; @@ -1879,18 +1879,18 @@ PPCTargetLowering::LowerFormalArguments_Darwin( // Area that is at least reserved in caller of this function. unsigned MinReservedArea = ArgOffset; - static const unsigned GPR_32[] = { // 32-bit registers. + static const uint16_t GPR_32[] = { // 32-bit registers. PPC::R3, PPC::R4, PPC::R5, PPC::R6, PPC::R7, PPC::R8, PPC::R9, PPC::R10, }; - static const unsigned GPR_64[] = { // 64-bit registers. + static const uint16_t GPR_64[] = { // 64-bit registers. PPC::X3, PPC::X4, PPC::X5, PPC::X6, PPC::X7, PPC::X8, PPC::X9, PPC::X10, }; - static const unsigned *FPR = GetFPR(); + static const uint16_t *FPR = GetFPR(); - static const unsigned VR[] = { + static const uint16_t VR[] = { PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 }; @@ -1901,7 +1901,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin( unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; - const unsigned *GPR = isPPC64 ? GPR_64 : GPR_32; + const uint16_t *GPR = isPPC64 ? GPR_64 : GPR_32; // In 32-bit non-varargs functions, the stack space for vectors is after the // stack space for non-vectors. We do not use this space unless we have @@ -2769,6 +2769,12 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl, (CallConv == CallingConv::Fast && getTargetMachine().Options.GuaranteedTailCallOpt) ? NumBytes : 0; + // Add a register mask operand representing the call-preserved registers. + const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); + assert(Mask && "Missing call preserved mask for calling convention"); + Ops.push_back(DAG.getRegisterMask(Mask)); + if (InFlag.getNode()) Ops.push_back(InFlag); @@ -3141,17 +3147,17 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, unsigned ArgOffset = PPCFrameLowering::getLinkageSize(isPPC64, true); unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; - static const unsigned GPR_32[] = { // 32-bit registers. + static const uint16_t GPR_32[] = { // 32-bit registers. PPC::R3, PPC::R4, PPC::R5, PPC::R6, PPC::R7, PPC::R8, PPC::R9, PPC::R10, }; - static const unsigned GPR_64[] = { // 64-bit registers. + static const uint16_t GPR_64[] = { // 64-bit registers. PPC::X3, PPC::X4, PPC::X5, PPC::X6, PPC::X7, PPC::X8, PPC::X9, PPC::X10, }; - static const unsigned *FPR = GetFPR(); + static const uint16_t *FPR = GetFPR(); - static const unsigned VR[] = { + static const uint16_t VR[] = { PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 }; @@ -3159,7 +3165,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee, const unsigned NumFPRs = 13; const unsigned NumVRs = array_lengthof(VR); - const unsigned *GPR = isPPC64 ? GPR_64 : GPR_32; + const uint16_t *GPR = isPPC64 ? 
GPR_64 : GPR_32; SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; SmallVector<TailCallArgumentInfo, 8> TailCallArguments; diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 3534e9c..2e046c4 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -15,10 +15,10 @@ #ifndef LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H #define LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H -#include "llvm/Target/TargetLowering.h" -#include "llvm/CodeGen/SelectionDAG.h" #include "PPC.h" #include "PPCSubtarget.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/CodeGen/SelectionDAG.h" namespace llvm { namespace PPCISD { diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index 02bffed..78f3596 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -64,13 +64,7 @@ let Defs = [LR8] in PPC970_Unit_BRU; // Darwin ABI Calls. -let isCall = 1, PPC970_Unit = 7, - // All calls clobber the PPC64 non-callee saved registers. - Defs = [X0,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12, - F0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13, - V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19, - LR8,CTR8, - CR0,CR1,CR5,CR6,CR7,CARRY] in { +let isCall = 1, PPC970_Unit = 7, Defs = [LR8] in { // Convenient aliases for call instructions let Uses = [RM] in { def BL8_Darwin : IForm<18, 0, 1, @@ -90,13 +84,7 @@ let isCall = 1, PPC970_Unit = 7, // ELF 64 ABI Calls = Darwin ABI Calls // Used to define BL8_ELF and BLA8_ELF -let isCall = 1, PPC970_Unit = 7, - // All calls clobber the PPC64 non-callee saved registers. - Defs = [X0,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12, - F0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13, - V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19, - LR8,CTR8, - CR0,CR1,CR5,CR6,CR7,CARRY] in { +let isCall = 1, PPC970_Unit = 7, Defs = [LR8] in { // Convenient aliases for call instructions let Uses = [RM] in { def BL8_ELF : IForm<18, 0, 1, diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h index e5f171d..7d49aa1 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.h +++ b/lib/Target/PowerPC/PPCInstrInfo.h @@ -15,8 +15,8 @@ #define POWERPC_INSTRUCTIONINFO_H #include "PPC.h" -#include "llvm/Target/TargetInstrInfo.h" #include "PPCRegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" #define GET_INSTRINFO_HEADER #include "PPCGenInstrInfo.inc" diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index e234012..939b71a 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -438,13 +438,7 @@ let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in { } // Darwin ABI Calls. -let isCall = 1, PPC970_Unit = 7, - // All calls clobber the non-callee saved registers... - Defs = [R0,R2,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12, - F0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13, - V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19, - LR,CTR, - CR0,CR1,CR5,CR6,CR7,CARRY] in { +let isCall = 1, PPC970_Unit = 7, Defs = [LR] in { // Convenient aliases for call instructions let Uses = [RM] in { def BL_Darwin : IForm<18, 0, 1, @@ -463,13 +457,7 @@ let isCall = 1, PPC970_Unit = 7, } // SVR4 ABI Calls. -let isCall = 1, PPC970_Unit = 7, - // All calls clobber the non-callee saved registers... 
- Defs = [R0,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12, - F0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13, - V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19, - LR,CTR, - CR0,CR1,CR5,CR6,CR7,CARRY] in { +let isCall = 1, PPC970_Unit = 7, Defs = [LR] in { // Convenient aliases for call instructions let Uses = [RM] in { def BL_SVR4 : IForm<18, 0, 1, diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index 306cc1f..2976f01 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -13,10 +13,10 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "reginfo" +#include "PPCRegisterInfo.h" #include "PPC.h" #include "PPCInstrBuilder.h" #include "PPCMachineFunctionInfo.h" -#include "PPCRegisterInfo.h" #include "PPCFrameLowering.h" #include "PPCSubtarget.h" #include "llvm/CallingConv.h" @@ -100,104 +100,20 @@ PPCRegisterInfo::getPointerRegClass(unsigned Kind) const { const uint16_t* PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { - // 32-bit Darwin calling convention. - static const uint16_t Darwin32_CalleeSavedRegs[] = { - PPC::R13, PPC::R14, PPC::R15, - PPC::R16, PPC::R17, PPC::R18, PPC::R19, - PPC::R20, PPC::R21, PPC::R22, PPC::R23, - PPC::R24, PPC::R25, PPC::R26, PPC::R27, - PPC::R28, PPC::R29, PPC::R30, PPC::R31, - - PPC::F14, PPC::F15, PPC::F16, PPC::F17, - PPC::F18, PPC::F19, PPC::F20, PPC::F21, - PPC::F22, PPC::F23, PPC::F24, PPC::F25, - PPC::F26, PPC::F27, PPC::F28, PPC::F29, - PPC::F30, PPC::F31, - - PPC::CR2, PPC::CR3, PPC::CR4, - PPC::V20, PPC::V21, PPC::V22, PPC::V23, - PPC::V24, PPC::V25, PPC::V26, PPC::V27, - PPC::V28, PPC::V29, PPC::V30, PPC::V31, - - PPC::LR, 0 - }; - - // 32-bit SVR4 calling convention. - static const uint16_t SVR4_CalleeSavedRegs[] = { - PPC::R14, PPC::R15, - PPC::R16, PPC::R17, PPC::R18, PPC::R19, - PPC::R20, PPC::R21, PPC::R22, PPC::R23, - PPC::R24, PPC::R25, PPC::R26, PPC::R27, - PPC::R28, PPC::R29, PPC::R30, PPC::R31, - - PPC::F14, PPC::F15, PPC::F16, PPC::F17, - PPC::F18, PPC::F19, PPC::F20, PPC::F21, - PPC::F22, PPC::F23, PPC::F24, PPC::F25, - PPC::F26, PPC::F27, PPC::F28, PPC::F29, - PPC::F30, PPC::F31, - - PPC::CR2, PPC::CR3, PPC::CR4, - - PPC::VRSAVE, - - PPC::V20, PPC::V21, PPC::V22, PPC::V23, - PPC::V24, PPC::V25, PPC::V26, PPC::V27, - PPC::V28, PPC::V29, PPC::V30, PPC::V31, - - 0 - }; - // 64-bit Darwin calling convention. - static const uint16_t Darwin64_CalleeSavedRegs[] = { - PPC::X14, PPC::X15, - PPC::X16, PPC::X17, PPC::X18, PPC::X19, - PPC::X20, PPC::X21, PPC::X22, PPC::X23, - PPC::X24, PPC::X25, PPC::X26, PPC::X27, - PPC::X28, PPC::X29, PPC::X30, PPC::X31, - - PPC::F14, PPC::F15, PPC::F16, PPC::F17, - PPC::F18, PPC::F19, PPC::F20, PPC::F21, - PPC::F22, PPC::F23, PPC::F24, PPC::F25, - PPC::F26, PPC::F27, PPC::F28, PPC::F29, - PPC::F30, PPC::F31, - - PPC::CR2, PPC::CR3, PPC::CR4, - PPC::V20, PPC::V21, PPC::V22, PPC::V23, - PPC::V24, PPC::V25, PPC::V26, PPC::V27, - PPC::V28, PPC::V29, PPC::V30, PPC::V31, - - PPC::LR8, 0 - }; - - // 64-bit SVR4 calling convention. 
- static const uint16_t SVR4_64_CalleeSavedRegs[] = { - PPC::X14, PPC::X15, - PPC::X16, PPC::X17, PPC::X18, PPC::X19, - PPC::X20, PPC::X21, PPC::X22, PPC::X23, - PPC::X24, PPC::X25, PPC::X26, PPC::X27, - PPC::X28, PPC::X29, PPC::X30, PPC::X31, - - PPC::F14, PPC::F15, PPC::F16, PPC::F17, - PPC::F18, PPC::F19, PPC::F20, PPC::F21, - PPC::F22, PPC::F23, PPC::F24, PPC::F25, - PPC::F26, PPC::F27, PPC::F28, PPC::F29, - PPC::F30, PPC::F31, - - PPC::CR2, PPC::CR3, PPC::CR4, - - PPC::VRSAVE, + if (Subtarget.isDarwinABI()) + return Subtarget.isPPC64() ? CSR_Darwin64_SaveList : + CSR_Darwin32_SaveList; - PPC::V20, PPC::V21, PPC::V22, PPC::V23, - PPC::V24, PPC::V25, PPC::V26, PPC::V27, - PPC::V28, PPC::V29, PPC::V30, PPC::V31, + return Subtarget.isPPC64() ? CSR_SVR464_SaveList : CSR_SVR432_SaveList; +} - 0 - }; - +const unsigned* +PPCRegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { if (Subtarget.isDarwinABI()) - return Subtarget.isPPC64() ? Darwin64_CalleeSavedRegs : - Darwin32_CalleeSavedRegs; + return Subtarget.isPPC64() ? CSR_Darwin64_RegMask : + CSR_Darwin32_RegMask; - return Subtarget.isPPC64() ? SVR4_64_CalleeSavedRegs : SVR4_CalleeSavedRegs; + return Subtarget.isPPC64() ? CSR_SVR464_RegMask : CSR_SVR432_RegMask; } BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const { diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h index 6ce90bc..b1e6a72 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/lib/Target/PowerPC/PPCRegisterInfo.h @@ -42,6 +42,7 @@ public: /// Code Generation virtual methods... const uint16_t *getCalleeSavedRegs(const MachineFunction* MF = 0) const; + const unsigned *getCallPreservedMask(CallingConv::ID CC) const; BitVector getReservedRegs(const MachineFunction &MF) const; diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index da20274..ba9c779 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#include "PPC.h" #include "PPCTargetMachine.h" +#include "PPC.h" #include "llvm/PassManager.h" #include "llvm/MC/MCStreamer.h" #include "llvm/CodeGen/Passes.h" diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h index 6dd11c9..7da2b0c 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.h +++ b/lib/Target/PowerPC/PPCTargetMachine.h @@ -24,8 +24,6 @@ #include "llvm/Target/TargetData.h" namespace llvm { -class PassManager; -class GlobalValue; /// PPCTargetMachine - Common code between 32-bit and 64-bit PowerPC targets. /// diff --git a/lib/Target/Sparc/FPMover.cpp b/lib/Target/Sparc/FPMover.cpp index 1423b1e..9a729bd 100644 --- a/lib/Target/Sparc/FPMover.cpp +++ b/lib/Target/Sparc/FPMover.cpp @@ -59,19 +59,19 @@ FunctionPass *llvm::createSparcFPMoverPass(TargetMachine &tm) { /// registers that correspond to it. 
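Aside: the PowerPC hunks above replace both the hand-written callee-saved arrays in PPCRegisterInfo.cpp and the long per-instruction Defs clobber lists in the .td files with the CalleeSavedRegs sets added to PPCCallingConv.td; the generated CSR_*_SaveList arrays and CSR_*_RegMask masks are returned from getCalleeSavedRegs/getCallPreservedMask, and FinishCall attaches the mask to the call node via DAG.getRegisterMask. A rough sketch of how such a mask operand could be interpreted, assuming the one-bit-per-register packing where a set bit means the register is preserved across the call (isClobberedByCall is an illustrative name, not an LLVM API):

#include <cstdint>

// Sketch only: test a physical register against a call-preserved mask of the
// kind returned by getCallPreservedMask() in this patch.
static bool isClobberedByCall(const uint32_t *Mask, unsigned PhysReg) {
  // One bit per physical register, packed into 32-bit words; assumed
  // convention: set bit == preserved by the callee.
  bool Preserved = (Mask[PhysReg / 32] & (1u << (PhysReg % 32))) != 0;
  return !Preserved;
}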
static void getDoubleRegPair(unsigned DoubleReg, unsigned &EvenReg, unsigned &OddReg) { - static const unsigned EvenHalvesOfPairs[] = { + static const uint16_t EvenHalvesOfPairs[] = { SP::F0, SP::F2, SP::F4, SP::F6, SP::F8, SP::F10, SP::F12, SP::F14, SP::F16, SP::F18, SP::F20, SP::F22, SP::F24, SP::F26, SP::F28, SP::F30 }; - static const unsigned OddHalvesOfPairs[] = { + static const uint16_t OddHalvesOfPairs[] = { SP::F1, SP::F3, SP::F5, SP::F7, SP::F9, SP::F11, SP::F13, SP::F15, SP::F17, SP::F19, SP::F21, SP::F23, SP::F25, SP::F27, SP::F29, SP::F31 }; - static const unsigned DoubleRegsInOrder[] = { + static const uint16_t DoubleRegsInOrder[] = { SP::D0, SP::D1, SP::D2, SP::D3, SP::D4, SP::D5, SP::D6, SP::D7, SP::D8, SP::D9, SP::D10, SP::D11, SP::D12, SP::D13, SP::D14, SP::D15 }; - for (unsigned i = 0; i < sizeof(DoubleRegsInOrder)/sizeof(unsigned); ++i) + for (unsigned i = 0; i < array_lengthof(DoubleRegsInOrder); ++i) if (DoubleRegsInOrder[i] == DoubleReg) { EvenReg = EvenHalvesOfPairs[i]; OddReg = OddHalvesOfPairs[i]; diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp index a6b63fb..ee12633 100644 --- a/lib/Target/Sparc/SparcISelLowering.cpp +++ b/lib/Target/Sparc/SparcISelLowering.cpp @@ -50,7 +50,7 @@ static bool CC_Sparc_Assign_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State) { - static const unsigned RegList[] = { + static const uint16_t RegList[] = { SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5 }; //Try to get first reg @@ -301,11 +301,11 @@ SparcTargetLowering::LowerFormalArguments(SDValue Chain, // Store remaining ArgRegs to the stack if this is a varargs function. if (isVarArg) { - static const unsigned ArgRegs[] = { + static const uint16_t ArgRegs[] = { SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5 }; unsigned NumAllocated = CCInfo.getFirstUnallocated(ArgRegs, 6); - const unsigned *CurArgReg = ArgRegs+NumAllocated, *ArgRegEnd = ArgRegs+6; + const uint16_t *CurArgReg = ArgRegs+NumAllocated, *ArgRegEnd = ArgRegs+6; unsigned ArgOffset = CCInfo.getNextStackOffset(); if (NumAllocated == 6) ArgOffset += StackOffset; diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h index 4a7c479..f483c96 100644 --- a/lib/Target/Sparc/SparcISelLowering.h +++ b/lib/Target/Sparc/SparcISelLowering.h @@ -15,8 +15,8 @@ #ifndef SPARC_ISELLOWERING_H #define SPARC_ISELLOWERING_H -#include "llvm/Target/TargetLowering.h" #include "Sparc.h" +#include "llvm/Target/TargetLowering.h" namespace llvm { namespace SPISD { diff --git a/lib/Target/Sparc/SparcInstrInfo.h b/lib/Target/Sparc/SparcInstrInfo.h index 4932531..204f698 100644 --- a/lib/Target/Sparc/SparcInstrInfo.h +++ b/lib/Target/Sparc/SparcInstrInfo.h @@ -14,8 +14,8 @@ #ifndef SPARCINSTRUCTIONINFO_H #define SPARCINSTRUCTIONINFO_H -#include "llvm/Target/TargetInstrInfo.h" #include "SparcRegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" #define GET_INSTRINFO_HEADER #include "SparcGenInstrInfo.inc" diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp index c392fcc..6357468 100644 --- a/lib/Target/Sparc/SparcRegisterInfo.cpp +++ b/lib/Target/Sparc/SparcRegisterInfo.cpp @@ -11,15 +11,15 @@ // //===----------------------------------------------------------------------===// -#include "Sparc.h" #include "SparcRegisterInfo.h" +#include "Sparc.h" #include "SparcSubtarget.h" +#include "llvm/Type.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include 
"llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Type.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp index 80a3be6..6f31356 100644 --- a/lib/Target/Sparc/SparcTargetMachine.cpp +++ b/lib/Target/Sparc/SparcTargetMachine.cpp @@ -10,8 +10,8 @@ // //===----------------------------------------------------------------------===// -#include "Sparc.h" #include "SparcTargetMachine.h" +#include "Sparc.h" #include "llvm/PassManager.h" #include "llvm/CodeGen/Passes.h" #include "llvm/Support/TargetRegistry.h" diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index d91830f..9e88472 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -67,11 +67,11 @@ private: MCStreamer &Out); /// isSrcOp - Returns true if operand is either (%rsi) or %ds:%(rsi) - /// in 64bit mode or (%edi) or %es:(%edi) in 32bit mode. + /// in 64bit mode or (%esi) or %es:(%esi) in 32bit mode. bool isSrcOp(X86Operand &Op); - /// isDstOp - Returns true if operand is either %es:(%rdi) in 64bit mode - /// or %es:(%edi) in 32bit mode. + /// isDstOp - Returns true if operand is either (%rdi) or %es:(%rdi) + /// in 64bit mode or (%edi) or %es:(%edi) in 32bit mode. bool isDstOp(X86Operand &Op); bool is64BitMode() const { @@ -468,7 +468,8 @@ bool X86AsmParser::isSrcOp(X86Operand &Op) { bool X86AsmParser::isDstOp(X86Operand &Op) { unsigned basereg = is64BitMode() ? X86::RDI : X86::EDI; - return Op.isMem() && Op.Mem.SegReg == X86::ES && + return Op.isMem() && + (Op.Mem.SegReg == 0 || Op.Mem.SegReg == X86::ES) && isa<MCConstantExpr>(Op.Mem.Disp) && cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 && Op.Mem.BaseReg == basereg && Op.Mem.IndexReg == 0; @@ -838,6 +839,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { // If we reached here, then we just ate the ( of the memory operand. Process // the rest of the memory operand. unsigned BaseReg = 0, IndexReg = 0, Scale = 1; + SMLoc IndexLoc; if (getLexer().is(AsmToken::Percent)) { SMLoc StartLoc, EndLoc; @@ -851,6 +853,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { if (getLexer().is(AsmToken::Comma)) { Parser.Lex(); // Eat the comma. + IndexLoc = Parser.getTok().getLoc(); // Following the comma we should have either an index register, or a scale // value. We don't support the later form, but we want to parse it @@ -876,8 +879,10 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { SMLoc Loc = Parser.getTok().getLoc(); int64_t ScaleVal; - if (getParser().ParseAbsoluteExpression(ScaleVal)) + if (getParser().ParseAbsoluteExpression(ScaleVal)){ + Error(Loc, "expected scale expression"); return 0; + } // Validate the scale amount. if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 && ScaleVal != 8){ @@ -910,6 +915,23 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { SMLoc MemEnd = Parser.getTok().getLoc(); Parser.Lex(); // Eat the ')'. + // If we have both a base register and an index register make sure they are + // both 64-bit or 32-bit registers. 
+ if (BaseReg != 0 && IndexReg != 0) { + if (X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) && + !X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg) && + IndexReg != X86::RIZ) { + Error(IndexLoc, "index register is 32-bit, but base register is 64-bit"); + return 0; + } + if (X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg) && + !X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg) && + IndexReg != X86::EIZ){ + Error(IndexLoc, "index register is 64-bit, but base register is 32-bit"); + return 0; + } + } + return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, MemStart, MemEnd); } diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c index b0e66f0..fbd81d2 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c @@ -312,6 +312,15 @@ static int readPrefixes(struct InternalInstruction* insn) { if (consumeByte(insn, &byte)) return -1; + + /* + * If the first byte is a LOCK prefix break and let it be disassembled + * as a lock "instruction", by creating an <MCInst #xxxx LOCK_PREFIX>. + * FIXME there is currently no way to get the disassembler to print the + * lock prefix if it is not the first byte. + */ + if (insn->readerCursor - 1 == insn->startLocation && byte == 0xf0) + break; switch (byte) { case 0xf0: /* LOCK */ diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp index 30a847f..f532019 100644 --- a/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -29,7 +29,7 @@ using namespace llvm; void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, const char *(*getRegName)(unsigned)) { // If this is a shuffle operation, the switch should fill in this state. - SmallVector<unsigned, 8> ShuffleMask; + SmallVector<int, 8> ShuffleMask; const char *DestName = 0, *Src1Name = 0, *Src2Name = 0; switch (MI->getOpcode()) { @@ -500,7 +500,7 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, if (Src1Name == Src2Name) { for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) { if ((int)ShuffleMask[i] >= 0 && // Not sentinel. - ShuffleMask[i] >= e) // From second mask. + ShuffleMask[i] >= (int)e) // From second mask. ShuffleMask[i] -= e; } } @@ -518,13 +518,13 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, // Otherwise, it must come from src1 or src2. Print the span of elements // that comes from this src. - bool isSrc1 = ShuffleMask[i] < ShuffleMask.size(); + bool isSrc1 = ShuffleMask[i] < (int)ShuffleMask.size(); const char *SrcName = isSrc1 ? Src1Name : Src2Name; OS << (SrcName ? 
SrcName : "mem") << '['; bool IsFirst = true; while (i != e && (int)ShuffleMask[i] >= 0 && - (ShuffleMask[i] < ShuffleMask.size()) == isSrc1) { + (ShuffleMask[i] < (int)ShuffleMask.size()) == isSrc1) { if (!IsFirst) OS << ','; else diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 9ccbf1c..3f770f7 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -7,10 +7,10 @@ // //===----------------------------------------------------------------------===// -#include "llvm/MC/MCAsmBackend.h" #include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86FixupKinds.h" #include "llvm/ADT/Twine.h" +#include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCExpr.h" diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 37727b6..80990e5 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -46,6 +46,11 @@ public: return (STI.getFeatureBits() & X86::Mode64Bit) != 0; } + bool is32BitMode() const { + // FIXME: Can tablegen auto-generate this? + return (STI.getFeatureBits() & X86::Mode64Bit) == 0; + } + static unsigned GetX86RegNum(const MCOperand &MO) { return X86_MC::getX86RegNum(MO.getReg()); } @@ -154,9 +159,8 @@ static MCFixupKind getImmFixupKind(uint64_t TSFlags) { return MCFixup::getKindForSize(Size, isPCRel); } -/// Is32BitMemOperand - Return true if the specified instruction with a memory -/// operand should emit the 0x67 prefix byte in 64-bit mode due to a 32-bit -/// memory operand. Op specifies the operand # of the memoperand. +/// Is32BitMemOperand - Return true if the specified instruction has +/// a 32-bit memory operand. Op specifies the operand # of the memoperand. static bool Is32BitMemOperand(const MCInst &MI, unsigned Op) { const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg); const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); @@ -169,6 +173,36 @@ static bool Is32BitMemOperand(const MCInst &MI, unsigned Op) { return false; } +/// Is64BitMemOperand - Return true if the specified instruction has +/// a 64-bit memory operand. Op specifies the operand # of the memoperand. +#ifndef NDEBUG +static bool Is64BitMemOperand(const MCInst &MI, unsigned Op) { + const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg); + const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); + + if ((BaseReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg.getReg())) || + (IndexReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg.getReg()))) + return true; + return false; +} +#endif + +/// Is16BitMemOperand - Return true if the specified instruction has +/// a 16-bit memory operand. Op specifies the operand # of the memoperand. 
+static bool Is16BitMemOperand(const MCInst &MI, unsigned Op) { + const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg); + const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); + + if ((BaseReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) || + (IndexReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg()))) + return true; + return false; +} + /// StartsWithGlobalOffsetTable - Check if this expression starts with /// _GLOBAL_OFFSET_TABLE_ and if it is of the form /// _GLOBAL_OFFSET_TABLE_-symbol. This is needed to support PIC on ELF @@ -817,8 +851,22 @@ void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, EmitByte(0xF3, CurByte, OS); // Emit the address size opcode prefix as needed. - if ((TSFlags & X86II::AdSize) || - (MemOperand != -1 && is64BitMode() && Is32BitMemOperand(MI, MemOperand))) + bool need_address_override; + if (TSFlags & X86II::AdSize) { + need_address_override = true; + } else if (MemOperand == -1) { + need_address_override = false; + } else if (is64BitMode()) { + assert(!Is16BitMemOperand(MI, MemOperand)); + need_address_override = Is32BitMemOperand(MI, MemOperand); + } else if (is32BitMode()) { + assert(!Is64BitMemOperand(MI, MemOperand)); + need_address_override = Is16BitMemOperand(MI, MemOperand); + } else { + need_address_override = false; + } + + if (need_address_override) EmitByte(0x67, CurByte, OS); // Emit the operand size opcode prefix as needed. diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt index a581993..624e56f 100644 --- a/lib/Target/X86/README-SSE.txt +++ b/lib/Target/X86/README-SSE.txt @@ -922,3 +922,22 @@ _test2: ## @test2 The insertps's of $0 are pointless complex copies. //===---------------------------------------------------------------------===// + +[UNSAFE FP] + +void foo(double, double, double); +void norm(double x, double y, double z) { + double scale = __builtin_sqrt(x*x + y*y + z*z); + foo(x/scale, y/scale, z/scale); +} + +We currently generate an sqrtsd and 3 divsd instructions. This is bad, fp div is +slow and not pipelined. In -ffast-math mode we could compute "1.0/scale" first +and emit 3 mulsd in place of the divs. This can be done as a target-independent +transform. + +If we're dealing with floats instead of doubles we could even replace the sqrtss +and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the +cost of reduced accuracy. + +//===---------------------------------------------------------------------===// diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp index f4b85ae..32c722a 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -20,7 +20,7 @@ namespace llvm { -void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<unsigned> &ShuffleMask) { +void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { // Defaults the copying the dest value. 
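Aside: the README-SSE.txt note added above describes the missing fast-math rewrite only in prose. A hand-applied version of the transform it asks for, kept as a sketch (norm_fast is an illustrative name; the compiler does not emit this today):

void foo(double, double, double);

/* One divide computes the reciprocal; the three divisions become
 * multiplications.  Legal only under -ffast-math, since it perturbs
 * rounding; for floats the divide+sqrt pair could further become rsqrtss. */
void norm_fast(double x, double y, double z) {
  double scale = __builtin_sqrt(x*x + y*y + z*z);
  double inv   = 1.0 / scale;
  foo(x*inv, y*inv, z*inv);
}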
ShuffleMask.push_back(0); ShuffleMask.push_back(1); @@ -44,8 +44,7 @@ void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<unsigned> &ShuffleMask) { } // <3,1> or <6,7,2,3> -void DecodeMOVHLPSMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask) { +void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) { for (unsigned i = NElts/2; i != NElts; ++i) ShuffleMask.push_back(NElts+i); @@ -54,8 +53,7 @@ void DecodeMOVHLPSMask(unsigned NElts, } // <0,2> or <0,1,4,5> -void DecodeMOVLHPSMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask) { +void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) { for (unsigned i = 0; i != NElts/2; ++i) ShuffleMask.push_back(i); @@ -66,8 +64,7 @@ void DecodeMOVLHPSMask(unsigned NElts, /// DecodePSHUFMask - This decodes the shuffle masks for pshufd, and vpermilp*. /// VT indicates the type of the vector allowing it to handle different /// datatypes and vector widths. -void DecodePSHUFMask(EVT VT, unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask) { +void DecodePSHUFMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { unsigned NumElts = VT.getVectorNumElements(); unsigned NumLanes = VT.getSizeInBits() / 128; @@ -83,8 +80,7 @@ void DecodePSHUFMask(EVT VT, unsigned Imm, } } -void DecodePSHUFHWMask(unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask) { +void DecodePSHUFHWMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { ShuffleMask.push_back(0); ShuffleMask.push_back(1); ShuffleMask.push_back(2); @@ -95,8 +91,7 @@ void DecodePSHUFHWMask(unsigned Imm, } } -void DecodePSHUFLWMask(unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask) { +void DecodePSHUFLWMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { for (unsigned i = 0; i != 4; ++i) { ShuffleMask.push_back((Imm & 3)); Imm >>= 2; @@ -110,8 +105,7 @@ void DecodePSHUFLWMask(unsigned Imm, /// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates /// the type of the vector allowing it to handle different datatypes and vector /// widths. -void DecodeSHUFPMask(EVT VT, unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask) { +void DecodeSHUFPMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { unsigned NumElts = VT.getVectorNumElements(); unsigned NumLanes = VT.getSizeInBits() / 128; @@ -136,7 +130,7 @@ void DecodeSHUFPMask(EVT VT, unsigned Imm, /// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd /// and punpckh*. VT indicates the type of the vector allowing it to handle /// different datatypes and vector widths. -void DecodeUNPCKHMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask) { +void DecodeUNPCKHMask(EVT VT, SmallVectorImpl<int> &ShuffleMask) { unsigned NumElts = VT.getVectorNumElements(); // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate @@ -156,7 +150,7 @@ void DecodeUNPCKHMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask) { /// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd /// and punpckl*. VT indicates the type of the vector allowing it to handle /// different datatypes and vector widths. -void DecodeUNPCKLMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask) { +void DecodeUNPCKLMask(EVT VT, SmallVectorImpl<int> &ShuffleMask) { unsigned NumElts = VT.getVectorNumElements(); // Handle 128 and 256-bit vector lengths. 
AVX defines UNPCK* to operate @@ -174,7 +168,7 @@ void DecodeUNPCKLMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask) { } void DecodeVPERM2X128Mask(EVT VT, unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask) { + SmallVectorImpl<int> &ShuffleMask) { unsigned HalfSize = VT.getVectorNumElements()/2; unsigned FstHalfBegin = (Imm & 0x3) * HalfSize; unsigned SndHalfBegin = ((Imm >> 4) & 0x3) * HalfSize; diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h index 877c9bd..5b8c6ef 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -24,47 +24,41 @@ namespace llvm { enum { - SM_SentinelZero = ~0U + SM_SentinelZero = -1 }; -void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<unsigned> &ShuffleMask); +void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); // <3,1> or <6,7,2,3> -void DecodeMOVHLPSMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask); +void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask); // <0,2> or <0,1,4,5> -void DecodeMOVLHPSMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask); +void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask); -void DecodePSHUFMask(EVT VT, unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask); +void DecodePSHUFMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); -void DecodePSHUFHWMask(unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask); +void DecodePSHUFHWMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); -void DecodePSHUFLWMask(unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask); +void DecodePSHUFLWMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); /// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates /// the type of the vector allowing it to handle different datatypes and vector /// widths. -void DecodeSHUFPMask(EVT VT, unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask); +void DecodeSHUFPMask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); /// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd /// and punpckh*. VT indicates the type of the vector allowing it to handle /// different datatypes and vector widths. -void DecodeUNPCKHMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask); +void DecodeUNPCKHMask(EVT VT, SmallVectorImpl<int> &ShuffleMask); /// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd /// and punpckl*. VT indicates the type of the vector allowing it to handle /// different datatypes and vector widths. 
-void DecodeUNPCKLMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask); +void DecodeUNPCKLMask(EVT VT, SmallVectorImpl<int> &ShuffleMask); void DecodeVPERM2X128Mask(EVT VT, unsigned Imm, - SmallVectorImpl<unsigned> &ShuffleMask); + SmallVectorImpl<int> &ShuffleMask); } // llvm namespace diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 81e9422..ecc7b59 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -24,8 +24,6 @@ namespace llvm { class FunctionPass; class JITCodeEmitter; -class MachineCodeEmitter; -class Target; class X86TargetMachine; /// createX86ISelDag - This pass converts a legalized DAG into a diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index 268cbf4..f1cedf3 100644 --- a/lib/Target/X86/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -13,13 +13,13 @@ //===----------------------------------------------------------------------===// #include "X86AsmPrinter.h" -#include "InstPrinter/X86ATTInstPrinter.h" -#include "InstPrinter/X86IntelInstPrinter.h" #include "X86MCInstLower.h" #include "X86.h" #include "X86COFFMachineModuleInfo.h" #include "X86MachineFunctionInfo.h" #include "X86TargetMachine.h" +#include "InstPrinter/X86ATTInstPrinter.h" +#include "InstPrinter/X86IntelInstPrinter.h" #include "llvm/CallingConv.h" #include "llvm/DerivedTypes.h" #include "llvm/Module.h" diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h index 1058df5..a6ed9ba 100644 --- a/lib/Target/X86/X86AsmPrinter.h +++ b/lib/Target/X86/X86AsmPrinter.h @@ -24,11 +24,7 @@ namespace llvm { -class MachineJumpTableInfo; -class MCContext; -class MCInst; class MCStreamer; -class MCSymbol; class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { const X86Subtarget *Subtarget; diff --git a/lib/Target/X86/X86COFFMachineModuleInfo.h b/lib/Target/X86/X86COFFMachineModuleInfo.h index 63c08f1..0cec95a 100644 --- a/lib/Target/X86/X86COFFMachineModuleInfo.h +++ b/lib/Target/X86/X86COFFMachineModuleInfo.h @@ -14,9 +14,9 @@ #ifndef X86COFF_MACHINEMODULEINFO_H #define X86COFF_MACHINEMODULEINFO_H +#include "X86MachineFunctionInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/ADT/DenseSet.h" -#include "X86MachineFunctionInfo.h" namespace llvm { class X86MachineFunctionInfo; diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index f90764e..3d63b7e 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -1779,7 +1779,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64()) { // Count the number of XMM registers allocated. 
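Aside: the X86ShuffleDecode and X86InstComments hunks above switch the decoded masks from SmallVectorImpl<unsigned> to SmallVectorImpl<int>, with SM_SentinelZero now -1, so sentinel entries are simply negative values, in line with the -1-for-undef convention shuffle masks use elsewhere. A small sketch of the resulting idiom, using std::vector and an invented counting helper rather than the real decode routines:

#include <cstddef>
#include <vector>

enum { SketchSentinelZero = -1 };   // mirrors the new -1 sentinel

// Count mask entries that name a real source element, skipping sentinels.
static unsigned countRealElements(const std::vector<int> &Mask) {
  unsigned N = 0;
  for (std::size_t i = 0, e = Mask.size(); i != e; ++i)
    if (Mask[i] >= 0)        // sentinel/undef entries are negative
      ++N;
  return N;
}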
- static const unsigned XMMArgRegs[] = { + static const uint16_t XMMArgRegs[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index 32de194..936df27 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -26,6 +26,7 @@ #define DEBUG_TYPE "x86-codegen" #include "X86.h" #include "X86InstrInfo.h" +#include "llvm/InlineAsm.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" @@ -37,7 +38,6 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/InlineAsm.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -570,8 +570,8 @@ void FPS::finishBlockStack() { namespace { struct TableEntry { - unsigned from; - unsigned to; + uint16_t from; + uint16_t to; bool operator<(const TableEntry &TE) const { return from < TE.from; } friend bool operator<(const TableEntry &TE, unsigned V) { return TE.from < V; diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index aa508b8..9405c2f 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -21,7 +21,6 @@ #include "X86TargetMachine.h" #include "llvm/Instructions.h" #include "llvm/Intrinsics.h" -#include "llvm/Support/CFG.h" #include "llvm/Type.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/MachineConstantPool.h" @@ -32,6 +31,7 @@ #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" @@ -1654,7 +1654,7 @@ enum AtomicSz { AtomicSzEnd }; -static const unsigned int AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = { +static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = { { X86::LOCK_OR8mi, X86::LOCK_OR8mr, diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index cae9aad..88f3829 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1927,17 +1927,17 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; // FIXME: We should really autogenerate these arrays - static const unsigned GPR64ArgRegsWin64[] = { + static const uint16_t GPR64ArgRegsWin64[] = { X86::RCX, X86::RDX, X86::R8, X86::R9 }; - static const unsigned GPR64ArgRegs64Bit[] = { + static const uint16_t GPR64ArgRegs64Bit[] = { X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 }; - static const unsigned XMMArgRegs64Bit[] = { + static const uint16_t XMMArgRegs64Bit[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; - const unsigned *GPR64ArgRegs; + const uint16_t *GPR64ArgRegs; unsigned NumXMMRegs = 0; if (IsWin64) { @@ -2326,7 +2326,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, // registers used and is in the range 0 - 8 inclusive. // Count the number of XMM registers allocated. 
- static const unsigned XMMArgRegs[] = { + static const uint16_t XMMArgRegs[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; @@ -2910,7 +2910,7 @@ static bool isTargetShuffle(unsigned Opcode) { } static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, - SDValue V1, SelectionDAG &DAG) { + SDValue V1, SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); case X86ISD::MOVSHDUP: @@ -2921,7 +2921,8 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, } static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, - SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { + SDValue V1, unsigned TargetMask, + SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); case X86ISD::PSHUFD: @@ -2933,7 +2934,8 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, } static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, - SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) { + SDValue V1, SDValue V2, unsigned TargetMask, + SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); case X86ISD::PALIGN: @@ -3712,6 +3714,8 @@ static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) { static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT, bool V2IsSplat = false, bool V2IsUndef = false) { unsigned NumOps = VT.getVectorNumElements(); + if (VT.getSizeInBits() == 256) + return false; if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) return false; @@ -4342,9 +4346,81 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); } +/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the +/// target specific opcode. Returns true if the Mask could be calculated. +/// Sets IsUnary to true if only uses one source. +static bool getTargetShuffleMask(SDNode *N, EVT VT, + SmallVectorImpl<int> &Mask, bool &IsUnary) { + unsigned NumElems = VT.getVectorNumElements(); + SDValue ImmN; + + IsUnary = false; + switch(N->getOpcode()) { + case X86ISD::SHUFP: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + break; + case X86ISD::UNPCKH: + DecodeUNPCKHMask(VT, Mask); + break; + case X86ISD::UNPCKL: + DecodeUNPCKLMask(VT, Mask); + break; + case X86ISD::MOVHLPS: + DecodeMOVHLPSMask(NumElems, Mask); + break; + case X86ISD::MOVLHPS: + DecodeMOVLHPSMask(NumElems, Mask); + break; + case X86ISD::PSHUFD: + case X86ISD::VPERMILP: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = true; + break; + case X86ISD::PSHUFHW: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = true; + break; + case X86ISD::PSHUFLW: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = true; + break; + case X86ISD::MOVSS: + case X86ISD::MOVSD: { + // The index 0 always comes from the first element of the second source, + // this is why MOVSS and MOVSD are used in the first place. 
The other + // elements come from the other positions of the first source vector + Mask.push_back(NumElems); + for (unsigned i = 1; i != NumElems; ++i) { + Mask.push_back(i); + } + break; + } + case X86ISD::VPERM2X128: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + break; + case X86ISD::MOVDDUP: + case X86ISD::MOVLHPD: + case X86ISD::MOVLPD: + case X86ISD::MOVLPS: + case X86ISD::MOVSHDUP: + case X86ISD::MOVSLDUP: + case X86ISD::PALIGN: + // Not yet implemented + return false; + default: llvm_unreachable("unknown target shuffle node"); + } + + return true; +} + /// getShuffleScalarElt - Returns the scalar element that will make up the ith /// element of the result of the vector shuffle. -static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, +static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, unsigned Depth) { if (Depth == 6) return SDValue(); // Limit search depth. @@ -4355,89 +4431,34 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) { - Index = SV->getMaskElt(Index); + int Elt = SV->getMaskElt(Index); - if (Index < 0) + if (Elt < 0) return DAG.getUNDEF(VT.getVectorElementType()); unsigned NumElems = VT.getVectorNumElements(); - SDValue NewV = (Index < (int)NumElems) ? SV->getOperand(0) - : SV->getOperand(1); - return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, Depth+1); + SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0) + : SV->getOperand(1); + return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } // Recurse into target specific vector shuffles to find scalars. if (isTargetShuffle(Opcode)) { unsigned NumElems = VT.getVectorNumElements(); - SmallVector<unsigned, 16> ShuffleMask; + SmallVector<int, 16> ShuffleMask; SDValue ImmN; + bool IsUnary; - switch(Opcode) { - case X86ISD::SHUFP: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::UNPCKH: - DecodeUNPCKHMask(VT, ShuffleMask); - break; - case X86ISD::UNPCKL: - DecodeUNPCKLMask(VT, ShuffleMask); - break; - case X86ISD::MOVHLPS: - DecodeMOVHLPSMask(NumElems, ShuffleMask); - break; - case X86ISD::MOVLHPS: - DecodeMOVLHPSMask(NumElems, ShuffleMask); - break; - case X86ISD::PSHUFD: - case X86ISD::VPERMILP: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::PSHUFHW: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::PSHUFLW: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::MOVSS: - case X86ISD::MOVSD: { - // The index 0 always comes from the first element of the second source, - // this is why MOVSS and MOVSD are used in the first place. The other - // elements come from the other positions of the first source vector. - unsigned OpNum = (Index == 0) ? 
1 : 0; - return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG, - Depth+1); - } - case X86ISD::VPERM2X128: - ImmN = N->getOperand(N->getNumOperands()-1); - DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), - ShuffleMask); - break; - case X86ISD::MOVDDUP: - case X86ISD::MOVLHPD: - case X86ISD::MOVLPD: - case X86ISD::MOVLPS: - case X86ISD::MOVSHDUP: - case X86ISD::MOVSLDUP: - case X86ISD::PALIGN: - return SDValue(); // Not yet implemented. - default: llvm_unreachable("unknown target shuffle node"); - } - - Index = ShuffleMask[Index]; - if (Index < 0) + if (!getTargetShuffleMask(N, VT, ShuffleMask, IsUnary)) + return SDValue(); + + int Elt = ShuffleMask[Index]; + if (Elt < 0) return DAG.getUNDEF(VT.getVectorElementType()); - SDValue NewV = (Index < (int)NumElems) ? N->getOperand(0) + SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0) : N->getOperand(1); - return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, + return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } @@ -4453,7 +4474,7 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) return (Index == 0) ? V.getOperand(0) - : DAG.getUNDEF(VT.getVectorElementType()); + : DAG.getUNDEF(VT.getVectorElementType()); if (V.getOpcode() == ISD::BUILD_VECTOR) return V.getOperand(Index); @@ -4465,38 +4486,37 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, /// shuffle operation which come from a consecutively from a zero. The /// search can start in two different directions, from left or right. static -unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems, +unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, unsigned NumElems, bool ZerosFromLeft, SelectionDAG &DAG) { - int i = 0; - - while (i < NumElems) { + unsigned i; + for (i = 0; i != NumElems; ++i) { unsigned Index = ZerosFromLeft ? i : NumElems-i-1; - SDValue Elt = getShuffleScalarElt(N, Index, DAG, 0); + SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0); if (!(Elt.getNode() && (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt)))) break; - ++i; } return i; } -/// isShuffleMaskConsecutive - Check if the shuffle mask indicies from MaskI to -/// MaskE correspond consecutively to elements from one of the vector operands, +/// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE) +/// correspond consecutively to elements from one of the vector operands, /// starting from its index OpIdx. Also tell OpNum which source vector operand. static -bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, int MaskI, int MaskE, - int OpIdx, int NumElems, unsigned &OpNum) { +bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, + unsigned MaskI, unsigned MaskE, unsigned OpIdx, + unsigned NumElems, unsigned &OpNum) { bool SeenV1 = false; bool SeenV2 = false; - for (int i = MaskI; i <= MaskE; ++i, ++OpIdx) { + for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) { int Idx = SVOp->getMaskElt(i); // Ignore undef indicies if (Idx < 0) continue; - if (Idx < NumElems) + if (Idx < (int)NumElems) SeenV1 = true; else SeenV2 = true; @@ -4531,7 +4551,7 @@ static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, // if (!isShuffleMaskConsecutive(SVOp, 0, // Mask Start Index - NumElems-NumZeros-1, // Mask End Index + NumElems-NumZeros, // Mask End Index(exclusive) NumZeros, // Where to start looking in the src vector NumElems, // Number of elements in vector OpSrc)) // Which source operand ? 
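A minimal standalone sketch of the half-open convention the hunks above switch to: mask indices in [MaskI, MaskE) must map consecutively onto a single source operand starting at OpIdx, with negative entries treated as undef. The helper name maskIsConsecutive and the test values below are illustrative only, not the LLVM implementation.

#include <cassert>
#include <vector>

static bool maskIsConsecutive(const std::vector<int> &Mask,
                              unsigned MaskI, unsigned MaskE,
                              unsigned OpIdx, unsigned NumElems) {
  bool SeenV1 = false, SeenV2 = false;
  for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
    int Idx = Mask[i];
    if (Idx < 0)
      continue;                               // undef lanes match anything
    if (Idx < (int)NumElems)
      SeenV1 = true;
    else
      SeenV2 = true;
    if ((unsigned)Idx % NumElems != OpIdx)    // must stay consecutive
      return false;
  }
  return !(SeenV1 && SeenV2);                 // all defined lanes from one operand
}

int main() {
  // <4 x i32> shuffle mask <2, 3, -1, -1>: lanes [0, 2) come from V1 at 2 and 3.
  std::vector<int> Mask;
  Mask.push_back(2); Mask.push_back(3); Mask.push_back(-1); Mask.push_back(-1);
  assert(maskIsConsecutive(Mask, 0, 2, 2, 4));
  assert(!maskIsConsecutive(Mask, 0, 2, 1, 4));
  return 0;
}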
@@ -4564,7 +4584,7 @@ static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, // if (!isShuffleMaskConsecutive(SVOp, NumZeros, // Mask Start Index - NumElems-1, // Mask End Index + NumElems, // Mask End Index(exclusive) 0, // Where to start looking in the src vector NumElems, // Number of elements in vector OpSrc)) // Which source operand ? @@ -6080,88 +6100,6 @@ static bool RelaxedMayFoldVectorLoad(SDValue V) { return false; } -/// CanFoldShuffleIntoVExtract - Check if the current shuffle is used by -/// a vector extract, and if both can be later optimized into a single load. -/// This is done in visitEXTRACT_VECTOR_ELT and the conditions are checked -/// here because otherwise a target specific shuffle node is going to be -/// emitted for this shuffle, and the optimization not done. -/// FIXME: This is probably not the best approach, but fix the problem -/// until the right path is decided. -static -bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG, - const TargetLowering &TLI) { - EVT VT = V.getValueType(); - ShuffleVectorSDNode *SVOp = dyn_cast<ShuffleVectorSDNode>(V); - - // Be sure that the vector shuffle is present in a pattern like this: - // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), c) -> (f32 load $addr) - if (!V.hasOneUse()) - return false; - - SDNode *N = *V.getNode()->use_begin(); - if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT) - return false; - - SDValue EltNo = N->getOperand(1); - if (!isa<ConstantSDNode>(EltNo)) - return false; - - // If the bit convert changed the number of elements, it is unsafe - // to examine the mask. - bool HasShuffleIntoBitcast = false; - if (V.getOpcode() == ISD::BITCAST) { - EVT SrcVT = V.getOperand(0).getValueType(); - if (SrcVT.getVectorNumElements() != VT.getVectorNumElements()) - return false; - V = V.getOperand(0); - HasShuffleIntoBitcast = true; - } - - // Select the input vector, guarding against out of range extract vector. - unsigned NumElems = VT.getVectorNumElements(); - unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); - int Idx = (Elt > NumElems) ? -1 : SVOp->getMaskElt(Elt); - V = (Idx < (int)NumElems) ? V.getOperand(0) : V.getOperand(1); - - // If we are accessing the upper part of a YMM register - // then the EXTRACT_VECTOR_ELT is likely to be legalized to a sequence of - // EXTRACT_SUBVECTOR + EXTRACT_VECTOR_ELT, which are not detected at this point - // because the legalization of N did not happen yet. - if (Idx >= (int)NumElems/2 && VT.getSizeInBits() == 256) - return false; - - // Skip one more bit_convert if necessary - if (V.getOpcode() == ISD::BITCAST) { - if (!V.hasOneUse()) - return false; - V = V.getOperand(0); - } - - if (!ISD::isNormalLoad(V.getNode())) - return false; - - // Is the original load suitable? - LoadSDNode *LN0 = cast<LoadSDNode>(V); - - if (!LN0 || !LN0->hasNUsesOfValue(1,0) || LN0->isVolatile()) - return false; - - if (!HasShuffleIntoBitcast) - return true; - - // If there's a bitcast before the shuffle, check if the load type and - // alignment is valid. 
- unsigned Align = LN0->getAlignment(); - unsigned NewAlign = - TLI.getTargetData()->getABITypeAlignment( - VT.getTypeForEVT(*DAG.getContext())); - - if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT)) - return false; - - return true; -} - static SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) { EVT VT = Op.getValueType(); @@ -6282,12 +6220,6 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, if (SVOp->isSplat()) { unsigned NumElem = VT.getVectorNumElements(); int Size = VT.getSizeInBits(); - // Special case, this is the only place now where it's allowed to return - // a vector_shuffle operation without using a target specific node, because - // *hopefully* it will be optimized away by the dag combiner. FIXME: should - // this be moved to DAGCombine instead? - if (NumElem <= 4 && CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI)) - return Op; // Use vbroadcast whenever the splat comes from a foldable load SDValue LD = isVectorBroadcast(Op, Subtarget); @@ -13005,11 +12937,109 @@ SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target +/// specific shuffle of a load can be folded into a single element load. +/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but +/// shuffles have been customed lowered so we need to handle those here. +static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDValue InVec = N->getOperand(0); + SDValue EltNo = N->getOperand(1); + + if (!isa<ConstantSDNode>(EltNo)) + return SDValue(); + + EVT VT = InVec.getValueType(); + + bool HasShuffleIntoBitcast = false; + if (InVec.getOpcode() == ISD::BITCAST) { + // Don't duplicate a load with other uses. + if (!InVec.hasOneUse()) + return SDValue(); + EVT BCVT = InVec.getOperand(0).getValueType(); + if (BCVT.getVectorNumElements() != VT.getVectorNumElements()) + return SDValue(); + InVec = InVec.getOperand(0); + HasShuffleIntoBitcast = true; + } + + if (!isTargetShuffle(InVec.getOpcode())) + return SDValue(); + + // Don't duplicate a load with other uses. + if (!InVec.hasOneUse()) + return SDValue(); + + SmallVector<int, 16> ShuffleMask; + bool UnaryShuffle; + if (!getTargetShuffleMask(InVec.getNode(), VT, ShuffleMask, UnaryShuffle)) + return SDValue(); + + // Select the input vector, guarding against out of range extract vector. + unsigned NumElems = VT.getVectorNumElements(); + int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); + int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt]; + SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0) + : InVec.getOperand(1); + + // If inputs to shuffle are the same for both ops, then allow 2 uses + unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1; + + if (LdNode.getOpcode() == ISD::BITCAST) { + // Don't duplicate a load with other uses. 
+ if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0)) + return SDValue(); + + AllowedUses = 1; // only allow 1 load use if we have a bitcast + LdNode = LdNode.getOperand(0); + } + + if (!ISD::isNormalLoad(LdNode.getNode())) + return SDValue(); + + LoadSDNode *LN0 = cast<LoadSDNode>(LdNode); + + if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile()) + return SDValue(); + + if (HasShuffleIntoBitcast) { + // If there's a bitcast before the shuffle, check if the load type and + // alignment is valid. + unsigned Align = LN0->getAlignment(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned NewAlign = TLI.getTargetData()-> + getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext())); + + if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT)) + return SDValue(); + } + + // All checks match so transform back to vector_shuffle so that DAG combiner + // can finish the job + DebugLoc dl = N->getDebugLoc(); + + // Create shuffle node taking into account the case that its a unary shuffle + SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1); + Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl, + InVec.getOperand(0), Shuffle, + &ShuffleMask[0]); + Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle, + EltNo); +} + /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index /// generation and convert it from being a bunch of shuffles and extracts /// to a simple store and scalar loads to extract the elements. static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, - const TargetLowering &TLI) { + TargetLowering::DAGCombinerInfo &DCI) { + SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI); + if (NewOp.getNode()) + return NewOp; + SDValue InputVector = N->getOperand(0); // Only operate on vectors of 4 elements, where the alternative shuffling @@ -13070,6 +13100,7 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, unsigned EltSize = InputVector.getValueType().getVectorElementType().getSizeInBits()/8; uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy()); SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), @@ -13093,6 +13124,8 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { + + DebugLoc DL = N->getDebugLoc(); SDValue Cond = N->getOperand(0); // Get the LHS/RHS of the select. 
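The XFormVExtractWithShuffleIntoLoad combine introduced above, and wired into PerformDAGCombine in the next hunk, rewrites an extract of a target-shuffled load into a single scalar load. A source-level illustration of the equivalence it relies on; the functions are toy names for this sketch, not LLVM APIs:

#include <cassert>

// Extract lane 0 of a shuffle whose lane 0 comes from element 1 of a loaded
// vector; after the combine this amounts to a plain scalar load of p[1].
static float extract_via_shuffle(const float *p) {
  float v[4] = {p[0], p[1], p[2], p[3]};  // vector load
  float s[4] = {v[1], v[1], v[3], v[2]};  // target shuffle, mask <1,1,3,2>
  return s[0];                            // extract_vector_elt, index 0
}

static float extract_direct(const float *p) {
  return p[1];                            // the folded form: one scalar load
}

int main() {
  float data[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  assert(extract_via_shuffle(data) == extract_direct(data));
  return 0;
}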
@@ -14897,7 +14930,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, switch (N->getOpcode()) { default: break; case ISD::EXTRACT_VECTOR_ELT: - return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this); + return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI); case ISD::VSELECT: case ISD::SELECT: return PerformSELECTCombine(N, DAG, DCI, Subtarget); case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index ac49232..42a5014 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -153,7 +153,7 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1, isCodeGenOnly = 1 in { def EH_RETURN : I<0xC3, RawFrm, (outs), (ins GR32:$addr), "ret\t#eh_return, addr: $addr", - [(X86ehret GR32:$addr)]>; + [(X86ehret GR32:$addr)], IIC_RET>; } @@ -161,7 +161,7 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1, isCodeGenOnly = 1 in { def EH_RETURN64 : I<0xC3, RawFrm, (outs), (ins GR64:$addr), "ret\t#eh_return, addr: $addr", - [(X86ehret GR64:$addr)]>; + [(X86ehret GR64:$addr)], IIC_RET>; } @@ -193,7 +193,7 @@ def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins), let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1, isCodeGenOnly = 1 in { def MOV8r0 : I<0x30, MRMInitReg, (outs GR8 :$dst), (ins), "", - [(set GR8:$dst, 0)]>; + [(set GR8:$dst, 0)], IIC_ALU_NONMEM>; // We want to rewrite MOV16r0 in terms of MOV32r0, because it's a smaller // encoding and avoids a partial-register update sometimes, but doing so @@ -202,11 +202,11 @@ def MOV8r0 : I<0x30, MRMInitReg, (outs GR8 :$dst), (ins), "", // to an MCInst. def MOV16r0 : I<0x31, MRMInitReg, (outs GR16:$dst), (ins), "", - [(set GR16:$dst, 0)]>, OpSize; + [(set GR16:$dst, 0)], IIC_ALU_NONMEM>, OpSize; // FIXME: Set encoding to pseudo. def MOV32r0 : I<0x31, MRMInitReg, (outs GR32:$dst), (ins), "", - [(set GR32:$dst, 0)]>; + [(set GR32:$dst, 0)], IIC_ALU_NONMEM>; } // We want to rewrite MOV64r0 in terms of MOV32r0, because it's sometimes a @@ -218,7 +218,7 @@ def MOV32r0 : I<0x31, MRMInitReg, (outs GR32:$dst), (ins), "", let Defs = [EFLAGS], isCodeGenOnly=1, AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in def MOV64r0 : I<0x31, MRMInitReg, (outs GR64:$dst), (ins), "", - [(set GR64:$dst, 0)]>; + [(set GR64:$dst, 0)], IIC_ALU_NONMEM>; // Materialize i64 constant where top 32-bits are zero. This could theoretically // use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however @@ -226,7 +226,8 @@ def MOV64r0 : I<0x31, MRMInitReg, (outs GR64:$dst), (ins), "", let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1, isCodeGenOnly = 1 in def MOV64ri64i32 : Ii32<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64i32imm:$src), - "", [(set GR64:$dst, i64immZExt32:$src)]>; + "", [(set GR64:$dst, i64immZExt32:$src)], + IIC_ALU_NONMEM>; // Use sbb to materialize carry bit. let Uses = [EFLAGS], Defs = [EFLAGS], isCodeGenOnly = 1 in { @@ -236,14 +237,18 @@ let Uses = [EFLAGS], Defs = [EFLAGS], isCodeGenOnly = 1 in { // FIXME: Change these to have encoding Pseudo when X86MCCodeEmitter replaces // X86CodeEmitter. 
def SETB_C8r : I<0x18, MRMInitReg, (outs GR8:$dst), (ins), "", - [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; + [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))], + IIC_ALU_NONMEM>; def SETB_C16r : I<0x19, MRMInitReg, (outs GR16:$dst), (ins), "", - [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>, + [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))], + IIC_ALU_NONMEM>, OpSize; def SETB_C32r : I<0x19, MRMInitReg, (outs GR32:$dst), (ins), "", - [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; + [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))], + IIC_ALU_NONMEM>; def SETB_C64r : RI<0x19, MRMInitReg, (outs GR64:$dst), (ins), "", - [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; + [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))], + IIC_ALU_NONMEM>; } // isCodeGenOnly @@ -297,32 +302,32 @@ def : Pat<(sub GR64:$op, (i64 (X86setcc_c X86_COND_B, EFLAGS))), // let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI], isCodeGenOnly = 1 in { def REP_MOVSB : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}", - [(X86rep_movs i8)]>, REP; + [(X86rep_movs i8)], IIC_REP_MOVS>, REP; def REP_MOVSW : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}", - [(X86rep_movs i16)]>, REP, OpSize; + [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize; def REP_MOVSD : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}", - [(X86rep_movs i32)]>, REP; + [(X86rep_movs i32)], IIC_REP_MOVS>, REP; } let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI], isCodeGenOnly = 1 in def REP_MOVSQ : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq|rep movsq}", - [(X86rep_movs i64)]>, REP; + [(X86rep_movs i64)], IIC_REP_MOVS>, REP; // FIXME: Should use "(X86rep_stos AL)" as the pattern. let Defs = [ECX,EDI], Uses = [AL,ECX,EDI], isCodeGenOnly = 1 in def REP_STOSB : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}", - [(X86rep_stos i8)]>, REP; + [(X86rep_stos i8)], IIC_REP_STOS>, REP; let Defs = [ECX,EDI], Uses = [AX,ECX,EDI], isCodeGenOnly = 1 in def REP_STOSW : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}", - [(X86rep_stos i16)]>, REP, OpSize; + [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize; let Defs = [ECX,EDI], Uses = [EAX,ECX,EDI], isCodeGenOnly = 1 in def REP_STOSD : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}", - [(X86rep_stos i32)]>, REP; + [(X86rep_stos i32)], IIC_REP_STOS>, REP; let Defs = [RCX,RDI], Uses = [RAX,RCX,RDI], isCodeGenOnly = 1 in def REP_STOSQ : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}", - [(X86rep_stos i64)]>, REP; + [(X86rep_stos i64)], IIC_REP_STOS>, REP; //===----------------------------------------------------------------------===// @@ -571,7 +576,7 @@ let isCodeGenOnly = 1, Defs = [EFLAGS] in def OR32mrLocked : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero), "lock\n\t" "or{l}\t{$zero, $dst|$dst, $zero}", - []>, Requires<[In32BitMode]>, LOCK; + [], IIC_ALU_MEM>, Requires<[In32BitMode]>, LOCK; let hasSideEffects = 1 in def Int_MemBarrier : I<0, Pseudo, (outs), (ins), @@ -591,72 +596,72 @@ def #NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2), !strconcat("lock\n\t", mnemonic, "{b}\t", "{$src2, $dst|$dst, $src2}"), - []>, LOCK; + [], IIC_ALU_NONMEM>, LOCK; def #NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), !strconcat("lock\n\t", mnemonic, "{w}\t", "{$src2, $dst|$dst, $src2}"), - []>, OpSize, LOCK; + [], IIC_ALU_NONMEM>, OpSize, LOCK; def #NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, 
RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), !strconcat("lock\n\t", mnemonic, "{l}\t", "{$src2, $dst|$dst, $src2}"), - []>, LOCK; + [], IIC_ALU_NONMEM>, LOCK; def #NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), !strconcat("lock\n\t", mnemonic, "{q}\t", "{$src2, $dst|$dst, $src2}"), - []>, LOCK; + [], IIC_ALU_NONMEM>, LOCK; def #NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 }, ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2), !strconcat("lock\n\t", mnemonic, "{b}\t", "{$src2, $dst|$dst, $src2}"), - []>, LOCK; + [], IIC_ALU_MEM>, LOCK; def #NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2), !strconcat("lock\n\t", mnemonic, "{w}\t", "{$src2, $dst|$dst, $src2}"), - []>, LOCK; + [], IIC_ALU_MEM>, LOCK; def #NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2), !strconcat("lock\n\t", mnemonic, "{l}\t", "{$src2, $dst|$dst, $src2}"), - []>, LOCK; + [], IIC_ALU_MEM>, LOCK; def #NAME#64mi32 : RIi32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2), !strconcat("lock\n\t", mnemonic, "{q}\t", "{$src2, $dst|$dst, $src2}"), - []>, LOCK; + [], IIC_ALU_MEM>, LOCK; def #NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2), !strconcat("lock\n\t", mnemonic, "{w}\t", "{$src2, $dst|$dst, $src2}"), - []>, LOCK; + [], IIC_ALU_MEM>, LOCK; def #NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2), !strconcat("lock\n\t", mnemonic, "{l}\t", "{$src2, $dst|$dst, $src2}"), - []>, LOCK; + [], IIC_ALU_MEM>, LOCK; def #NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2), !strconcat("lock\n\t", mnemonic, "{q}\t", "{$src2, $dst|$dst, $src2}"), - []>, LOCK; + [], IIC_ALU_MEM>, LOCK; } @@ -673,29 +678,29 @@ let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1 in { def LOCK_INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), "lock\n\t" - "inc{b}\t$dst", []>, LOCK; + "inc{b}\t$dst", [], IIC_UNARY_MEM>, LOCK; def LOCK_INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "lock\n\t" - "inc{w}\t$dst", []>, OpSize, LOCK; + "inc{w}\t$dst", [], IIC_UNARY_MEM>, OpSize, LOCK; def LOCK_INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "lock\n\t" - "inc{l}\t$dst", []>, LOCK; + "inc{l}\t$dst", [], IIC_UNARY_MEM>, LOCK; def LOCK_INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "lock\n\t" - "inc{q}\t$dst", []>, LOCK; + "inc{q}\t$dst", [], IIC_UNARY_MEM>, LOCK; def LOCK_DEC8m : I<0xFE, MRM1m, (outs), (ins i8mem :$dst), "lock\n\t" - "dec{b}\t$dst", []>, LOCK; + "dec{b}\t$dst", [], IIC_UNARY_MEM>, LOCK; def LOCK_DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "lock\n\t" - "dec{w}\t$dst", []>, OpSize, LOCK; + "dec{w}\t$dst", [], IIC_UNARY_MEM>, OpSize, LOCK; def LOCK_DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "lock\n\t" - "dec{l}\t$dst", []>, LOCK; + "dec{l}\t$dst", [], IIC_UNARY_MEM>, LOCK; def LOCK_DEC64m : RI<0xFF, MRM1m, (outs), (ins 
i64mem:$dst), "lock\n\t" - "dec{q}\t$dst", []>, LOCK; + "dec{q}\t$dst", [], IIC_UNARY_MEM>, LOCK; } // Atomic compare and swap. @@ -704,42 +709,42 @@ let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX], def LCMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$ptr), "lock\n\t" "cmpxchg8b\t$ptr", - [(X86cas8 addr:$ptr)]>, TB, LOCK; + [(X86cas8 addr:$ptr)], IIC_CMPX_LOCK_8B>, TB, LOCK; let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX], isCodeGenOnly = 1 in def LCMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$ptr), "lock\n\t" "cmpxchg16b\t$ptr", - [(X86cas16 addr:$ptr)]>, TB, LOCK, + [(X86cas16 addr:$ptr)], IIC_CMPX_LOCK_16B>, TB, LOCK, Requires<[HasCmpxchg16b]>; let Defs = [AL, EFLAGS], Uses = [AL], isCodeGenOnly = 1 in { def LCMPXCHG8 : I<0xB0, MRMDestMem, (outs), (ins i8mem:$ptr, GR8:$swap), "lock\n\t" "cmpxchg{b}\t{$swap, $ptr|$ptr, $swap}", - [(X86cas addr:$ptr, GR8:$swap, 1)]>, TB, LOCK; + [(X86cas addr:$ptr, GR8:$swap, 1)], IIC_CMPX_LOCK_8>, TB, LOCK; } let Defs = [AX, EFLAGS], Uses = [AX], isCodeGenOnly = 1 in { def LCMPXCHG16 : I<0xB1, MRMDestMem, (outs), (ins i16mem:$ptr, GR16:$swap), "lock\n\t" "cmpxchg{w}\t{$swap, $ptr|$ptr, $swap}", - [(X86cas addr:$ptr, GR16:$swap, 2)]>, TB, OpSize, LOCK; + [(X86cas addr:$ptr, GR16:$swap, 2)], IIC_CMPX_LOCK>, TB, OpSize, LOCK; } let Defs = [EAX, EFLAGS], Uses = [EAX], isCodeGenOnly = 1 in { def LCMPXCHG32 : I<0xB1, MRMDestMem, (outs), (ins i32mem:$ptr, GR32:$swap), "lock\n\t" "cmpxchg{l}\t{$swap, $ptr|$ptr, $swap}", - [(X86cas addr:$ptr, GR32:$swap, 4)]>, TB, LOCK; + [(X86cas addr:$ptr, GR32:$swap, 4)], IIC_CMPX_LOCK>, TB, LOCK; } let Defs = [RAX, EFLAGS], Uses = [RAX], isCodeGenOnly = 1 in { def LCMPXCHG64 : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$ptr, GR64:$swap), "lock\n\t" "cmpxchg{q}\t{$swap, $ptr|$ptr, $swap}", - [(X86cas addr:$ptr, GR64:$swap, 8)]>, TB, LOCK; + [(X86cas addr:$ptr, GR64:$swap, 8)], IIC_CMPX_LOCK>, TB, LOCK; } // Atomic exchange and add @@ -747,22 +752,26 @@ let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1 in { def LXADD8 : I<0xC0, MRMSrcMem, (outs GR8:$dst), (ins GR8:$val, i8mem:$ptr), "lock\n\t" "xadd{b}\t{$val, $ptr|$ptr, $val}", - [(set GR8:$dst, (atomic_load_add_8 addr:$ptr, GR8:$val))]>, + [(set GR8:$dst, (atomic_load_add_8 addr:$ptr, GR8:$val))], + IIC_XADD_LOCK_MEM8>, TB, LOCK; def LXADD16 : I<0xC1, MRMSrcMem, (outs GR16:$dst), (ins GR16:$val, i16mem:$ptr), "lock\n\t" "xadd{w}\t{$val, $ptr|$ptr, $val}", - [(set GR16:$dst, (atomic_load_add_16 addr:$ptr, GR16:$val))]>, + [(set GR16:$dst, (atomic_load_add_16 addr:$ptr, GR16:$val))], + IIC_XADD_LOCK_MEM>, TB, OpSize, LOCK; def LXADD32 : I<0xC1, MRMSrcMem, (outs GR32:$dst), (ins GR32:$val, i32mem:$ptr), "lock\n\t" "xadd{l}\t{$val, $ptr|$ptr, $val}", - [(set GR32:$dst, (atomic_load_add_32 addr:$ptr, GR32:$val))]>, + [(set GR32:$dst, (atomic_load_add_32 addr:$ptr, GR32:$val))], + IIC_XADD_LOCK_MEM>, TB, LOCK; def LXADD64 : RI<0xC1, MRMSrcMem, (outs GR64:$dst), (ins GR64:$val,i64mem:$ptr), "lock\n\t" "xadd{q}\t{$val, $ptr|$ptr, $val}", - [(set GR64:$dst, (atomic_load_add_64 addr:$ptr, GR64:$val))]>, + [(set GR64:$dst, (atomic_load_add_64 addr:$ptr, GR64:$val))], + IIC_XADD_LOCK_MEM>, TB, LOCK; } diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 4f9f089..ae3ed1b 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -218,6 +218,11 @@ def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{ return cast<LoadSDNode>(N)->getAlignment() >= 
16; }]>; +// Like 'X86vzload', but always requires 128-bit vector alignment. +def alignedX86vzload : PatFrag<(ops node:$ptr), (X86vzload node:$ptr), [{ + return cast<MemSDNode>(N)->getAlignment() >= 16; +}]>; + // Like 'load', but always requires 256-bit vector alignment. def alignedload256 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ return cast<LoadSDNode>(N)->getAlignment() >= 32; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 5a479f0..307c96b 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -25,13 +25,13 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/LiveVariables.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCInst.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetOptions.h" -#include "llvm/MC/MCAsmInfo.h" #include <limits> #define GET_INSTRINFO_CTOR @@ -82,6 +82,12 @@ enum { TB_FOLDED_STORE = 1 << 19 }; +struct X86OpTblEntry { + uint16_t RegOp; + uint16_t MemOp; + uint32_t Flags; +}; + X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) : X86GenInstrInfo((tm.getSubtarget<X86Subtarget>().is64Bit() ? X86::ADJCALLSTACKDOWN64 @@ -91,7 +97,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) : X86::ADJCALLSTACKUP32)), TM(tm), RI(tm, *this) { - static const unsigned OpTbl2Addr[][3] = { + static const X86OpTblEntry OpTbl2Addr[] = { { X86::ADC32ri, X86::ADC32mi, 0 }, { X86::ADC32ri8, X86::ADC32mi8, 0 }, { X86::ADC32rr, X86::ADC32mr, 0 }, @@ -259,16 +265,16 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) }; for (unsigned i = 0, e = array_lengthof(OpTbl2Addr); i != e; ++i) { - unsigned RegOp = OpTbl2Addr[i][0]; - unsigned MemOp = OpTbl2Addr[i][1]; - unsigned Flags = OpTbl2Addr[i][2]; + unsigned RegOp = OpTbl2Addr[i].RegOp; + unsigned MemOp = OpTbl2Addr[i].MemOp; + unsigned Flags = OpTbl2Addr[i].Flags; AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable, RegOp, MemOp, // Index 0, folded load and store, no alignment requirement. 
Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE); } - static const unsigned OpTbl0[][3] = { + static const X86OpTblEntry OpTbl0[] = { { X86::BT16ri8, X86::BT16mi8, TB_FOLDED_LOAD }, { X86::BT32ri8, X86::BT32mi8, TB_FOLDED_LOAD }, { X86::BT64ri8, X86::BT64mi8, TB_FOLDED_LOAD }, @@ -370,14 +376,14 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) }; for (unsigned i = 0, e = array_lengthof(OpTbl0); i != e; ++i) { - unsigned RegOp = OpTbl0[i][0]; - unsigned MemOp = OpTbl0[i][1]; - unsigned Flags = OpTbl0[i][2]; + unsigned RegOp = OpTbl0[i].RegOp; + unsigned MemOp = OpTbl0[i].MemOp; + unsigned Flags = OpTbl0[i].Flags; AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable, RegOp, MemOp, TB_INDEX_0 | Flags); } - static const unsigned OpTbl1[][3] = { + static const X86OpTblEntry OpTbl1[] = { { X86::CMP16rr, X86::CMP16rm, 0 }, { X86::CMP32rr, X86::CMP32rm, 0 }, { X86::CMP64rr, X86::CMP64rm, 0 }, @@ -555,16 +561,16 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) }; for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) { - unsigned RegOp = OpTbl1[i][0]; - unsigned MemOp = OpTbl1[i][1]; - unsigned Flags = OpTbl1[i][2]; + unsigned RegOp = OpTbl1[i].RegOp; + unsigned MemOp = OpTbl1[i].MemOp; + unsigned Flags = OpTbl1[i].Flags; AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable, RegOp, MemOp, // Index 1, folded load Flags | TB_INDEX_1 | TB_FOLDED_LOAD); } - static const unsigned OpTbl2[][3] = { + static const X86OpTblEntry OpTbl2[] = { { X86::ADC32rr, X86::ADC32rm, 0 }, { X86::ADC64rr, X86::ADC64rm, 0 }, { X86::ADD16rr, X86::ADD16rm, 0 }, @@ -1108,9 +1114,9 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) }; for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) { - unsigned RegOp = OpTbl2[i][0]; - unsigned MemOp = OpTbl2[i][1]; - unsigned Flags = OpTbl2[i][2]; + unsigned RegOp = OpTbl2[i].RegOp; + unsigned MemOp = OpTbl2[i].MemOp; + unsigned Flags = OpTbl2[i].Flags; AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable, RegOp, MemOp, // Index 2, folded load @@ -3627,7 +3633,7 @@ unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const { // These are the replaceable SSE instructions. Some of these have Int variants // that we don't include here. We don't want to replace instructions selected // by intrinsics. -static const unsigned ReplaceableInstrs[][3] = { +static const uint16_t ReplaceableInstrs[][3] = { //PackedSingle PackedDouble PackedInt { X86::MOVAPSmr, X86::MOVAPDmr, X86::MOVDQAmr }, { X86::MOVAPSrm, X86::MOVAPDrm, X86::MOVDQArm }, @@ -3667,7 +3673,7 @@ static const unsigned ReplaceableInstrs[][3] = { { X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr } }; -static const unsigned ReplaceableInstrsAVX2[][3] = { +static const uint16_t ReplaceableInstrsAVX2[][3] = { //PackedSingle PackedDouble PackedInt { X86::VANDNPSYrm, X86::VANDNPDYrm, X86::VPANDNYrm }, { X86::VANDNPSYrr, X86::VANDNPDYrr, X86::VPANDNYrr }, @@ -3688,14 +3694,14 @@ static const unsigned ReplaceableInstrsAVX2[][3] = { // FIXME: Some shuffle and unpack instructions have equivalents in different // domains, but they require a bit more work than just switching opcodes. 
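A small standalone sketch of the layout change behind the X86OpTblEntry and uint16_t table edits above: storing the register and memory opcodes as 16-bit fields shrinks each folding-table entry to eight bytes instead of three full unsigned values, while lookups stay a simple linear scan. OpTblEntry and lookupByRegOp below are illustrative stand-ins, not the real tables.

#include <cstdint>
#include <cstdio>

struct OpTblEntry {
  uint16_t RegOp;   // register-form opcode
  uint16_t MemOp;   // memory (folded) form opcode
  uint32_t Flags;   // folding flags
};

static const OpTblEntry Table[] = {
  { 100, 200, 0x1 },
  { 101, 201, 0x2 },
};

static const OpTblEntry *lookupByRegOp(unsigned RegOp) {
  for (unsigned i = 0, e = sizeof(Table) / sizeof(Table[0]); i != e; ++i)
    if (Table[i].RegOp == RegOp)
      return &Table[i];
  return 0;   // not found
}

int main() {
  std::printf("entry size: %u bytes\n", (unsigned)sizeof(OpTblEntry)); // 8, not 12
  return lookupByRegOp(101) ? 0 : 1;
}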
-static const unsigned *lookup(unsigned opcode, unsigned domain) { +static const uint16_t *lookup(unsigned opcode, unsigned domain) { for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i) if (ReplaceableInstrs[i][domain-1] == opcode) return ReplaceableInstrs[i]; return 0; } -static const unsigned *lookupAVX2(unsigned opcode, unsigned domain) { +static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) { for (unsigned i = 0, e = array_lengthof(ReplaceableInstrsAVX2); i != e; ++i) if (ReplaceableInstrsAVX2[i][domain-1] == opcode) return ReplaceableInstrsAVX2[i]; @@ -3718,7 +3724,7 @@ void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { assert(Domain>0 && Domain<4 && "Invalid execution domain"); uint16_t dom = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3; assert(dom && "Not an SSE instruction"); - const unsigned *table = lookup(MI->getOpcode(), dom); + const uint16_t *table = lookup(MI->getOpcode(), dom); if (!table) { // try the other table assert((TM.getSubtarget<X86Subtarget>().hasAVX2() || Domain < 3) && "256-bit vector operations only available in AVX2"); diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index d065d2d..b23d756 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -14,10 +14,10 @@ #ifndef X86INSTRUCTIONINFO_H #define X86INSTRUCTIONINFO_H -#include "llvm/Target/TargetInstrInfo.h" #include "X86.h" #include "X86RegisterInfo.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/Target/TargetInstrInfo.h" #define GET_INSTRINFO_HEADER #include "X86GenInstrInfo.inc" diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index f585b47..dd7cf50 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -1856,19 +1856,19 @@ def : InstAlias<"outl $port", (OUT32ir i8imm:$port)>; def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem)>; // shld/shrd op,op -> shld op, op, CL -def : InstAlias<"shldw $r1, $r2", (SHLD16rrCL GR16:$r1, GR16:$r2)>; -def : InstAlias<"shldl $r1, $r2", (SHLD32rrCL GR32:$r1, GR32:$r2)>; -def : InstAlias<"shldq $r1, $r2", (SHLD64rrCL GR64:$r1, GR64:$r2)>; -def : InstAlias<"shrdw $r1, $r2", (SHRD16rrCL GR16:$r1, GR16:$r2)>; -def : InstAlias<"shrdl $r1, $r2", (SHRD32rrCL GR32:$r1, GR32:$r2)>; -def : InstAlias<"shrdq $r1, $r2", (SHRD64rrCL GR64:$r1, GR64:$r2)>; - -def : InstAlias<"shldw $mem, $reg", (SHLD16mrCL i16mem:$mem, GR16:$reg)>; -def : InstAlias<"shldl $mem, $reg", (SHLD32mrCL i32mem:$mem, GR32:$reg)>; -def : InstAlias<"shldq $mem, $reg", (SHLD64mrCL i64mem:$mem, GR64:$reg)>; -def : InstAlias<"shrdw $mem, $reg", (SHRD16mrCL i16mem:$mem, GR16:$reg)>; -def : InstAlias<"shrdl $mem, $reg", (SHRD32mrCL i32mem:$mem, GR32:$reg)>; -def : InstAlias<"shrdq $mem, $reg", (SHRD64mrCL i64mem:$mem, GR64:$reg)>; +def : InstAlias<"shldw $r2, $r1", (SHLD16rrCL GR16:$r1, GR16:$r2)>; +def : InstAlias<"shldl $r2, $r1", (SHLD32rrCL GR32:$r1, GR32:$r2)>; +def : InstAlias<"shldq $r2, $r1", (SHLD64rrCL GR64:$r1, GR64:$r2)>; +def : InstAlias<"shrdw $r2, $r1", (SHRD16rrCL GR16:$r1, GR16:$r2)>; +def : InstAlias<"shrdl $r2, $r1", (SHRD32rrCL GR32:$r1, GR32:$r2)>; +def : InstAlias<"shrdq $r2, $r1", (SHRD64rrCL GR64:$r1, GR64:$r2)>; + +def : InstAlias<"shldw $reg, $mem", (SHLD16mrCL i16mem:$mem, GR16:$reg)>; +def : InstAlias<"shldl $reg, $mem", (SHLD32mrCL i32mem:$mem, GR32:$reg)>; +def : InstAlias<"shldq $reg, $mem", (SHLD64mrCL i64mem:$mem, GR64:$reg)>; +def : InstAlias<"shrdw $reg, $mem", (SHRD16mrCL i16mem:$mem, GR16:$reg)>; +def : 
InstAlias<"shrdl $reg, $mem", (SHRD32mrCL i32mem:$mem, GR32:$reg)>; +def : InstAlias<"shrdq $reg, $mem", (SHRD64mrCL i64mem:$mem, GR64:$reg)>; /* FIXME: This is disabled because the asm matcher is currently incapable of * matching a fixed immediate like $1. diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index c6d1d19..df42627 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -641,7 +641,7 @@ let Predicates = [HasAVX] in { (VMOVSDrr (v2i64 (V_SET0)), (EXTRACT_SUBREG (v4i64 VR256:$src), sub_sd)), sub_xmm)>; -// Extract and store. + // Extract and store. def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), addr:$dst), (VMOVSSmr addr:$dst, @@ -2306,7 +2306,7 @@ let Defs = [EFLAGS] in { "comisd", SSEPackedDouble>, TB, OpSize; } // Defs = [EFLAGS] -// sse12_cmp_packed - sse 1 & 2 compared packed instructions +// sse12_cmp_packed - sse 1 & 2 compare packed instructions multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int, string asm, string asm_alt, Domain d> { @@ -4820,8 +4820,10 @@ let Predicates = [HasSSE2], AddedComplexity = 20 in { } let Predicates = [HasAVX] in { -def : Pat<(v4i64 (X86vzload addr:$src)), +def : Pat<(v4i64 (alignedX86vzload addr:$src)), (SUBREG_TO_REG (i32 0), (VMOVAPSrm addr:$src), sub_xmm)>; +def : Pat<(v4i64 (X86vzload addr:$src)), + (SUBREG_TO_REG (i32 0), (VMOVUPSrm addr:$src), sub_xmm)>; } //===---------------------------------------------------------------------===// @@ -7307,6 +7309,24 @@ def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs), []>, VEX; } +// Extract and store. +let Predicates = [HasAVX] in { + def : Pat<(alignedstore (int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>; + def : Pat<(alignedstore (int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>; + def : Pat<(alignedstore (int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2), addr:$dst), + (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>; + + def : Pat<(int_x86_sse_storeu_ps addr:$dst, (int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2)), + (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>; + def : Pat<(int_x86_sse2_storeu_pd addr:$dst, (int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2)), + (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>; + def : Pat<(int_x86_sse2_storeu_dq addr:$dst, (bc_v16i8 (int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2))), + (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>; +} + +// AVX1 patterns let Predicates = [HasAVX] in { def : Pat<(int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2), (VEXTRACTF128rr VR256:$src1, imm:$src2)>; @@ -7314,6 +7334,31 @@ def : Pat<(int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2), (VEXTRACTF128rr VR256:$src1, imm:$src2)>; def : Pat<(int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2), (VEXTRACTF128rr VR256:$src1, imm:$src2)>; + +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v4f32 (VEXTRACTF128rr + (v8f32 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v2f64 (VEXTRACTF128rr + (v4f64 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v2i64 (VEXTRACTF128rr + (v4i64 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 
imm)), + (v4i32 (VEXTRACTF128rr + (v8i32 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v8i16 (VEXTRACTF128rr + (v16i16 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v16i8 (VEXTRACTF128rr + (v32i8 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; } //===----------------------------------------------------------------------===// @@ -7711,7 +7756,7 @@ def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst), (int_x86_avx2_vinserti128 VR256:$src1, (memopv2i64 addr:$src2), imm:$src3))]>, VEX_4V; -let Predicates = [HasAVX2] in { +let Predicates = [HasAVX2], AddedComplexity = 1 in { def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), (i32 imm)), (VINSERTI128rr VR256:$src1, VR128:$src2, @@ -7756,6 +7801,19 @@ def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), (i32 imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, (INSERT_get_vinsertf128_imm VR256:$ins))>; + +def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (loadv4f32 addr:$src2), + (i32 imm)), + (VINSERTF128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2), + (i32 imm)), + (VINSERTF128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2), + (i32 imm)), + (VINSERTF128rm VR256:$src1, addr:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; } //===----------------------------------------------------------------------===// @@ -7791,34 +7849,6 @@ def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), (EXTRACT_get_vextractf128_imm VR128:$ext)))>; } -// AVX1 patterns -let Predicates = [HasAVX] in { -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), - (v4f32 (VEXTRACTF128rr - (v8f32 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), - (v2f64 (VEXTRACTF128rr - (v4f64 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), - (v2i64 (VEXTRACTF128rr - (v4i64 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), - (v4i32 (VEXTRACTF128rr - (v8i32 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), - (v8i16 (VEXTRACTF128rr - (v16i16 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), - (v16i8 (VEXTRACTF128rr - (v32i8 VR256:$src1), - (EXTRACT_get_vextractf128_imm VR128:$ext)))>; -} - //===----------------------------------------------------------------------===// // VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores // diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td index 8843848..bddba6c 100644 --- a/lib/Target/X86/X86InstrSystem.td +++ b/lib/Target/X86/X86InstrSystem.td @@ -45,17 +45,17 @@ def INT : Ii8<0xcd, RawFrm, (outs), (ins i8imm:$trap), "int\t$trap", def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", []>, TB; -def SYSRETL : I<0x07, RawFrm, (outs), (ins), "sysretl", []>, TB; -def SYSRETQ :RI<0x07, RawFrm, (outs), (ins), "sysretq", []>, TB, +def SYSRET : I<0x07, RawFrm, (outs), (ins), "sysret{l}", []>, 
TB; +def SYSRET64 :RI<0x07, RawFrm, (outs), (ins), "sysret{q}", []>, TB, Requires<[In64BitMode]>; def SYSENTER : I<0x34, RawFrm, (outs), (ins), "sysenter", []>, TB; -def SYSEXIT : I<0x35, RawFrm, (outs), (ins), "sysexitl", []>, TB; -def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexitq", []>, TB, +def SYSEXIT : I<0x35, RawFrm, (outs), (ins), "sysexit{l}", []>, TB; +def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexit{q}", []>, TB, Requires<[In64BitMode]>; -def IRET16 : I<0xcf, RawFrm, (outs), (ins), "iretw", []>, OpSize; +def IRET16 : I<0xcf, RawFrm, (outs), (ins), "iret{w}", []>, OpSize; def IRET32 : I<0xcf, RawFrm, (outs), (ins), "iret{l|d}", []>; def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", []>, Requires<[In64BitMode]>; diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index a7a5c56..b578e8d 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -12,10 +12,11 @@ // //===----------------------------------------------------------------------===// -#include "InstPrinter/X86ATTInstPrinter.h" #include "X86MCInstLower.h" #include "X86AsmPrinter.h" #include "X86COFFMachineModuleInfo.h" +#include "InstPrinter/X86ATTInstPrinter.h" +#include "llvm/Type.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" @@ -26,7 +27,6 @@ #include "llvm/Target/Mangler.h" #include "llvm/Support/FormattedStream.h" #include "llvm/ADT/SmallString.h" -#include "llvm/Type.h" using namespace llvm; X86MCInstLower::X86MCInstLower(Mangler *mang, const MachineFunction &mf, diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 93e2744..b56025f 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -13,8 +13,8 @@ // //===----------------------------------------------------------------------===// -#include "X86.h" #include "X86RegisterInfo.h" +#include "X86.h" #include "X86InstrBuilder.h" #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td index d6d0149..17f4efd 100644 --- a/lib/Target/X86/X86Schedule.td +++ b/lib/Target/X86/X86Schedule.td @@ -114,6 +114,9 @@ def IIC_MOVZX : InstrItinClass; def IIC_MOVZX_R16_R8 : InstrItinClass; def IIC_MOVZX_R16_M8 : InstrItinClass; +def IIC_REP_MOVS : InstrItinClass; +def IIC_REP_STOS : InstrItinClass; + // SSE scalar/parallel binary operations def IIC_SSE_ALU_F32S_RR : InstrItinClass; def IIC_SSE_ALU_F32S_RM : InstrItinClass; @@ -250,6 +253,14 @@ def IIC_SSE_CVT_SS2SI64_RR : InstrItinClass; def IIC_SSE_CVT_SD2SI_RM : InstrItinClass; def IIC_SSE_CVT_SD2SI_RR : InstrItinClass; +def IIC_CMPX_LOCK : InstrItinClass; +def IIC_CMPX_LOCK_8 : InstrItinClass; +def IIC_CMPX_LOCK_8B : InstrItinClass; +def IIC_CMPX_LOCK_16B : InstrItinClass; + +def IIC_XADD_LOCK_MEM : InstrItinClass; +def IIC_XADD_LOCK_MEM8 : InstrItinClass; + //===----------------------------------------------------------------------===// // Processor instruction itineraries. 
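The itinerary classes added above (IIC_REP_MOVS, IIC_REP_STOS, IIC_CMPX_LOCK*, IIC_XADD_LOCK_MEM*) give scheduling data to string and locked instructions that previously had none. At the source level these roughly correspond to constructs like the following sketch, which uses GCC/Clang __sync builtins; the exact instruction selection is target and flag dependent:

#include <cstdio>
#include <cstring>

int main() {
  // Large memcpy/memset calls can lower to rep movs / rep stos.
  char src[64] = "hello", dst[64];
  std::memcpy(dst, src, sizeof(src));

  // Atomic fetch-and-add typically lowers to lock xadd.
  int counter = 0;
  int old = __sync_fetch_and_add(&counter, 1);

  // Atomic compare-and-swap typically lowers to lock cmpxchg.
  bool swapped = __sync_bool_compare_and_swap(&counter, 1, 2);

  std::printf("%s %d %d %d\n", dst, old, counter, (int)swapped);
  return 0;
}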
diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td index e8cf72a..77d4e56 100644 --- a/lib/Target/X86/X86ScheduleAtom.td +++ b/lib/Target/X86/X86ScheduleAtom.td @@ -144,6 +144,9 @@ def AtomItineraries : ProcessorItineraries< InstrItinData<IIC_MOVZX_R16_R8, [InstrStage<2, [Port0, Port1]>] >, InstrItinData<IIC_MOVZX_R16_M8, [InstrStage<3, [Port0, Port1]>] >, + InstrItinData<IIC_REP_MOVS, [InstrStage<75, [Port0, Port1]>] >, + InstrItinData<IIC_REP_STOS, [InstrStage<74, [Port0, Port1]>] >, + // SSE binary operations // arithmetic fp scalar InstrItinData<IIC_SSE_ALU_F32S_RR, [InstrStage<5, [Port1]>] >, @@ -289,6 +292,14 @@ def AtomItineraries : ProcessorItineraries< InstrItinData<IIC_SSE_CVT_SS2SI64_RR, [InstrStage<9, [Port0, Port1]>] >, InstrItinData<IIC_SSE_CVT_SS2SI64_RM, [InstrStage<10, [Port0, Port1]>] >, InstrItinData<IIC_SSE_CVT_SD2SI_RR, [InstrStage<8, [Port0, Port1]>] >, - InstrItinData<IIC_SSE_CVT_SD2SI_RM, [InstrStage<9, [Port0, Port1]>] > -]>; + InstrItinData<IIC_SSE_CVT_SD2SI_RM, [InstrStage<9, [Port0, Port1]>] >, + + InstrItinData<IIC_CMPX_LOCK, [InstrStage<14, [Port0, Port1]>] >, + InstrItinData<IIC_CMPX_LOCK_8, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_CMPX_LOCK_8B, [InstrStage<18, [Port0, Port1]>] >, + InstrItinData<IIC_CMPX_LOCK_16B, [InstrStage<22, [Port0, Port1]>] >, + + InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<2, [Port0, Port1]>] >, + InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<3, [Port0, Port1]>] > + ]>; diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index a36d0d8..7fd832b 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -14,9 +14,9 @@ #ifndef X86SUBTARGET_H #define X86SUBTARGET_H +#include "llvm/CallingConv.h" #include "llvm/ADT/Triple.h" #include "llvm/Target/TargetSubtargetInfo.h" -#include "llvm/CallingConv.h" #include <string> #define GET_SUBTARGETINFO_HEADER diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index 143caba..8e935af 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -28,7 +28,6 @@ namespace llvm { -class formatted_raw_ostream; class StringRef; class X86TargetMachine : public LLVMTargetMachine { diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h index ceb7a4a..a02a368 100644 --- a/lib/Target/X86/X86TargetObjectFile.h +++ b/lib/Target/X86/X86TargetObjectFile.h @@ -15,7 +15,6 @@ #include "llvm/Target/TargetLoweringObjectFile.h" namespace llvm { - class X86TargetMachine; /// X8664_MachoTargetObjectFile - This TLOF implementation is used for Darwin /// x86-64. 
diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp index 4d8ef74..50fda58 100644 --- a/lib/Target/XCore/XCoreFrameLowering.cpp +++ b/lib/Target/XCore/XCoreFrameLowering.cpp @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#include "XCore.h" #include "XCoreFrameLowering.h" +#include "XCore.h" #include "XCoreInstrInfo.h" #include "XCoreMachineFunctionInfo.h" #include "llvm/Function.h" diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp index c2d2a5d..593cebc 100644 --- a/lib/Target/XCore/XCoreISelLowering.cpp +++ b/lib/Target/XCore/XCoreISelLowering.cpp @@ -1152,7 +1152,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain, if (isVarArg) { /* Argument registers */ - static const unsigned ArgRegs[] = { + static const uint16_t ArgRegs[] = { XCore::R0, XCore::R1, XCore::R2, XCore::R3 }; XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>(); diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h index f5a6822..5cd3e67 100644 --- a/lib/Target/XCore/XCoreISelLowering.h +++ b/lib/Target/XCore/XCoreISelLowering.h @@ -15,9 +15,9 @@ #ifndef XCOREISELLOWERING_H #define XCOREISELLOWERING_H +#include "XCore.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/Target/TargetLowering.h" -#include "XCore.h" namespace llvm { diff --git a/lib/Target/XCore/XCoreInstrInfo.cpp b/lib/Target/XCore/XCoreInstrInfo.cpp index f930623..0a3008d 100644 --- a/lib/Target/XCore/XCoreInstrInfo.cpp +++ b/lib/Target/XCore/XCoreInstrInfo.cpp @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#include "XCoreMachineFunctionInfo.h" #include "XCoreInstrInfo.h" +#include "XCoreMachineFunctionInfo.h" #include "XCore.h" #include "llvm/MC/MCContext.h" #include "llvm/CodeGen/MachineInstrBuilder.h" diff --git a/lib/Target/XCore/XCoreInstrInfo.h b/lib/Target/XCore/XCoreInstrInfo.h index e47d212..42eeed8 100644 --- a/lib/Target/XCore/XCoreInstrInfo.h +++ b/lib/Target/XCore/XCoreInstrInfo.h @@ -14,8 +14,8 @@ #ifndef XCOREINSTRUCTIONINFO_H #define XCOREINSTRUCTIONINFO_H -#include "llvm/Target/TargetInstrInfo.h" #include "XCoreRegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" #define GET_INSTRINFO_HEADER #include "XCoreGenInstrInfo.inc" diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp index 8730282..f3b4b4c 100644 --- a/lib/Target/XCore/XCoreRegisterInfo.cpp +++ b/lib/Target/XCore/XCoreRegisterInfo.cpp @@ -14,6 +14,8 @@ #include "XCoreRegisterInfo.h" #include "XCoreMachineFunctionInfo.h" #include "XCore.h" +#include "llvm/Type.h" +#include "llvm/Function.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -24,8 +26,6 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Type.h" -#include "llvm/Function.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/Debug.h" @@ -54,20 +54,6 @@ static inline bool isImmU16(unsigned val) { return val < (1 << 16); } -static const unsigned XCore_ArgRegs[] = { - XCore::R0, XCore::R1, XCore::R2, XCore::R3 -}; - -const unsigned * XCoreRegisterInfo::getArgRegs(const MachineFunction *MF) -{ - return XCore_ArgRegs; -} - -unsigned XCoreRegisterInfo::getNumArgRegs(const MachineFunction *MF) -{ - return array_lengthof(XCore_ArgRegs); 
-} - bool XCoreRegisterInfo::needsFrameMoves(const MachineFunction &MF) { return MF.getMMI().hasDebugInfo() || MF.getFunction()->needsUnwindTableEntry(); diff --git a/lib/Target/XCore/XCoreRegisterInfo.h b/lib/Target/XCore/XCoreRegisterInfo.h index ab6ce56..7391cfd 100644 --- a/lib/Target/XCore/XCoreRegisterInfo.h +++ b/lib/Target/XCore/XCoreRegisterInfo.h @@ -62,15 +62,6 @@ public: // Debug information queries. unsigned getFrameRegister(const MachineFunction &MF) const; - //! Return the array of argument passing registers - /*! - \note The size of this array is returned by getArgRegsSize(). - */ - static const unsigned *getArgRegs(const MachineFunction *MF = 0); - - //! Return the size of the argument passing register array - static unsigned getNumArgRegs(const MachineFunction *MF = 0); - //! Return whether to emit frame moves static bool needsFrameMoves(const MachineFunction &MF); }; diff --git a/lib/Target/XCore/XCoreTargetMachine.h b/lib/Target/XCore/XCoreTargetMachine.h index 2c174f4..2546681 100644 --- a/lib/Target/XCore/XCoreTargetMachine.h +++ b/lib/Target/XCore/XCoreTargetMachine.h @@ -14,13 +14,13 @@ #ifndef XCORETARGETMACHINE_H #define XCORETARGETMACHINE_H -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetData.h" #include "XCoreFrameLowering.h" #include "XCoreSubtarget.h" #include "XCoreInstrInfo.h" #include "XCoreISelLowering.h" #include "XCoreSelectionDAGInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetData.h" namespace llvm { diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index 58ab567..a32e550 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -2561,11 +2561,6 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, return false; delete ValueStack.pop_back_val(); InstResult = RetVal; - - if (InvokeInst *II = dyn_cast<InvokeInst>(CurInst)) { - NextBB = II->getNormalDest(); - return true; - } } } else if (isa<TerminatorInst>(CurInst)) { if (BranchInst *BI = dyn_cast<BranchInst>(CurInst)) { @@ -2582,8 +2577,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, ConstantInt *Val = dyn_cast<ConstantInt>(getVal(SI->getCondition())); if (!Val) return false; // Cannot determine. - unsigned ValTISucc = SI->resolveSuccessorIndex(SI->findCaseValue(Val)); - NextBB = SI->getSuccessor(ValTISucc); + NextBB = SI->findCaseValue(Val).getCaseSuccessor(); } else if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(CurInst)) { Value *Val = getVal(IBI->getAddress())->stripPointerCasts(); if (BlockAddress *BA = dyn_cast<BlockAddress>(Val)) @@ -2611,6 +2605,12 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, setVal(CurInst, InstResult); } + // If we just processed an invoke, we finished evaluating the block. + if (InvokeInst *II = dyn_cast<InvokeInst>(CurInst)) { + NextBB = II->getNormalDest(); + return true; + } + // Advance program counter. ++CurInst; } diff --git a/lib/Transforms/IPO/InlineAlways.cpp b/lib/Transforms/IPO/InlineAlways.cpp index 749a3fa..3c7fac6 100644 --- a/lib/Transforms/IPO/InlineAlways.cpp +++ b/lib/Transforms/IPO/InlineAlways.cpp @@ -32,8 +32,6 @@ namespace { // AlwaysInliner only inlines functions that are mark as "always inline". class AlwaysInliner : public Inliner { - // Functions that are never inlined - SmallPtrSet<const Function*, 16> NeverInline; InlineCostAnalyzer CA; public: // Use extremely low threshold. 
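For context on the AlwaysInliner changes above and in the next hunk: with the precomputed NeverInline set removed, the pass decides per call site by checking the always-inline attribute on the callee. A minimal source-level illustration, assuming the GCC/Clang attribute spelling below:

#include <cstdio>

// always_inline requests inlining regardless of the usual cost model, which is
// what the AlwaysInliner pass implements at the IR level.
__attribute__((always_inline)) inline int square(int x) { return x * x; }

int main() {
  std::printf("%d\n", square(7));  // expected to be inlined even at low -O levels
  return 0;
}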
@@ -46,7 +44,22 @@ namespace { } static char ID; // Pass identification, replacement for typeid InlineCost getInlineCost(CallSite CS) { - return CA.getInlineCost(CS, NeverInline); + Function *Callee = CS.getCalledFunction(); + // We assume indirect calls aren't calling an always-inline function. + if (!Callee) return InlineCost::getNever(); + + // We can't inline calls to external functions. + // FIXME: We shouldn't even get here. + if (Callee->isDeclaration()) return InlineCost::getNever(); + + // Return never for anything not marked as always inline. + if (!Callee->hasFnAttr(Attribute::AlwaysInline)) + return InlineCost::getNever(); + + // We still have to check the inline cost in case there are reasons to + // not inline which trump the always-inline attribute such as setjmp and + // indirectbr. + return CA.getInlineCost(CS); } float getInlineFudgeFactor(CallSite CS) { return CA.getInlineFudgeFactor(CS); @@ -58,7 +71,7 @@ namespace { CA.growCachedCostInfo(Caller, Callee); } virtual bool doFinalization(CallGraph &CG) { - return removeDeadFunctions(CG, &NeverInline); + return removeDeadFunctions(CG, /*AlwaysInlineOnly=*/true); } virtual bool doInitialization(CallGraph &CG); void releaseMemory() { @@ -84,12 +97,5 @@ Pass *llvm::createAlwaysInlinerPass(bool InsertLifetime) { // been annotated with the "always inline" attribute. bool AlwaysInliner::doInitialization(CallGraph &CG) { CA.setTargetData(getAnalysisIfAvailable<TargetData>()); - - Module &M = CG.getModule(); - - for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) - if (!I->isDeclaration() && !I->hasFnAttr(Attribute::AlwaysInline)) - NeverInline.insert(I); - return false; } diff --git a/lib/Transforms/IPO/InlineSimple.cpp b/lib/Transforms/IPO/InlineSimple.cpp index b3421eb..03032e6 100644 --- a/lib/Transforms/IPO/InlineSimple.cpp +++ b/lib/Transforms/IPO/InlineSimple.cpp @@ -23,15 +23,12 @@ #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/InlinerPass.h" #include "llvm/Target/TargetData.h" -#include "llvm/ADT/SmallPtrSet.h" using namespace llvm; namespace { class SimpleInliner : public Inliner { - // Functions that are never inlined - SmallPtrSet<const Function*, 16> NeverInline; InlineCostAnalyzer CA; public: SimpleInliner() : Inliner(ID) { @@ -43,7 +40,7 @@ namespace { } static char ID; // Pass identification, replacement for typeid InlineCost getInlineCost(CallSite CS) { - return CA.getInlineCost(CS, NeverInline); + return CA.getInlineCost(CS); } float getInlineFudgeFactor(CallSite CS) { return CA.getInlineFudgeFactor(CS); @@ -78,44 +75,6 @@ Pass *llvm::createFunctionInliningPass(int Threshold) { // annotated with the noinline attribute. 
bool SimpleInliner::doInitialization(CallGraph &CG) { CA.setTargetData(getAnalysisIfAvailable<TargetData>()); - - Module &M = CG.getModule(); - - for (Module::iterator I = M.begin(), E = M.end(); - I != E; ++I) - if (!I->isDeclaration() && I->hasFnAttr(Attribute::NoInline)) - NeverInline.insert(I); - - // Get llvm.noinline - GlobalVariable *GV = M.getNamedGlobal("llvm.noinline"); - - if (GV == 0) - return false; - - // Don't crash on invalid code - if (!GV->hasDefinitiveInitializer()) - return false; - - const ConstantArray *InitList = dyn_cast<ConstantArray>(GV->getInitializer()); - - if (InitList == 0) - return false; - - // Iterate over each element and add to the NeverInline set - for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) { - - // Get Source - const Constant *Elt = InitList->getOperand(i); - - if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(Elt)) - if (CE->getOpcode() == Instruction::BitCast) - Elt = CE->getOperand(0); - - // Insert into set of functions to never inline - if (const Function *F = dyn_cast<Function>(Elt)) - NeverInline.insert(F); - } - return false; } diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp index 1f7625d..9975333 100644 --- a/lib/Transforms/IPO/Inliner.cpp +++ b/lib/Transforms/IPO/Inliner.cpp @@ -19,6 +19,7 @@ #include "llvm/IntrinsicInst.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Target/TargetData.h" #include "llvm/Transforms/IPO/InlinerPass.h" #include "llvm/Transforms/Utils/Cloning.h" @@ -243,13 +244,20 @@ bool Inliner::shouldInline(CallSite CS) { return false; } - // Try to detect the case where the current inlining candidate caller - // (call it B) is a static function and is an inlining candidate elsewhere, - // and the current candidate callee (call it C) is large enough that - // inlining it into B would make B too big to inline later. In these - // circumstances it may be best not to inline C into B, but to inline B - // into its callers. - if (Caller->hasLocalLinkage()) { + // Try to detect the case where the current inlining candidate caller (call + // it B) is a static or linkonce-ODR function and is an inlining candidate + // elsewhere, and the current candidate callee (call it C) is large enough + // that inlining it into B would make B too big to inline later. In these + // circumstances it may be best not to inline C into B, but to inline B into + // its callers. + // + // This only applies to static and linkonce-ODR functions because those are + // expected to be available for inlining in the translation units where they + // are used. Thus we will always have the opportunity to make local inlining + // decisions. Importantly the linkonce-ODR linkage covers inline functions + // and templates in C++. + if (Caller->hasLocalLinkage() || + Caller->getLinkage() == GlobalValue::LinkOnceODRLinkage) { int TotalSecondaryCost = 0; bool outerCallsFound = false; // This bool tracks what happens if we do NOT inline C into B. @@ -327,6 +335,37 @@ static bool InlineHistoryIncludes(Function *F, int InlineHistoryID, return false; } +/// \brief Simplify arguments going into a particular callsite. +/// +/// This is important to do each time we add a callsite due to inlining so that +/// constants and other entities which feed into inline cost estimation are +/// properly recognized when analyzing the new callsite. Consider: +/// void outer(int x) { +/// if (x < 42) +/// return inner(42 - x); +/// ... 
+/// } +/// void inner(int x) { +/// ... +/// } +/// +/// The inliner gives calls to 'outer' with a constant argument a bonus because +/// it will delete one side of a branch. But the resulting call to 'inner' +/// will, after inlining, also have a constant operand. We need to do just +/// enough constant folding to expose this for callsite arguments. The rest +/// will be taken care of after the inliner finishes running. +static void simplifyCallSiteArguments(const TargetData *TD, CallSite CS) { + // FIXME: It would be nice to avoid this smallvector if RAUW doesn't + // invalidate operand iterators in any cases. + SmallVector<std::pair<Value *, Value*>, 4> SimplifiedArgs; + for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); + I != E; ++I) + if (Instruction *Inst = dyn_cast<Instruction>(*I)) + if (Value *SimpleArg = SimplifyInstruction(Inst, TD)) + SimplifiedArgs.push_back(std::make_pair(Inst, SimpleArg)); + for (unsigned Idx = 0, Size = SimplifiedArgs.size(); Idx != Size; ++Idx) + SimplifiedArgs[Idx].first->replaceAllUsesWith(SimplifiedArgs[Idx].second); +} bool Inliner::runOnSCC(CallGraphSCC &SCC) { CallGraph &CG = getAnalysis<CallGraph>(); @@ -455,7 +494,9 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) { for (unsigned i = 0, e = InlineInfo.InlinedCalls.size(); i != e; ++i) { Value *Ptr = InlineInfo.InlinedCalls[i]; - CallSites.push_back(std::make_pair(CallSite(Ptr), NewHistoryID)); + CallSite NewCS = Ptr; + simplifyCallSiteArguments(TD, NewCS); + CallSites.push_back(std::make_pair(NewCS, NewHistoryID)); } } @@ -515,25 +556,27 @@ bool Inliner::doFinalization(CallGraph &CG) { /// removeDeadFunctions - Remove dead functions that are not included in /// DNR (Do Not Remove) list. -bool Inliner::removeDeadFunctions(CallGraph &CG, - SmallPtrSet<const Function *, 16> *DNR) { - SmallPtrSet<CallGraphNode*, 16> FunctionsToRemove; +bool Inliner::removeDeadFunctions(CallGraph &CG, bool AlwaysInlineOnly) { + SmallVector<CallGraphNode*, 16> FunctionsToRemove; // Scan for all of the functions, looking for ones that should now be removed // from the program. Insert the dead ones in the FunctionsToRemove set. for (CallGraph::iterator I = CG.begin(), E = CG.end(); I != E; ++I) { CallGraphNode *CGN = I->second; - if (CGN->getFunction() == 0) - continue; - Function *F = CGN->getFunction(); - + if (!F || F->isDeclaration()) + continue; + + // Handle the case when this function is called and we only want to care + // about always-inline functions. This is a bit of a hack to share code + // between here and the InlineAlways pass. + if (AlwaysInlineOnly && !F->hasFnAttr(Attribute::AlwaysInline)) + continue; + // If the only remaining users of the function are dead constants, remove // them. F->removeDeadConstantUsers(); - if (DNR && DNR->count(F)) - continue; if (!F->isDefTriviallyDead()) continue; @@ -546,24 +589,28 @@ bool Inliner::removeDeadFunctions(CallGraph &CG, CG.getExternalCallingNode()->removeAnyCallEdgeTo(CGN); // Removing the node for callee from the call graph and delete it. - FunctionsToRemove.insert(CGN); + FunctionsToRemove.push_back(CGN); } + if (FunctionsToRemove.empty()) + return false; // Now that we know which functions to delete, do so. We didn't want to do // this inline, because that would invalidate our CallGraph::iterator // objects. 
:( // - // Note that it doesn't matter that we are iterating over a non-stable set + // Note that it doesn't matter that we are iterating over a non-stable order // here to do this, it doesn't matter which order the functions are deleted // in. - bool Changed = false; - for (SmallPtrSet<CallGraphNode*, 16>::iterator I = FunctionsToRemove.begin(), - E = FunctionsToRemove.end(); I != E; ++I) { + std::sort(FunctionsToRemove.begin(), FunctionsToRemove.end()); + FunctionsToRemove.erase(std::unique(FunctionsToRemove.begin(), + FunctionsToRemove.end()), + FunctionsToRemove.end()); + for (SmallVectorImpl<CallGraphNode *>::iterator I = FunctionsToRemove.begin(), + E = FunctionsToRemove.end(); + I != E; ++I) { resetCachedCostInfo((*I)->getFunction()); delete CG.removeFunctionFromModule(*I); ++NumDeleted; - Changed = true; } - - return Changed; + return true; } diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index cc8f5bf..1165660 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1929,8 +1929,11 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { } // Canonicalize xor to the RHS. - if (match(Op0, m_Xor(m_Value(), m_Value()))) + bool SwappedForXor = false; + if (match(Op0, m_Xor(m_Value(), m_Value()))) { std::swap(Op0, Op1); + SwappedForXor = true; + } // A | ( A ^ B) -> A | B // A | (~A ^ B) -> A | ~B @@ -1961,6 +1964,9 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { return BinaryOperator::CreateOr(Not, Op0); } + if (SwappedForXor) + std::swap(Op0, Op1); + if (ICmpInst *RHS = dyn_cast<ICmpInst>(I.getOperand(1))) if (ICmpInst *LHS = dyn_cast<ICmpInst>(I.getOperand(0))) if (Value *Res = FoldOrOfICmps(LHS, RHS)) diff --git a/lib/Transforms/InstCombine/InstCombineWorklist.h b/lib/Transforms/InstCombine/InstCombineWorklist.h index 32009c3..99a02fc 100644 --- a/lib/Transforms/InstCombine/InstCombineWorklist.h +++ b/lib/Transforms/InstCombine/InstCombineWorklist.h @@ -55,9 +55,9 @@ public: Worklist.reserve(NumEntries+16); WorklistMap.resize(NumEntries); DEBUG(errs() << "IC: ADDING: " << NumEntries << " instrs to worklist\n"); - for (; NumEntries; --NumEntries) { + for (unsigned Idx = 0; NumEntries; --NumEntries) { Instruction *I = List[NumEntries-1]; - WorklistMap.insert(std::make_pair(I, Worklist.size())); + WorklistMap.insert(std::make_pair(I, Idx++)); Worklist.push_back(I); } } diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 318256a..349ba83 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1245,15 +1245,15 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) { if (I->getOpcode() == Instruction::Add) if (ConstantInt *AddRHS = dyn_cast<ConstantInt>(I->getOperand(1))) { // change 'switch (X+4) case 1:' into 'switch (X) case -3' - unsigned NumCases = SI.getNumCases(); // Skip the first item since that's the default case. 
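The Inliner::removeDeadFunctions rewrite above trades a SmallPtrSet for a SmallVector plus an explicit sort-and-unique pass before deletion; a minimal sketch of that dedup idiom with standard containers, the element type chosen only for illustration:

#include <algorithm>
#include <vector>

// Sort, drop adjacent duplicates, then erase the leftover tail: each element
// survives exactly once and the processing order becomes deterministic.
static void dedup(std::vector<int> &V) {
  std::sort(V.begin(), V.end());
  V.erase(std::unique(V.begin(), V.end()), V.end());
}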
- for (unsigned i = 0; i < NumCases; ++i) { - ConstantInt* CaseVal = SI.getCaseValue(i); + for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end(); + i != e; ++i) { + ConstantInt* CaseVal = i.getCaseValue(); Constant* NewCaseVal = ConstantExpr::getSub(cast<Constant>(CaseVal), AddRHS); assert(isa<ConstantInt>(NewCaseVal) && "Result of expression should be constant"); - SI.setCaseValue(i, cast<ConstantInt>(NewCaseVal)); + i.setValue(cast<ConstantInt>(NewCaseVal)); } SI.setCondition(I->getOperand(0)); Worklist.Add(I); @@ -1873,9 +1873,10 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { if (ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition())) { // See if this is an explicit destination. - for (unsigned i = 0, e = SI->getNumCases(); i != e; ++i) - if (SI->getCaseValue(i) == Cond) { - BasicBlock *ReachableBB = SI->getCaseSuccessor(i); + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); + i != e; ++i) + if (i.getCaseValue() == Cond) { + BasicBlock *ReachableBB = i.getCaseSuccessor(); Worklist.push_back(ReachableBB); continue; } diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 123e399..b43b9e5 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -15,6 +15,7 @@ #define DEBUG_TYPE "asan" +#include "FunctionBlackList.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/SmallSet.h" @@ -29,8 +30,6 @@ #include "llvm/Support/DataTypes.h" #include "llvm/Support/Debug.h" #include "llvm/Support/IRBuilder.h" -#include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/Regex.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/system_error.h" #include "llvm/Target/TargetData.h" @@ -126,21 +125,6 @@ static cl::opt<int> ClDebugMax("asan-debug-max", cl::desc("Debug man inst"), namespace { -// Blacklisted functions are not instrumented. -// The blacklist file contains one or more lines like this: -// --- -// fun:FunctionWildCard -// --- -// This is similar to the "ignore" feature of ThreadSanitizer. -// http://code.google.com/p/data-race-test/wiki/ThreadSanitizerIgnores -class BlackList { - public: - BlackList(const std::string &Path); - bool isIn(const Function &F); - private: - Regex *Functions; -}; - /// AddressSanitizer: instrument the code in module to find memory bugs. 
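Most of the switch-related hunks above and below replace index-based case access (getNumCases()/getCaseValue(i)/getCaseSuccessor(i)) with the SwitchInst::CaseIt iterator; a hedged sketch of the new-style loop, restricted to accessors that appear in this diff, with the function itself a made-up example:

#include "llvm/Instructions.h"
using namespace llvm;

// Visit every explicit (non-default) case of a switch. The iterator carries
// the case value, the successor block, and the case index together, replacing
// the parallel unsigned indexing used in the removed lines.
static void visitCases(SwitchInst *SI) {
  for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) {
    ConstantInt *CaseVal = i.getCaseValue();
    BasicBlock *Dest = i.getCaseSuccessor();
    (void)CaseVal;
    (void)Dest;
  }
}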
struct AddressSanitizer : public ModulePass { AddressSanitizer(); @@ -195,7 +179,7 @@ struct AddressSanitizer : public ModulePass { Function *AsanCtorFunction; Function *AsanInitFunction; Instruction *CtorInsertBefore; - OwningPtr<BlackList> BL; + OwningPtr<FunctionBlackList> BL; }; } // namespace @@ -470,7 +454,7 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) { GlobalVariable *G = GlobalsToChange[i]; PointerType *PtrTy = cast<PointerType>(G->getType()); Type *Ty = PtrTy->getElementType(); - uint64_t SizeInBytes = TD->getTypeStoreSizeInBits(Ty) / 8; + uint64_t SizeInBytes = TD->getTypeAllocSize(Ty); uint64_t RightRedzoneSize = RedzoneSize + (RedzoneSize - (SizeInBytes % RedzoneSize)); Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize); @@ -551,7 +535,7 @@ bool AddressSanitizer::runOnModule(Module &M) { TD = getAnalysisIfAvailable<TargetData>(); if (!TD) return false; - BL.reset(new BlackList(ClBlackListFile)); + BL.reset(new FunctionBlackList(ClBlackListFile)); CurrentModule = &M; C = &(M.getContext()); @@ -595,18 +579,23 @@ bool AddressSanitizer::runOnModule(Module &M) { if (ClGlobals) Res |= insertGlobalRedzones(M); - // Tell the run-time the current values of mapping offset and scale. - GlobalValue *asan_mapping_offset = - new GlobalVariable(M, IntptrTy, true, GlobalValue::LinkOnceODRLinkage, - ConstantInt::get(IntptrTy, MappingOffset), - kAsanMappingOffsetName); - GlobalValue *asan_mapping_scale = - new GlobalVariable(M, IntptrTy, true, GlobalValue::LinkOnceODRLinkage, - ConstantInt::get(IntptrTy, MappingScale), - kAsanMappingScaleName); - // Read these globals, otherwise they may be optimized away. - IRB.CreateLoad(asan_mapping_scale, true); - IRB.CreateLoad(asan_mapping_offset, true); + if (ClMappingOffsetLog >= 0) { + // Tell the run-time the current values of mapping offset and scale. + GlobalValue *asan_mapping_offset = + new GlobalVariable(M, IntptrTy, true, GlobalValue::LinkOnceODRLinkage, + ConstantInt::get(IntptrTy, MappingOffset), + kAsanMappingOffsetName); + // Read the global, otherwise it may be optimized away. + IRB.CreateLoad(asan_mapping_offset, true); + } + if (ClMappingScale) { + GlobalValue *asan_mapping_scale = + new GlobalVariable(M, IntptrTy, true, GlobalValue::LinkOnceODRLinkage, + ConstantInt::get(IntptrTy, MappingScale), + kAsanMappingScaleName); + // Read the global, otherwise it may be optimized away. 
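The insertGlobalRedzones hunk above sizes globals with getTypeAllocSize, which includes tail padding, rather than dividing the store size in bits by eight; the right-redzone arithmetic itself is unchanged. A standalone illustration of that arithmetic with made-up sizes:

#include <cstdint>
#include <cstdio>

// The right redzone pads the global up to the next RedzoneSize boundary and
// then adds one more full redzone, so it is always between RedzoneSize+1 and
// 2*RedzoneSize bytes.
int main() {
  const uint64_t RedzoneSize = 64;
  const uint64_t Sizes[] = {1, 24, 64, 100};
  for (unsigned i = 0; i != 4; ++i) {
    uint64_t SizeInBytes = Sizes[i];
    uint64_t RightRedzoneSize =
        RedzoneSize + (RedzoneSize - (SizeInBytes % RedzoneSize));
    std::printf("size %3llu -> right redzone %llu\n",
                (unsigned long long)SizeInBytes,
                (unsigned long long)RightRedzoneSize);
  }
  return 0;
}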
+ IRB.CreateLoad(asan_mapping_scale, true); + } for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { @@ -946,54 +935,3 @@ bool AddressSanitizer::poisonStackInFunction(Module &M, Function &F) { return true; } - -BlackList::BlackList(const std::string &Path) { - Functions = NULL; - const char *kFunPrefix = "fun:"; - if (!ClBlackListFile.size()) return; - std::string Fun; - - OwningPtr<MemoryBuffer> File; - if (error_code EC = MemoryBuffer::getFile(ClBlackListFile.c_str(), File)) { - report_fatal_error("Can't open blacklist file " + ClBlackListFile + ": " + - EC.message()); - } - MemoryBuffer *Buff = File.take(); - const char *Data = Buff->getBufferStart(); - size_t DataLen = Buff->getBufferSize(); - SmallVector<StringRef, 16> Lines; - SplitString(StringRef(Data, DataLen), Lines, "\n\r"); - for (size_t i = 0, numLines = Lines.size(); i < numLines; i++) { - if (Lines[i].startswith(kFunPrefix)) { - std::string ThisFunc = Lines[i].substr(strlen(kFunPrefix)); - std::string ThisFuncRE; - // add ThisFunc replacing * with .* - for (size_t j = 0, n = ThisFunc.size(); j < n; j++) { - if (ThisFunc[j] == '*') - ThisFuncRE += '.'; - ThisFuncRE += ThisFunc[j]; - } - // Check that the regexp is valid. - Regex CheckRE(ThisFuncRE); - std::string Error; - if (!CheckRE.isValid(Error)) - report_fatal_error("malformed blacklist regex: " + ThisFunc + - ": " + Error); - // Append to the final regexp. - if (Fun.size()) - Fun += "|"; - Fun += ThisFuncRE; - } - } - if (Fun.size()) { - Functions = new Regex(Fun); - } -} - -bool BlackList::isIn(const Function &F) { - if (Functions) { - bool Res = Functions->match(F.getName()); - return Res; - } - return false; -} diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt index f8dbca3..e4c8cf1 100644 --- a/lib/Transforms/Instrumentation/CMakeLists.txt +++ b/lib/Transforms/Instrumentation/CMakeLists.txt @@ -1,6 +1,7 @@ add_llvm_library(LLVMInstrumentation AddressSanitizer.cpp EdgeProfiling.cpp + FunctionBlackList.cpp GCOVProfiling.cpp Instrumentation.cpp OptimalEdgeProfiling.cpp diff --git a/lib/Transforms/Instrumentation/FunctionBlackList.cpp b/lib/Transforms/Instrumentation/FunctionBlackList.cpp new file mode 100644 index 0000000..188ea4d --- /dev/null +++ b/lib/Transforms/Instrumentation/FunctionBlackList.cpp @@ -0,0 +1,79 @@ +//===-- FunctionBlackList.cpp - blacklist of functions --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This is a utility class for instrumentation passes (like AddressSanitizer +// or ThreadSanitizer) to avoid instrumenting some functions based on +// user-supplied blacklist. 
+// +//===----------------------------------------------------------------------===// + +#include "FunctionBlackList.h" +#include "llvm/ADT/OwningPtr.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Function.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Regex.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/system_error.h" + +namespace llvm { + +FunctionBlackList::FunctionBlackList(const std::string &Path) { + Functions = NULL; + const char *kFunPrefix = "fun:"; + if (!Path.size()) return; + std::string Fun; + + OwningPtr<MemoryBuffer> File; + if (error_code EC = MemoryBuffer::getFile(Path.c_str(), File)) { + report_fatal_error("Can't open blacklist file " + Path + ": " + + EC.message()); + } + MemoryBuffer *Buff = File.take(); + const char *Data = Buff->getBufferStart(); + size_t DataLen = Buff->getBufferSize(); + SmallVector<StringRef, 16> Lines; + SplitString(StringRef(Data, DataLen), Lines, "\n\r"); + for (size_t i = 0, numLines = Lines.size(); i < numLines; i++) { + if (Lines[i].startswith(kFunPrefix)) { + std::string ThisFunc = Lines[i].substr(strlen(kFunPrefix)); + std::string ThisFuncRE; + // add ThisFunc replacing * with .* + for (size_t j = 0, n = ThisFunc.size(); j < n; j++) { + if (ThisFunc[j] == '*') + ThisFuncRE += '.'; + ThisFuncRE += ThisFunc[j]; + } + // Check that the regexp is valid. + Regex CheckRE(ThisFuncRE); + std::string Error; + if (!CheckRE.isValid(Error)) + report_fatal_error("malformed blacklist regex: " + ThisFunc + + ": " + Error); + // Append to the final regexp. + if (Fun.size()) + Fun += "|"; + Fun += ThisFuncRE; + } + } + if (Fun.size()) { + Functions = new Regex(Fun); + } +} + +bool FunctionBlackList::isIn(const Function &F) { + if (Functions) { + bool Res = Functions->match(F.getName()); + return Res; + } + return false; +} + +} // namespace llvm diff --git a/lib/Transforms/Instrumentation/FunctionBlackList.h b/lib/Transforms/Instrumentation/FunctionBlackList.h new file mode 100644 index 0000000..c1239b9 --- /dev/null +++ b/lib/Transforms/Instrumentation/FunctionBlackList.h @@ -0,0 +1,37 @@ +//===-- FunctionBlackList.cpp - blacklist of functions ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +//===----------------------------------------------------------------------===// +// +// This is a utility class for instrumentation passes (like AddressSanitizer +// or ThreadSanitizer) to avoid instrumenting some functions based on +// user-supplied blacklist. +// +//===----------------------------------------------------------------------===// +// + +#include <string> + +namespace llvm { +class Function; +class Regex; + +// Blacklisted functions are not instrumented. +// The blacklist file contains one or more lines like this: +// --- +// fun:FunctionWildCard +// --- +// This is similar to the "ignore" feature of ThreadSanitizer. 
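The new FunctionBlackList above turns each "fun:" wildcard into a regex fragment by inserting a '.' before every '*' and then joins the fragments with '|'; a standalone sketch of just that string transformation, with file parsing and llvm::Regex validation omitted and the names made up:

#include <iostream>
#include <string>
#include <vector>

// Convert patterns such as "my_*_helper" into one alternation like
// "my_.*_helper|do_not_instrument", mirroring the constructor loop above.
static std::string wildcardsToRegex(const std::vector<std::string> &Funcs) {
  std::string Joined;
  for (size_t i = 0; i != Funcs.size(); ++i) {
    std::string RE;
    for (size_t j = 0; j != Funcs[i].size(); ++j) {
      if (Funcs[i][j] == '*')
        RE += '.';        // '*' becomes ".*"
      RE += Funcs[i][j];
    }
    if (!Joined.empty())
      Joined += '|';
    Joined += RE;
  }
  return Joined;
}

int main() {
  std::vector<std::string> Funcs;
  Funcs.push_back("my_*_helper");
  Funcs.push_back("do_not_instrument");
  std::cout << wildcardsToRegex(Funcs) << "\n";
  return 0;
}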
+// http://code.google.com/p/data-race-test/wiki/ThreadSanitizerIgnores +class FunctionBlackList { + public: + FunctionBlackList(const std::string &Path); + bool isIn(const Function &F); + private: + Regex *Functions; +}; + +} // namespace llvm diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index d822535..85fda30 100644 --- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -21,12 +21,14 @@ #define DEBUG_TYPE "tsan" +#include "FunctionBlackList.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Intrinsics.h" #include "llvm/Function.h" #include "llvm/Module.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/IRBuilder.h" #include "llvm/Support/MathExtras.h" @@ -37,6 +39,9 @@ using namespace llvm; +static cl::opt<std::string> ClBlackListFile("tsan-blacklist", + cl::desc("Blacklist file"), cl::Hidden); + namespace { /// ThreadSanitizer: instrument the code in module to find races. struct ThreadSanitizer : public FunctionPass { @@ -48,6 +53,7 @@ struct ThreadSanitizer : public FunctionPass { private: TargetData *TD; + OwningPtr<FunctionBlackList> BL; // Callbacks to run-time library are computed in doInitialization. Value *TsanFuncEntry; Value *TsanFuncExit; @@ -76,6 +82,8 @@ bool ThreadSanitizer::doInitialization(Module &M) { TD = getAnalysisIfAvailable<TargetData>(); if (!TD) return false; + BL.reset(new FunctionBlackList(ClBlackListFile)); + // Always insert a call to __tsan_init into the module's CTORs. IRBuilder<> IRB(M.getContext()); Value *TsanInit = M.getOrInsertFunction("__tsan_init", @@ -102,6 +110,7 @@ bool ThreadSanitizer::doInitialization(Module &M) { bool ThreadSanitizer::runOnFunction(Function &F) { if (!TD) return false; + if (BL->isIn(F)) return false; SmallVector<Instruction*, 8> RetVec; SmallVector<Instruction*, 8> LoadsAndStores; bool Res = false; diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp index aad3a92..020ec57 100644 --- a/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -579,6 +579,15 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { return true; } + if (II && TLI) { + SmallVector<Value*, 2> PtrOps; + Type *AccessTy; + if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy)) + while (!PtrOps.empty()) + if (OptimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy)) + return true; + } + // From here on out we're working with named functions. 
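The sanitizer hunks above consume the helper the same way: construct it once from a command-line path and ask isIn(F) per function. A minimal usage sketch under those assumptions; the struct and its members here are illustrative, not the actual passes:

#include "FunctionBlackList.h"
#include "llvm/ADT/OwningPtr.h"
#include "llvm/Function.h"
#include <string>
using namespace llvm;

namespace {
struct MyInstrumenter {
  OwningPtr<FunctionBlackList> BL;

  // An empty path produces a blacklist that matches nothing.
  void init(const std::string &BlackListPath) {
    BL.reset(new FunctionBlackList(BlackListPath));
  }

  bool shouldInstrument(const Function &F) {
    return !BL->isIn(F);  // skip blacklisted functions entirely
  }
};
}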
if (CI->getCalledFunction() == 0) return false; diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index e275268..9b0aadb 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -28,6 +28,7 @@ STATISTIC(NumPhis, "Number of phis propagated"); STATISTIC(NumSelects, "Number of selects propagated"); STATISTIC(NumMemAccess, "Number of memory access targets propagated"); STATISTIC(NumCmps, "Number of comparisons propagated"); +STATISTIC(NumDeadCases, "Number of switch cases removed"); namespace { class CorrelatedValuePropagation : public FunctionPass { @@ -37,6 +38,7 @@ namespace { bool processPHI(PHINode *P); bool processMemAccess(Instruction *I); bool processCmp(CmpInst *C); + bool processSwitch(SwitchInst *SI); public: static char ID; @@ -110,7 +112,8 @@ bool CorrelatedValuePropagation::processPHI(PHINode *P) { Changed = true; } - ++NumPhis; + if (Changed) + ++NumPhis; return Changed; } @@ -173,6 +176,86 @@ bool CorrelatedValuePropagation::processCmp(CmpInst *C) { return true; } +/// processSwitch - Simplify a switch instruction by removing cases which can +/// never fire. If the uselessness of a case could be determined locally then +/// constant propagation would already have figured it out. Instead, walk the +/// predecessors and statically evaluate cases based on information available +/// on that edge. Cases that cannot fire no matter what the incoming edge can +/// safely be removed. If a case fires on every incoming edge then the entire +/// switch can be removed and replaced with a branch to the case destination. +bool CorrelatedValuePropagation::processSwitch(SwitchInst *SI) { + Value *Cond = SI->getCondition(); + BasicBlock *BB = SI->getParent(); + + // If the condition was defined in same block as the switch then LazyValueInfo + // currently won't say anything useful about it, though in theory it could. + if (isa<Instruction>(Cond) && cast<Instruction>(Cond)->getParent() == BB) + return false; + + // If the switch is unreachable then trying to improve it is a waste of time. + pred_iterator PB = pred_begin(BB), PE = pred_end(BB); + if (PB == PE) return false; + + // Analyse each switch case in turn. This is done in reverse order so that + // removing a case doesn't cause trouble for the iteration. + bool Changed = false; + for (SwitchInst::CaseIt CI = SI->case_end(), CE = SI->case_begin(); CI-- != CE; + ) { + ConstantInt *Case = CI.getCaseValue(); + + // Check to see if the switch condition is equal to/not equal to the case + // value on every incoming edge, equal/not equal being the same each time. + LazyValueInfo::Tristate State = LazyValueInfo::Unknown; + for (pred_iterator PI = PB; PI != PE; ++PI) { + // Is the switch condition equal to the case value? + LazyValueInfo::Tristate Value = LVI->getPredicateOnEdge(CmpInst::ICMP_EQ, + Cond, Case, *PI, BB); + // Give up on this case if nothing is known. + if (Value == LazyValueInfo::Unknown) { + State = LazyValueInfo::Unknown; + break; + } + + // If this was the first edge to be visited, record that all other edges + // need to give the same result. + if (PI == PB) { + State = Value; + continue; + } + + // If this case is known to fire for some edges and known not to fire for + // others then there is nothing we can do - give up. + if (Value != State) { + State = LazyValueInfo::Unknown; + break; + } + } + + if (State == LazyValueInfo::False) { + // This case never fires - remove it. 
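A source-level illustration, as a made-up example, of what the new CorrelatedValuePropagation::processSwitch above can prove: on the lone edge into the switch, LazyValueInfo knows the condition is positive, so some cases can never fire.

// With x > 0 on the only path into the switch, 'case -1' and 'case 0' are
// removable; if every incoming edge instead agreed that one case always
// fires, the whole switch would be replaced by a branch to that destination.
int classify(int x) {
  if (x > 0) {
    switch (x) {
    case -1: return 100;  // provably dead on this edge
    case 0:  return 200;  // provably dead on this edge
    case 1:  return 1;
    default: return 2;
    }
  }
  return 0;
}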
+ CI.getCaseSuccessor()->removePredecessor(BB); + SI->removeCase(CI); // Does not invalidate the iterator. + ++NumDeadCases; + Changed = true; + } else if (State == LazyValueInfo::True) { + // This case always fires. Arrange for the switch to be turned into an + // unconditional branch by replacing the switch condition with the case + // value. + SI->setCondition(Case); + NumDeadCases += SI->getNumCases(); + Changed = true; + break; + } + } + + if (Changed) + // If the switch has been simplified to the point where it can be replaced + // by a branch then do so now. + ConstantFoldTerminator(BB); + + return Changed; +} + bool CorrelatedValuePropagation::runOnFunction(Function &F) { LVI = &getAnalysis<LazyValueInfo>(); @@ -200,6 +283,13 @@ bool CorrelatedValuePropagation::runOnFunction(Function &F) { } } + Instruction *Term = FI->getTerminator(); + switch (Term->getOpcode()) { + case Instruction::Switch: + BBChanged |= processSwitch(cast<SwitchInst>(Term)); + break; + } + FnChanged |= BBChanged; } diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index fe05e35..ac80c48 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -2158,10 +2158,11 @@ bool GVN::processInstruction(Instruction *I) { Value *SwitchCond = SI->getCondition(); BasicBlock *Parent = SI->getParent(); bool Changed = false; - for (unsigned i = 0, e = SI->getNumCases(); i != e; ++i) { - BasicBlock *Dst = SI->getCaseSuccessor(i); + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); + i != e; ++i) { + BasicBlock *Dst = i.getCaseSuccessor(); if (isOnlyReachableViaThisEdge(Parent, Dst, DT)) - Changed |= propagateEquality(SwitchCond, SI->getCaseValue(i), Dst); + Changed |= propagateEquality(SwitchCond, i.getCaseValue(), Dst); } return Changed; } diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index d1e57e1..490617a 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -450,8 +450,10 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { } // Add a new IVUsers entry for the newly-created integer PHI. - if (IU) - IU->AddUsersIfInteresting(NewPHI); + if (IU) { + SmallPtrSet<Loop*, 16> SimplifiedLoopNests; + IU->AddUsersIfInteresting(NewPHI, SimplifiedLoopNests); + } Changed = true; } @@ -1967,8 +1969,11 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // loop exit test instruction. if (IU && NewICmp) { ICmpInst *NewICmpInst = dyn_cast<ICmpInst>(NewICmp); - if (NewICmpInst) - IU->AddUsersIfInteresting(cast<Instruction>(NewICmpInst->getOperand(0))); + if (NewICmpInst) { + SmallPtrSet<Loop*, 16> SimplifiedLoopNests; + IU->AddUsersIfInteresting(cast<Instruction>(NewICmpInst->getOperand(0)), + SimplifiedLoopNests); + } } // Clean up dead instructions. Changed |= DeleteDeadPHIs(L->getHeader()); diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index fa25a8f..429b61b 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -857,6 +857,9 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { if (BBIt != LoadBB->begin()) return false; + // If all of the loads and stores that feed the value have the same TBAA tag, + // then we can propagate it onto any newly inserted loads. 
+ MDNode *TBAATag = LI->getMetadata(LLVMContext::MD_tbaa); SmallPtrSet<BasicBlock*, 8> PredsScanned; typedef SmallVector<std::pair<BasicBlock*, Value*>, 8> AvailablePredsTy; @@ -875,11 +878,16 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // Scan the predecessor to see if the value is available in the pred. BBIt = PredBB->end(); - Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt, 6); + MDNode *ThisTBAATag = 0; + Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt, 6, + 0, &ThisTBAATag); if (!PredAvailable) { OneUnavailablePred = PredBB; continue; } + + // If tbaa tags disagree or are not present, forget about them. + if (TBAATag != ThisTBAATag) TBAATag = 0; // If so, this load is partially redundant. Remember this info so that we // can create a PHI node. @@ -939,6 +947,9 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { LI->getAlignment(), UnavailablePred->getTerminator()); NewVal->setDebugLoc(LI->getDebugLoc()); + if (TBAATag) + NewVal->setMetadata(LLVMContext::MD_tbaa, TBAATag); + AvailablePreds.push_back(std::make_pair(UnavailablePred, NewVal)); } @@ -1087,8 +1098,7 @@ bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) DestBB = BI->getSuccessor(cast<ConstantInt>(Val)->isZero()); else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) { - unsigned ValCase = SI->findCaseValue(cast<ConstantInt>(Val)); - DestBB = SI->getSuccessor(SI->resolveSuccessorIndex(ValCase)); + DestBB = SI->findCaseValue(cast<ConstantInt>(Val)).getCaseSuccessor(); } else { assert(isa<IndirectBrInst>(BB->getTerminator()) && "Unexpected terminator"); diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 6768860..82d918e 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -4534,22 +4534,25 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) if (!L->isLoopSimplifyForm()) return; + // If there's no interesting work to be done, bail early. + if (IU.empty()) return; + +#ifndef NDEBUG // All dominating loops must have preheaders, or SCEVExpander may not be able // to materialize an AddRecExpr whose Start is an outer AddRecExpr. // - // FIXME: This is a little absurd. I think LoopSimplify should be taught - // to create a preheader under any circumstance. + // IVUsers analysis should only create users that are dominated by simple loop + // headers. Since this loop should dominate all of its users, its user list + // should be empty if this loop itself is not within a simple loop nest. for (DomTreeNode *Rung = DT.getNode(L->getLoopPreheader()); Rung; Rung = Rung->getIDom()) { BasicBlock *BB = Rung->getBlock(); const Loop *DomLoop = LI.getLoopFor(BB); if (DomLoop && DomLoop->getHeader() == BB) { - if (!DomLoop->getLoopPreheader()) - return; + assert(DomLoop->getLoopPreheader() && "LSR needs a simplified loop nest"); } } - // If there's no interesting work to be done, bail early. 
- if (IU.empty()) return; +#endif // DEBUG DEBUG(dbgs() << "\nLSR on loop "; WriteAsOperand(dbgs(), L->getHeader(), /*PrintType=*/false); diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index 2c75f63..053eb0c 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -32,7 +32,7 @@ #include "llvm/DerivedTypes.h" #include "llvm/Function.h" #include "llvm/Instructions.h" -#include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" @@ -445,8 +445,9 @@ bool LoopUnswitch::processCurrentLoop() { // Do not process same value again and again. // At this point we have some cases already unswitched and // some not yet unswitched. Let's find the first not yet unswitched one. - for (unsigned i = 0; i < NumCases; ++i) { - Constant* UnswitchValCandidate = SI->getCaseValue(i); + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); + i != e; ++i) { + Constant* UnswitchValCandidate = i.getCaseValue(); if (!BranchesInfo.isUnswitched(SI, UnswitchValCandidate)) { UnswitchVal = UnswitchValCandidate; break; @@ -574,12 +575,13 @@ bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val, // this. // Note that we can't trivially unswitch on the default case or // on already unswitched cases. - for (unsigned i = 0, e = SI->getNumCases(); i != e; ++i) { + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); + i != e; ++i) { BasicBlock* LoopExitCandidate; if ((LoopExitCandidate = isTrivialLoopExitBlock(currentLoop, - SI->getCaseSuccessor(i)))) { + i.getCaseSuccessor()))) { // Okay, we found a trivial case, remember the value that is trivial. - ConstantInt* CaseVal = SI->getCaseValue(i); + ConstantInt* CaseVal = i.getCaseValue(); // Check that it was not unswitched before, since already unswitched // trivial vals are looks trivial too. @@ -1117,16 +1119,16 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, SwitchInst *SI = dyn_cast<SwitchInst>(U); if (SI == 0 || !isa<ConstantInt>(Val)) continue; - unsigned DeadCase = SI->findCaseValue(cast<ConstantInt>(Val)); + SwitchInst::CaseIt DeadCase = SI->findCaseValue(cast<ConstantInt>(Val)); // Default case is live for multiple values. - if (DeadCase == SwitchInst::ErrorIndex) continue; + if (DeadCase == SI->case_default()) continue; // Found a dead case value. Don't remove PHI nodes in the // successor if they become single-entry, those PHI nodes may // be in the Users list. BasicBlock *Switch = SI->getParent(); - BasicBlock *SISucc = SI->getCaseSuccessor(DeadCase); + BasicBlock *SISucc = DeadCase.getCaseSuccessor(); BasicBlock *Latch = L->getLoopLatch(); BranchesInfo.setUnswitched(SI, Val); @@ -1146,7 +1148,7 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, // Compute the successors instead of relying on the return value // of SplitEdge, since it may have split the switch successor // after PHI nodes. - BasicBlock *NewSISucc = SI->getCaseSuccessor(DeadCase); + BasicBlock *NewSISucc = DeadCase.getCaseSuccessor(); BasicBlock *OldSISucc = *succ_begin(NewSISucc); // Create an "unreachable" destination. 
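The LoopUnswitch hunk above, and later ones in SCCP, SimplifyCFG and LowerExpectIntrinsic, rely on the new miss convention: findCaseValue returns a CaseIt, and a value with no explicit case compares equal to case_default() rather than yielding the old ErrorIndex. A small sketch of that lookup, with the helper name being illustrative:

#include "llvm/Instructions.h"
using namespace llvm;

// Resolve the block a switch transfers to for a known constant condition.
static BasicBlock *destinationFor(SwitchInst *SI, ConstantInt *Val) {
  SwitchInst::CaseIt It = SI->findCaseValue(Val);
  if (It == SI->case_default())
    return SI->getDefaultDest();   // no explicit case covers Val
  return It.getCaseSuccessor();    // target of the matching case
}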
BasicBlock *Abort = BasicBlock::Create(Context, "us-unreachable", diff --git a/lib/Transforms/Scalar/ObjCARC.cpp b/lib/Transforms/Scalar/ObjCARC.cpp index 1c7f036..9fdea8d 100644 --- a/lib/Transforms/Scalar/ObjCARC.cpp +++ b/lib/Transforms/Scalar/ObjCARC.cpp @@ -2929,11 +2929,17 @@ ComputePostOrders(Function &F, Visited.clear(); // Compute the exits, which are the starting points for reverse-CFG DFS. + // This includes blocks where all the successors are backedges that + // we're skipping. SmallVector<BasicBlock *, 4> Exits; for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { BasicBlock *BB = I; - if (cast<TerminatorInst>(&BB->back())->getNumSuccessors() == 0) - Exits.push_back(BB); + TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); + for (succ_iterator SI(TI), SE(TI, true); SI != SE; ++SI) + if (!Backedges.count(std::make_pair(BB, *SI))) + goto HasNonBackedgeSucc; + Exits.push_back(BB); + HasNonBackedgeSucc:; } // Do reverse-CFG DFS, computing the reverse-CFG PostOrder. @@ -3035,7 +3041,8 @@ void ObjCARCOpt::MoveCalls(Value *Arg, // but our releases will never depend on it, because they must be // paired with retains from before the invoke. InsertPts[0] = II->getNormalDest()->getFirstInsertionPt(); - InsertPts[1] = II->getUnwindDest()->getFirstInsertionPt(); + if (!II->getMetadata(NoObjCARCExceptionsMDKind)) + InsertPts[1] = II->getUnwindDest()->getFirstInsertionPt(); } else { // Insert code immediately after the last use. InsertPts[0] = llvm::next(BasicBlock::iterator(LastUse)); @@ -4017,36 +4024,40 @@ bool ObjCARCContract::runOnFunction(Function &F) { Use &U = UI.getUse(); unsigned OperandNo = UI.getOperandNo(); ++UI; // Increment UI now, because we may unlink its element. - if (Instruction *UserInst = dyn_cast<Instruction>(U.getUser())) - if (Inst != UserInst && DT->dominates(Inst, UserInst)) { - Changed = true; - Instruction *Replacement = Inst; - Type *UseTy = U.get()->getType(); - if (PHINode *PHI = dyn_cast<PHINode>(UserInst)) { - // For PHI nodes, insert the bitcast in the predecessor block. - unsigned ValNo = - PHINode::getIncomingValueNumForOperand(OperandNo); - BasicBlock *BB = - PHI->getIncomingBlock(ValNo); - if (Replacement->getType() != UseTy) - Replacement = new BitCastInst(Replacement, UseTy, "", - &BB->back()); - for (unsigned i = 0, e = PHI->getNumIncomingValues(); - i != e; ++i) - if (PHI->getIncomingBlock(i) == BB) { - // Keep the UI iterator valid. - if (&PHI->getOperandUse( - PHINode::getOperandNumForIncomingValue(i)) == - &UI.getUse()) - ++UI; - PHI->setIncomingValue(i, Replacement); - } - } else { - if (Replacement->getType() != UseTy) - Replacement = new BitCastInst(Replacement, UseTy, "", UserInst); - U.set(Replacement); - } + Instruction *UserInst = dyn_cast<Instruction>(U.getUser()); + if (!UserInst) + continue; + // FIXME: dominates should return true for unreachable UserInst. + if (!DT->isReachableFromEntry(UserInst->getParent()) || + DT->dominates(Inst, UserInst)) { + Changed = true; + Instruction *Replacement = Inst; + Type *UseTy = U.get()->getType(); + if (PHINode *PHI = dyn_cast<PHINode>(UserInst)) { + // For PHI nodes, insert the bitcast in the predecessor block. 
+ unsigned ValNo = + PHINode::getIncomingValueNumForOperand(OperandNo); + BasicBlock *BB = + PHI->getIncomingBlock(ValNo); + if (Replacement->getType() != UseTy) + Replacement = new BitCastInst(Replacement, UseTy, "", + &BB->back()); + for (unsigned i = 0, e = PHI->getNumIncomingValues(); + i != e; ++i) + if (PHI->getIncomingBlock(i) == BB) { + // Keep the UI iterator valid. + if (&PHI->getOperandUse( + PHINode::getOperandNumForIncomingValue(i)) == + &UI.getUse()) + ++UI; + PHI->setIncomingValue(i, Replacement); + } + } else { + if (Replacement->getType() != UseTy) + Replacement = new BitCastInst(Replacement, UseTy, "", UserInst); + U.set(Replacement); } + } } // If Arg is a no-op casted pointer, strip one level of casts and diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 4274b50..5ce82b9 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -564,7 +564,7 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI, return; } - Succs[SI->resolveSuccessorIndex(SI->findCaseValue(CI))] = true; + Succs[SI->findCaseValue(CI).getSuccessorIndex()] = true; return; } @@ -623,14 +623,7 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { if (CI == 0) return !SCValue.isUndefined(); - // Make sure to skip the "default value" which isn't a value - for (unsigned i = 0, E = SI->getNumCases(); i != E; ++i) - if (SI->getCaseValue(i) == CI) // Found the taken branch. - return SI->getCaseSuccessor(i) == To; - - // If the constant value is not equal to any of the branches, we must - // execute default branch. - return SI->getDefaultDest() == To; + return SI->findCaseValue(CI).getCaseSuccessor() == To; } // Just mark all destinations executable! @@ -1495,12 +1488,12 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // If the input to SCCP is actually switch on undef, fix the undef to // the first constant. if (isa<UndefValue>(SI->getCondition())) { - SI->setCondition(SI->getCaseValue(0)); - markEdgeExecutable(BB, SI->getCaseSuccessor(0)); + SI->setCondition(SI->case_begin().getCaseValue()); + markEdgeExecutable(BB, SI->case_begin().getCaseSuccessor()); return true; } - markForcedConstant(SI->getCondition(), SI->getCaseValue(0)); + markForcedConstant(SI->getCondition(), SI->case_begin().getCaseValue()); return true; } } diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp index d23263f..d36a18f 100644 --- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -574,8 +574,9 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, // transform it into a store of the expanded constant value. if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) { assert(MSI->getRawDest() == Ptr && "Consistency error!"); - unsigned NumBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue(); - if (NumBytes != 0) { + signed SNumBytes = cast<ConstantInt>(MSI->getLength())->getSExtValue(); + if (SNumBytes > 0) { + unsigned NumBytes = static_cast<unsigned>(SNumBytes); unsigned Val = cast<ConstantInt>(MSI->getValue())->getZExtValue(); // Compute the value replicated the right number of times. 
@@ -1517,6 +1518,9 @@ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset, ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength()); if (Length == 0) return MarkUnsafe(Info, User); + if (Length->isNegative()) + return MarkUnsafe(Info, User); + isSafeMemAccess(Offset, Length->getZExtValue(), 0, UI.getOperandNo() == 0, Info, MI, true /*AllowWholeAccess*/); diff --git a/lib/Transforms/Utils/BasicInliner.cpp b/lib/Transforms/Utils/BasicInliner.cpp deleted file mode 100644 index 50c91b6..0000000 --- a/lib/Transforms/Utils/BasicInliner.cpp +++ /dev/null @@ -1,182 +0,0 @@ -//===- BasicInliner.cpp - Basic function level inliner --------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines a simple function based inliner that does not use -// call graph information. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "basicinliner" -#include "llvm/Module.h" -#include "llvm/Function.h" -#include "llvm/Transforms/Utils/BasicInliner.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Support/CallSite.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/SmallPtrSet.h" -#include <vector> - -using namespace llvm; - -static cl::opt<unsigned> -BasicInlineThreshold("basic-inline-threshold", cl::Hidden, cl::init(200), - cl::desc("Control the amount of basic inlining to perform (default = 200)")); - -namespace llvm { - - /// BasicInlinerImpl - BasicInliner implemantation class. This hides - /// container info, used by basic inliner, from public interface. - struct BasicInlinerImpl { - - BasicInlinerImpl(const BasicInlinerImpl&); // DO NOT IMPLEMENT - void operator=(const BasicInlinerImpl&); // DO NO IMPLEMENT - public: - BasicInlinerImpl(TargetData *T) : TD(T) {} - - /// addFunction - Add function into the list of functions to process. - /// All functions must be inserted using this interface before invoking - /// inlineFunctions(). - void addFunction(Function *F) { - Functions.push_back(F); - } - - /// neverInlineFunction - Sometimes a function is never to be inlined - /// because of one or other reason. - void neverInlineFunction(Function *F) { - NeverInline.insert(F); - } - - /// inlineFuctions - Walk all call sites in all functions supplied by - /// client. Inline as many call sites as possible. Delete completely - /// inlined functions. - void inlineFunctions(); - - private: - TargetData *TD; - std::vector<Function *> Functions; - SmallPtrSet<const Function *, 16> NeverInline; - SmallPtrSet<Function *, 8> DeadFunctions; - InlineCostAnalyzer CA; - }; - -/// inlineFuctions - Walk all call sites in all functions supplied by -/// client. Inline as many call sites as possible. Delete completely -/// inlined functions. -void BasicInlinerImpl::inlineFunctions() { - - // Scan through and identify all call sites ahead of time so that we only - // inline call sites in the original functions, not call sites that result - // from inlining other functions. 
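The two ScalarReplAggregates hunks above read the memset length as a signed quantity so that a malformed negative constant is rejected instead of being treated as an enormous unsigned byte count; a standalone illustration with plain integers standing in for ConstantInt:

#include <cstdint>
#include <cstdio>

int main() {
  int32_t BogusLength = -1;
  uint64_t AsUnsigned = (uint64_t)(uint32_t)BogusLength;  // 4294967295 bytes
  int64_t AsSigned = (int64_t)BogusLength;                // -1, easy to reject
  std::printf("zero-extended: %llu  sign-extended: %lld\n",
              (unsigned long long)AsUnsigned, (long long)AsSigned);
  return 0;
}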
- std::vector<CallSite> CallSites; - - for (std::vector<Function *>::iterator FI = Functions.begin(), - FE = Functions.end(); FI != FE; ++FI) { - Function *F = *FI; - for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) - for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) { - CallSite CS(cast<Value>(I)); - if (CS && CS.getCalledFunction() - && !CS.getCalledFunction()->isDeclaration()) - CallSites.push_back(CS); - } - } - - DEBUG(dbgs() << ": " << CallSites.size() << " call sites.\n"); - - // Inline call sites. - bool Changed = false; - do { - Changed = false; - for (unsigned index = 0; index != CallSites.size() && !CallSites.empty(); - ++index) { - CallSite CS = CallSites[index]; - if (Function *Callee = CS.getCalledFunction()) { - - // Eliminate calls that are never inlinable. - if (Callee->isDeclaration() || - CS.getInstruction()->getParent()->getParent() == Callee) { - CallSites.erase(CallSites.begin() + index); - --index; - continue; - } - InlineCost IC = CA.getInlineCost(CS, NeverInline); - if (IC.isAlways()) { - DEBUG(dbgs() << " Inlining: cost=always" - <<", call: " << *CS.getInstruction()); - } else if (IC.isNever()) { - DEBUG(dbgs() << " NOT Inlining: cost=never" - <<", call: " << *CS.getInstruction()); - continue; - } else { - int Cost = IC.getValue(); - - if (Cost >= (int) BasicInlineThreshold) { - DEBUG(dbgs() << " NOT Inlining: cost = " << Cost - << ", call: " << *CS.getInstruction()); - continue; - } else { - DEBUG(dbgs() << " Inlining: cost = " << Cost - << ", call: " << *CS.getInstruction()); - } - } - - // Inline - InlineFunctionInfo IFI(0, TD); - if (InlineFunction(CS, IFI)) { - Callee->removeDeadConstantUsers(); - if (Callee->isDefTriviallyDead()) - DeadFunctions.insert(Callee); - Changed = true; - CallSites.erase(CallSites.begin() + index); - --index; - } - } - } - } while (Changed); - - // Remove completely inlined functions from module. - for(SmallPtrSet<Function *, 8>::iterator I = DeadFunctions.begin(), - E = DeadFunctions.end(); I != E; ++I) { - Function *D = *I; - Module *M = D->getParent(); - M->getFunctionList().remove(D); - } -} - -BasicInliner::BasicInliner(TargetData *TD) { - Impl = new BasicInlinerImpl(TD); -} - -BasicInliner::~BasicInliner() { - delete Impl; -} - -/// addFunction - Add function into the list of functions to process. -/// All functions must be inserted using this interface before invoking -/// inlineFunctions(). -void BasicInliner::addFunction(Function *F) { - Impl->addFunction(F); -} - -/// neverInlineFunction - Sometimes a function is never to be inlined because -/// of one or other reason. -void BasicInliner::neverInlineFunction(Function *F) { - Impl->neverInlineFunction(F); -} - -/// inlineFuctions - Walk all call sites in all functions supplied by -/// client. Inline as many call sites as possible. Delete completely -/// inlined functions. 
-void BasicInliner::inlineFunctions() { - Impl->inlineFunctions(); -} - -} diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt index d1aa599..7f5cb5e 100644 --- a/lib/Transforms/Utils/CMakeLists.txt +++ b/lib/Transforms/Utils/CMakeLists.txt @@ -1,7 +1,6 @@ add_llvm_library(LLVMTransformUtils AddrModeMatcher.cpp BasicBlockUtils.cpp - BasicInliner.cpp BreakCriticalEdges.cpp BuildLibCalls.cpp CloneFunction.cpp diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index 04ef7d7..1b28c35 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -313,8 +313,8 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, Cond = dyn_cast_or_null<ConstantInt>(V); } if (Cond) { // Constant fold to uncond branch! - unsigned CaseIndex = SI->findCaseValue(Cond); - BasicBlock *Dest = SI->getSuccessor(SI->resolveSuccessorIndex(CaseIndex)); + SwitchInst::ConstCaseIt Case = SI->findCaseValue(Cond); + BasicBlock *Dest = const_cast<BasicBlock*>(Case.getCaseSuccessor()); VMap[OldTI] = BranchInst::Create(Dest, NewBB); ToClone.push_back(Dest); TerminatorDone = true; @@ -424,10 +424,6 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, // BasicBlock::iterator I = NewBB->begin(); - DebugLoc TheCallDL; - if (TheCall) - TheCallDL = TheCall->getDebugLoc(); - // Handle PHI nodes specially, as we have to remove references to dead // blocks. if (PHINode *PN = dyn_cast<PHINode>(I)) { diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp index 429919b..e8c0b80 100644 --- a/lib/Transforms/Utils/CodeExtractor.cpp +++ b/lib/Transforms/Utils/CodeExtractor.cpp @@ -617,7 +617,8 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, // of the other successors. TheSwitch->setCondition(call); TheSwitch->setDefaultDest(TheSwitch->getSuccessor(NumExitBlocks)); - TheSwitch->removeCase(NumExitBlocks-1); // Remove redundant case + // Remove redundant case + TheSwitch->removeCase(SwitchInst::CaseIt(TheSwitch, NumExitBlocks-1)); break; } } diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 336d8f6..5f895eb 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -106,31 +106,32 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions) { // If we are switching on a constant, we can convert the switch into a // single branch instruction! ConstantInt *CI = dyn_cast<ConstantInt>(SI->getCondition()); - BasicBlock *TheOnlyDest = SI->getDefaultDest(); // The default dest + BasicBlock *TheOnlyDest = SI->getDefaultDest(); BasicBlock *DefaultDest = TheOnlyDest; // Figure out which case it goes to. - for (unsigned i = 0, e = SI->getNumCases(); i != e; ++i) { + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); + i != e; ++i) { // Found case matching a constant operand? - if (SI->getCaseValue(i) == CI) { - TheOnlyDest = SI->getCaseSuccessor(i); + if (i.getCaseValue() == CI) { + TheOnlyDest = i.getCaseSuccessor(); break; } // Check to see if this branch is going to the same place as the default // dest. If so, eliminate it as an explicit compare. - if (SI->getCaseSuccessor(i) == DefaultDest) { + if (i.getCaseSuccessor() == DefaultDest) { // Remove this entry. DefaultDest->removePredecessor(SI->getParent()); SI->removeCase(i); - --i; --e; // Don't skip an entry... 
+ --i; --e; continue; } // Otherwise, check to see if the switch only branches to one destination. // We do this by reseting "TheOnlyDest" to null when we find two non-equal // destinations. - if (SI->getCaseSuccessor(i) != TheOnlyDest) TheOnlyDest = 0; + if (i.getCaseSuccessor() != TheOnlyDest) TheOnlyDest = 0; } if (CI && !TheOnlyDest) { @@ -167,11 +168,13 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions) { if (SI->getNumCases() == 1) { // Otherwise, we can fold this switch into a conditional branch // instruction if it has only one non-default destination. + SwitchInst::CaseIt FirstCase = SI->case_begin(); Value *Cond = Builder.CreateICmpEQ(SI->getCondition(), - SI->getCaseValue(0), "cond"); + FirstCase.getCaseValue(), "cond"); // Insert the new branch. - Builder.CreateCondBr(Cond, SI->getCaseSuccessor(0), SI->getDefaultDest()); + Builder.CreateCondBr(Cond, FirstCase.getCaseSuccessor(), + SI->getDefaultDest()); // Delete the old switch. SI->eraseFromParent(); diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp index 4376265..0bc185d 100644 --- a/lib/Transforms/Utils/LoopSimplify.cpp +++ b/lib/Transforms/Utils/LoopSimplify.cpp @@ -386,7 +386,7 @@ BasicBlock *LoopSimplify::InsertPreheaderForLoop(Loop *L) { this); } else { SmallVector<BasicBlock*, 2> NewBBs; - SplitLandingPadPredecessors(Header, OutsideBlocks, ".preheader", + SplitLandingPadPredecessors(Header, OutsideBlocks, ".preheader", ".split-lp", this, NewBBs); PreheaderBB = NewBBs[0]; } @@ -538,8 +538,7 @@ void LoopSimplify::PlaceSplitBlockCarefully(BasicBlock *NewBB, /// Loop *LoopSimplify::SeparateNestedLoop(Loop *L, LPPassManager &LPM, BasicBlock *Preheader) { - // Don't try to separate loops without a preheader (this excludes - // loop headers which are targeted by an indirectbr). + // Don't try to separate loops without a preheader. if (!Preheader) return 0; @@ -554,11 +553,15 @@ Loop *LoopSimplify::SeparateNestedLoop(Loop *L, LPPassManager &LPM, // handles the case when a PHI node has multiple instances of itself as // arguments. SmallVector<BasicBlock*, 8> OuterLoopPreds; - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { if (PN->getIncomingValue(i) != PN || - !L->contains(PN->getIncomingBlock(i))) + !L->contains(PN->getIncomingBlock(i))) { + // We can't split indirectbr edges. + if (isa<IndirectBrInst>(PN->getIncomingBlock(i)->getTerminator())) + return 0; OuterLoopPreds.push_back(PN->getIncomingBlock(i)); - + } + } DEBUG(dbgs() << "LoopSimplify: Splitting out a new outer loop\n"); // If ScalarEvolution is around and knows anything about values in diff --git a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp index df8d68e..c70ced1 100644 --- a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp +++ b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp @@ -73,16 +73,16 @@ bool LowerExpectIntrinsic::HandleSwitchExpect(SwitchInst *SI) { LLVMContext &Context = CI->getContext(); Type *Int32Ty = Type::getInt32Ty(Context); - unsigned caseNo = SI->findCaseValue(ExpectedValue); + SwitchInst::CaseIt Case = SI->findCaseValue(ExpectedValue); std::vector<Value *> Vec; unsigned n = SI->getNumCases(); Vec.resize(n + 1 + 1); // +1 for MDString and +1 for default case Vec[0] = MDString::get(Context, "branch_weights"); - Vec[1] = ConstantInt::get(Int32Ty, SwitchInst::ErrorIndex == caseNo ? + Vec[1] = ConstantInt::get(Int32Ty, Case == SI->case_default() ? 
LikelyBranchWeight : UnlikelyBranchWeight); for (unsigned i = 0; i < n; ++i) { - Vec[i + 1 + 1] = ConstantInt::get(Int32Ty, i == caseNo ? + Vec[i + 1 + 1] = ConstantInt::get(Int32Ty, i == Case.getCaseIndex() ? LikelyBranchWeight : UnlikelyBranchWeight); } diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp index 424f564..a16130d 100644 --- a/lib/Transforms/Utils/LowerSwitch.cpp +++ b/lib/Transforms/Utils/LowerSwitch.cpp @@ -237,10 +237,10 @@ unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) { unsigned numCmps = 0; // Start with "simple" cases - for (unsigned i = 0; i < SI->getNumCases(); ++i) - Cases.push_back(CaseRange(SI->getCaseValue(i), - SI->getCaseValue(i), - SI->getCaseSuccessor(i))); + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) + Cases.push_back(CaseRange(i.getCaseValue(), i.getCaseValue(), + i.getCaseSuccessor())); + std::sort(Cases.begin(), Cases.end(), CaseCmp()); // Merge case into clusters diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index a9853a4..d53a46e 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -480,9 +480,9 @@ GetValueEqualityComparisonCases(TerminatorInst *TI, BasicBlock*> > &Cases) { if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { Cases.reserve(SI->getNumCases()); - for (unsigned i = 0, e = SI->getNumCases(); i != e; ++i) - Cases.push_back(std::make_pair(SI->getCaseValue(i), - SI->getCaseSuccessor(i))); + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) + Cases.push_back(std::make_pair(i.getCaseValue(), + i.getCaseSuccessor())); return SI->getDefaultDest(); } @@ -605,10 +605,10 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator() << "Through successor TI: " << *TI); - for (unsigned i = SI->getNumCases(); i != 0;) { + for (SwitchInst::CaseIt i = SI->case_end(), e = SI->case_begin(); i != e;) { --i; - if (DeadCases.count(SI->getCaseValue(i))) { - SI->getCaseSuccessor(i)->removePredecessor(TI->getParent()); + if (DeadCases.count(i.getCaseValue())) { + i.getCaseSuccessor()->removePredecessor(TI->getParent()); SI->removeCase(i); } } @@ -2009,10 +2009,8 @@ static bool SimplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select) { // Find the relevant condition and destinations. Value *Condition = Select->getCondition(); - unsigned TrueCase = SI->findCaseValue(TrueVal); - unsigned FalseCase = SI->findCaseValue(FalseVal); - BasicBlock *TrueBB = SI->getSuccessor(SI->resolveSuccessorIndex(TrueCase)); - BasicBlock *FalseBB = SI->getSuccessor(SI->resolveSuccessorIndex(FalseCase)); + BasicBlock *TrueBB = SI->findCaseValue(TrueVal).getCaseSuccessor(); + BasicBlock *FalseBB = SI->findCaseValue(FalseVal).getCaseSuccessor(); // Perform the actual simplification. return SimplifyTerminatorOnSelect(SI, Condition, TrueBB, FalseBB); @@ -2096,7 +2094,7 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI, // Ok, the block is reachable from the default dest. If the constant we're // comparing exists in one of the other edges, then we can constant fold ICI // and zap it. 
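The SimplifyCFG hunk above, like the CorrelatedValuePropagation one earlier, walks cases backwards while deleting: iterating from case_end() toward case_begin() lets removeCase() run without disturbing the positions still to be visited, as the in-tree comments note. A sketch of the idiom, where the dead-value predicate is a placeholder:

#include "llvm/BasicBlock.h"
#include "llvm/Instructions.h"
using namespace llvm;

static void removeMatchingCases(SwitchInst *SI, bool (*IsDead)(ConstantInt *)) {
  for (SwitchInst::CaseIt i = SI->case_end(), e = SI->case_begin(); i != e;) {
    --i;
    if (IsDead(i.getCaseValue())) {
      // Keep PHI nodes in the successor consistent before dropping the edge.
      i.getCaseSuccessor()->removePredecessor(SI->getParent());
      SI->removeCase(i);
    }
  }
}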
-  if (SI->findCaseValue(Cst) != SwitchInst::ErrorIndex) {
+  if (SI->findCaseValue(Cst) != SI->case_default()) {
     Value *V;
     if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
       V = ConstantInt::getFalse(BB->getContext());
@@ -2423,8 +2421,9 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
         }
       }
     } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
-      for (unsigned i = 0, e = SI->getNumCases(); i != e; ++i)
-        if (SI->getCaseSuccessor(i) == BB) {
+      for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
+           i != e; ++i)
+        if (i.getCaseSuccessor() == BB) {
           BB->removePredecessor(SI->getParent());
           SI->removeCase(i);
           --i; --e;
@@ -2434,12 +2433,13 @@
       // destination and make it the default.
       if (SI->getDefaultDest() == BB) {
         std::map<BasicBlock*, std::pair<unsigned, unsigned> > Popularity;
-        for (unsigned i = 0, e = SI->getNumCases(); i != e; ++i) {
+        for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
+             i != e; ++i) {
           std::pair<unsigned, unsigned> &entry =
-              Popularity[SI->getCaseSuccessor(i)];
+              Popularity[i.getCaseSuccessor()];
           if (entry.first == 0) {
             entry.first = 1;
-            entry.second = i;
+            entry.second = i.getCaseIndex();
           } else {
             entry.first++;
           }
@@ -2470,8 +2470,9 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
       for (unsigned i = 0; i != MaxPop-1; ++i)
        MaxBlock->removePredecessor(SI->getParent());
 
-      for (unsigned i = 0, e = SI->getNumCases(); i != e; ++i)
-        if (SI->getCaseSuccessor(i) == MaxBlock) {
+      for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
+           i != e; ++i)
+        if (i.getCaseSuccessor() == MaxBlock) {
          SI->removeCase(i);
          --i; --e;
        }
@@ -2517,11 +2518,13 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) {
   // Make sure all cases point to the same destination and gather the values.
   SmallVector<ConstantInt *, 16> Cases;
-  Cases.push_back(SI->getCaseValue(0));
-  for (unsigned I = 1, E = SI->getNumCases(); I != E; ++I) {
-    if (SI->getCaseSuccessor(I-1) != SI->getCaseSuccessor(I))
+  SwitchInst::CaseIt I = SI->case_begin();
+  Cases.push_back(I.getCaseValue());
+  SwitchInst::CaseIt PrevI = I++;
+  for (SwitchInst::CaseIt E = SI->case_end(); I != E; PrevI = I++) {
+    if (PrevI.getCaseSuccessor() != I.getCaseSuccessor())
       return false;
-    Cases.push_back(SI->getCaseValue(I));
+    Cases.push_back(I.getCaseValue());
   }
   assert(Cases.size() == SI->getNumCases() && "Not all cases gathered");
@@ -2539,10 +2542,11 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) {
   if (!Offset->isNullValue())
     Sub = Builder.CreateAdd(Sub, Offset, Sub->getName()+".off");
   Value *Cmp = Builder.CreateICmpULT(Sub, NumCases, "switch");
-  Builder.CreateCondBr(Cmp, SI->getCaseSuccessor(0), SI->getDefaultDest());
+  Builder.CreateCondBr(
+      Cmp, SI->case_begin().getCaseSuccessor(), SI->getDefaultDest());
 
   // Prune obsolete incoming values off the successor's PHI nodes.
-  for (BasicBlock::iterator BBI = SI->getCaseSuccessor(0)->begin();
+  for (BasicBlock::iterator BBI = SI->case_begin().getCaseSuccessor()->begin();
        isa<PHINode>(BBI); ++BBI) {
     for (unsigned I = 0, E = SI->getNumCases()-1; I != E; ++I)
       cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
@@ -2562,22 +2566,22 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI) {
   // Gather dead cases.
   SmallVector<ConstantInt*, 8> DeadCases;
-  for (unsigned I = 0, E = SI->getNumCases(); I != E; ++I) {
-    if ((SI->getCaseValue(I)->getValue() & KnownZero) != 0 ||
-        (SI->getCaseValue(I)->getValue() & KnownOne) != KnownOne) {
-      DeadCases.push_back(SI->getCaseValue(I));
+  for (SwitchInst::CaseIt I = SI->case_begin(), E = SI->case_end(); I != E; ++I) {
+    if ((I.getCaseValue()->getValue() & KnownZero) != 0 ||
+        (I.getCaseValue()->getValue() & KnownOne) != KnownOne) {
+      DeadCases.push_back(I.getCaseValue());
       DEBUG(dbgs() << "SimplifyCFG: switch case '"
-                   << SI->getCaseValue(I)->getValue() << "' is dead.\n");
+                   << I.getCaseValue() << "' is dead.\n");
     }
   }
 
   // Remove dead cases from the switch.
   for (unsigned I = 0, E = DeadCases.size(); I != E; ++I) {
-    unsigned Case = SI->findCaseValue(DeadCases[I]);
-    assert(Case != SwitchInst::ErrorIndex &&
+    SwitchInst::CaseIt Case = SI->findCaseValue(DeadCases[I]);
+    assert(Case != SI->case_default() &&
           "Case was not found. Probably mistake in DeadCases forming.");
     // Prune unused values from PHI nodes.
-    SI->getCaseSuccessor(Case)->removePredecessor(SI->getParent());
+    Case.getCaseSuccessor()->removePredecessor(SI->getParent());
     SI->removeCase(Case);
   }
@@ -2626,9 +2630,9 @@ static bool ForwardSwitchConditionToPHI(SwitchInst *SI) {
   typedef DenseMap<PHINode*, SmallVector<int,4> > ForwardingNodesMap;
   ForwardingNodesMap ForwardingNodes;
 
-  for (unsigned I = 0; I < SI->getNumCases(); ++I) { // 0 is the default case.
-    ConstantInt *CaseValue = SI->getCaseValue(I);
-    BasicBlock *CaseDest = SI->getCaseSuccessor(I);
+  for (SwitchInst::CaseIt I = SI->case_begin(), E = SI->case_end(); I != E; ++I) {
+    ConstantInt *CaseValue = I.getCaseValue();
+    BasicBlock *CaseDest = I.getCaseSuccessor();
 
     int PhiIndex;
     PHINode *PHI = FindPHIForConditionForwarding(CaseValue, CaseDest,
diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp
index 20eef3c..e00565d 100644
--- a/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -231,8 +231,10 @@ void SimplifyIndvar::eliminateIVRemainder(BinaryOperator *Rem,
   // Inform IVUsers about the new users.
   if (IU) {
-    if (Instruction *I = dyn_cast<Instruction>(Rem->getOperand(0)))
-      IU->AddUsersIfInteresting(I);
+    if (Instruction *I = dyn_cast<Instruction>(Rem->getOperand(0))) {
+      SmallPtrSet<Loop*, 16> SimplifiedLoopNests;
+      IU->AddUsersIfInteresting(I, SimplifiedLoopNests);
+    }
   }
 
   DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n');
   ++NumElimRem;
diff --git a/lib/VMCore/AsmWriter.cpp b/lib/VMCore/AsmWriter.cpp
index 6874906..7b39efb 100644
--- a/lib/VMCore/AsmWriter.cpp
+++ b/lib/VMCore/AsmWriter.cpp
@@ -1731,12 +1731,12 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
     Out << ", ";
     writeOperand(SI.getDefaultDest(), true);
     Out << " [";
-    unsigned NumCases = SI.getNumCases();
-    for (unsigned i = 0; i < NumCases; ++i) {
+    for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end();
+         i != e; ++i) {
       Out << "\n ";
-      writeOperand(SI.getCaseValue(i), true);
+      writeOperand(i.getCaseValue(), true);
       Out << ", ";
-      writeOperand(SI.getCaseSuccessor(i), true);
+      writeOperand(i.getCaseSuccessor(), true);
     }
     Out << "\n ]";
   } else if (isa<IndirectBrInst>(I)) {
diff --git a/lib/VMCore/ConstantsContext.h b/lib/VMCore/ConstantsContext.h
index a7277f6..8903a8f 100644
--- a/lib/VMCore/ConstantsContext.h
+++ b/lib/VMCore/ConstantsContext.h
@@ -16,6 +16,7 @@
 #define LLVM_CONSTANTSCONTEXT_H
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Hashing.h"
 #include "llvm/InlineAsm.h"
 #include "llvm/Instructions.h"
 #include "llvm/Operator.h"
@@ -656,48 +657,18 @@ private:
       return ConstantClassInfo::getTombstoneKey();
     }
     static unsigned getHashValue(const ConstantClass *CP) {
-      // This is adapted from SuperFastHash by Paul Hsieh.
-      unsigned Hash = TypeClassInfo::getHashValue(CP->getType());
-      for (unsigned I = 0, E = CP->getNumOperands(); I < E; ++I) {
-        unsigned Data = ConstantInfo::getHashValue(CP->getOperand(I));
-        Hash += Data & 0xFFFF;
-        unsigned Tmp = ((Data >> 16) << 11) ^ Hash;
-        Hash = (Hash << 16) ^ Tmp;
-        Hash += Hash >> 11;
-      }
-
-      // Force "avalanching" of final 127 bits.
-      Hash ^= Hash << 3;
-      Hash += Hash >> 5;
-      Hash ^= Hash << 4;
-      Hash += Hash >> 17;
-      Hash ^= Hash << 25;
-      Hash += Hash >> 6;
-      return Hash;
+      SmallVector<Constant*, 8> CPOperands;
+      CPOperands.reserve(CP->getNumOperands());
+      for (unsigned I = 0, E = CP->getNumOperands(); I < E; ++I)
+        CPOperands.push_back(CP->getOperand(I));
+      return getHashValue(LookupKey(CP->getType(), CPOperands));
     }
     static bool isEqual(const ConstantClass *LHS, const ConstantClass *RHS) {
       return LHS == RHS;
     }
     static unsigned getHashValue(const LookupKey &Val) {
-      // This is adapted from SuperFastHash by Paul Hsieh.
-      unsigned Hash = TypeClassInfo::getHashValue(Val.first);
-      for (Operands::const_iterator
-           I = Val.second.begin(), E = Val.second.end(); I != E; ++I) {
-        unsigned Data = ConstantInfo::getHashValue(*I);
-        Hash += Data & 0xFFFF;
-        unsigned Tmp = ((Data >> 16) << 11) ^ Hash;
-        Hash = (Hash << 16) ^ Tmp;
-        Hash += Hash >> 11;
-      }
-
-      // Force "avalanching" of final 127 bits.
-      Hash ^= Hash << 3;
-      Hash += Hash >> 5;
-      Hash ^= Hash << 4;
-      Hash += Hash >> 17;
-      Hash ^= Hash << 25;
-      Hash += Hash >> 6;
-      return Hash;
+      return hash_combine(Val.first, hash_combine_range(Val.second.begin(),
+                                                         Val.second.end()));
     }
     static bool isEqual(const LookupKey &LHS, const ConstantClass *RHS) {
       if (RHS == getEmptyKey() || RHS == getTombstoneKey())
diff --git a/lib/VMCore/Instructions.cpp b/lib/VMCore/Instructions.cpp
index 11fd5b6..8db6ac9 100644
--- a/lib/VMCore/Instructions.cpp
+++ b/lib/VMCore/Instructions.cpp
@@ -3159,13 +3159,16 @@ void SwitchInst::addCase(ConstantInt *OnVal, BasicBlock *Dest) {
   // Initialize some new operands.
   assert(OpNo+1 < ReservedSpace && "Growing didn't work!");
   NumOperands = OpNo+2;
-  setCaseValue(NewCaseIdx, OnVal);
-  setCaseSuccessor(NewCaseIdx, Dest);
+  CaseIt Case(this, NewCaseIdx);
+  Case.setValue(OnVal);
+  Case.setSuccessor(Dest);
 }
 
 /// removeCase - This method removes the specified case and its successor
 /// from the switch instruction.
-void SwitchInst::removeCase(unsigned idx) {
+void SwitchInst::removeCase(CaseIt i) {
+  unsigned idx = i.getCaseIndex();
+
   assert(2 + idx*2 < getNumOperands() && "Case index out of range!!!");
 
   unsigned NumOps = getNumOperands();
diff --git a/lib/VMCore/Type.cpp b/lib/VMCore/Type.cpp
index 0bc4f74..c6f3558 100644
--- a/lib/VMCore/Type.cpp
+++ b/lib/VMCore/Type.cpp
@@ -185,16 +185,7 @@ bool Type::isSizedDerivedType() const {
   if (!this->isStructTy())
     return false;
 
-  // Opaque structs have no size.
-  if (cast<StructType>(this)->isOpaque())
-    return false;
-
-  // Okay, our struct is sized if all of the elements are.
-  for (subtype_iterator I = subtype_begin(), E = subtype_end(); I != E; ++I)
-    if (!(*I)->isSized())
-      return false;
-
-  return true;
+  return cast<StructType>(this)->isSized();
 }
 
 //===----------------------------------------------------------------------===//
@@ -579,6 +570,26 @@ StructType *StructType::create(StringRef Name, Type *type, ...) {
   return llvm::StructType::create(Ctx, StructFields, Name);
 }
 
+bool StructType::isSized() const {
+  if ((getSubclassData() & SCDB_IsSized) != 0)
+    return true;
+  if (isOpaque())
+    return false;
+
+  // Okay, our struct is sized if all of the elements are, but if one of the
+  // elements is opaque, the struct isn't sized *yet*, but may become sized in
+  // the future, so just bail out without caching.
+  for (element_iterator I = element_begin(), E = element_end(); I != E; ++I)
+    if (!(*I)->isSized())
+      return false;
+
+  // Here we cheat a bit and cast away const-ness. The goal is to memoize when
+  // we find a sized type, as types can only move from opaque to sized, not the
+  // other way.
+  const_cast<StructType*>(this)->setSubclassData(
+    getSubclassData() | SCDB_IsSized);
+  return true;
+}
 
 StringRef StructType::getName() const {
   assert(!isLiteral() && "Literal structs never have names");
diff --git a/lib/VMCore/Value.cpp b/lib/VMCore/Value.cpp
index 207c06d..41cc38c 100644
--- a/lib/VMCore/Value.cpp
+++ b/lib/VMCore/Value.cpp
@@ -317,20 +317,40 @@ void Value::replaceAllUsesWith(Value *New) {
     BB->replaceSuccessorsPhiUsesWith(cast<BasicBlock>(New));
 }
 
-Value *Value::stripPointerCasts() {
-  if (!getType()->isPointerTy())
-    return this;
+namespace {
+// Various metrics for how much to strip off of pointers.
+enum PointerStripKind {
+  PSK_ZeroIndices,
+  PSK_InBoundsConstantIndices,
+  PSK_InBounds
+};
+
+template <PointerStripKind StripKind>
+static Value *stripPointerCastsAndOffsets(Value *V) {
+  if (!V->getType()->isPointerTy())
+    return V;
 
   // Even though we don't look through PHI nodes, we could be called on an
   // instruction in an unreachable block, which may be on a cycle.
   SmallPtrSet<Value *, 4> Visited;
 
-  Value *V = this;
   Visited.insert(V);
   do {
     if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
-      if (!GEP->hasAllZeroIndices())
-        return V;
+      switch (StripKind) {
+      case PSK_ZeroIndices:
+        if (!GEP->hasAllZeroIndices())
+          return V;
+        break;
+      case PSK_InBoundsConstantIndices:
+        if (!GEP->hasAllConstantIndices())
+          return V;
+        // fallthrough
+      case PSK_InBounds:
+        if (!GEP->isInBounds())
+          return V;
+        break;
+      }
       V = GEP->getPointerOperand();
     } else if (Operator::getOpcode(V) == Instruction::BitCast) {
       V = cast<Operator>(V)->getOperand(0);
@@ -346,6 +366,19 @@ Value *Value::stripPointerCasts() {
   return V;
 }
+} // namespace
+
+Value *Value::stripPointerCasts() {
+  return stripPointerCastsAndOffsets<PSK_ZeroIndices>(this);
+}
+
+Value *Value::stripInBoundsConstantOffsets() {
+  return stripPointerCastsAndOffsets<PSK_InBoundsConstantIndices>(this);
+}
+
+Value *Value::stripInBoundsOffsets() {
+  return stripPointerCastsAndOffsets<PSK_InBounds>(this);
+}
 
 /// isDereferenceablePointer - Test if this value is always a pointer to
 /// allocated and suitably aligned memory for a simple load or store.
diff --git a/lib/VMCore/Verifier.cpp b/lib/VMCore/Verifier.cpp
index dcf86d2..5b9b2a5 100644
--- a/lib/VMCore/Verifier.cpp
+++ b/lib/VMCore/Verifier.cpp
@@ -813,11 +813,11 @@ void Verifier::visitSwitchInst(SwitchInst &SI) {
   // have the same type as the switched-on value.
   Type *SwitchTy = SI.getCondition()->getType();
   SmallPtrSet<ConstantInt*, 32> Constants;
-  for (unsigned i = 0, e = SI.getNumCases(); i != e; ++i) {
-    Assert1(SI.getCaseValue(i)->getType() == SwitchTy,
+  for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end(); i != e; ++i) {
+    Assert1(i.getCaseValue()->getType() == SwitchTy,
             "Switch constants must all be same type as switch value!", &SI);
-    Assert2(Constants.insert(SI.getCaseValue(i)),
-            "Duplicate integer as switch case", &SI, SI.getCaseValue(i));
+    Assert2(Constants.insert(i.getCaseValue()),
+            "Duplicate integer as switch case", &SI, i.getCaseValue());
   }
 
   visitTerminatorInst(SI);
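
Editorial note (not part of the patch): the hunks above migrate every caller from the old index-based SwitchInst accessors (getCaseValue(i), getCaseSuccessor(i), the SwitchInst::ErrorIndex sentinel) to the new SwitchInst::CaseIt iterator. The fragment below is a minimal sketch of client code against the post-patch API, using only accessors that appear in this diff (case_begin, case_end, case_default, findCaseValue, getCaseValue, getCaseSuccessor, removeCase, getDefaultDest); the function names removeDeadCases/destinationFor and the DeadCases set are illustrative, not LLVM API, and the include paths assume the 3.1-era header layout touched by this commit.

// Illustrative sketch only -- not from the patch.
#include "llvm/Instructions.h"       // SwitchInst, ConstantInt
#include "llvm/BasicBlock.h"
#include "llvm/ADT/SmallPtrSet.h"

using namespace llvm;

// Drop every case whose value is in DeadCases.  Iterating backwards mirrors
// the SimplifyCFG hunk at @@ -605,10 +605,10 @@ above, so removeCase(i)
// never skips an unvisited case.
static void removeDeadCases(SwitchInst *SI,
                            const SmallPtrSet<ConstantInt*, 8> &DeadCases) {
  for (SwitchInst::CaseIt i = SI->case_end(), e = SI->case_begin(); i != e;) {
    --i;
    if (DeadCases.count(i.getCaseValue())) {
      // Remove the edge from PHI nodes before dropping the case itself.
      i.getCaseSuccessor()->removePredecessor(SI->getParent());
      SI->removeCase(i);
    }
  }
}

// findCaseValue now returns an iterator; case_default() replaces the old
// SwitchInst::ErrorIndex sentinel for "value is not an explicit case".
static BasicBlock *destinationFor(SwitchInst *SI, ConstantInt *CI) {
  SwitchInst::CaseIt It = SI->findCaseValue(CI);
  if (It == SI->case_default())
    return SI->getDefaultDest();
  return It.getCaseSuccessor();
}

Nothing in this sketch changes the patch's behavior; it only restates the new iteration idiom in one self-contained place.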