author | Shih-wei Liao <sliao@google.com> | 2012-04-24 11:26:46 -0700
committer | Shih-wei Liao <sliao@google.com> | 2012-04-24 11:26:46 -0700
commit | cf5a1461acaace0f3e7d11fbbcfbf635b8c8ea9d (patch)
tree | 557137810ae9efc96147d672d372e4dabd0a2440 /lib
parent | 4c8fab82874a29dcd2b242533af3ebe7f66bfd74 (diff)
parent | fc728fbdc2631ce8f343cf9b7292d218fde7419f (diff)
Merge with LLVM upstream r155090.
Conflicts:
lib/Support/Unix/PathV2.inc
Change-Id: I7b89833849f6cbcfa958a33a971d0f7754c9cb2c
Diffstat (limited to 'lib')
389 files changed, 14072 insertions, 10824 deletions
diff --git a/lib/Analysis/CodeMetrics.cpp b/lib/Analysis/CodeMetrics.cpp
index 6c93f78..316e7bc 100644
--- a/lib/Analysis/CodeMetrics.cpp
+++ b/lib/Analysis/CodeMetrics.cpp
@@ -50,6 +50,52 @@ bool llvm::callIsSmall(const Function *F) {
   return false;
 }
 
+bool llvm::isInstructionFree(const Instruction *I, const TargetData *TD) {
+  if (isa<PHINode>(I))
+    return true;
+
+  // If a GEP has all constant indices, it will probably be folded with
+  // a load/store.
+  if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I))
+    return GEP->hasAllConstantIndices();
+
+  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+    switch (II->getIntrinsicID()) {
+    default:
+      return false;
+    case Intrinsic::dbg_declare:
+    case Intrinsic::dbg_value:
+    case Intrinsic::invariant_start:
+    case Intrinsic::invariant_end:
+    case Intrinsic::lifetime_start:
+    case Intrinsic::lifetime_end:
+    case Intrinsic::objectsize:
+    case Intrinsic::ptr_annotation:
+    case Intrinsic::var_annotation:
+      // These intrinsics don't count as size.
+      return true;
+    }
+  }
+
+  if (const CastInst *CI = dyn_cast<CastInst>(I)) {
+    // Noop casts, including ptr <-> int, don't count.
+    if (CI->isLosslessCast() || isa<IntToPtrInst>(CI) || isa<PtrToIntInst>(CI))
+      return true;
+    // trunc to a native type is free (assuming the target has compare and
+    // shift-right of the same width).
+    if (TD && isa<TruncInst>(CI) &&
+        TD->isLegalInteger(TD->getTypeSizeInBits(CI->getType())))
+      return true;
+    // Result of a cmp instruction is often extended (to be used by other
+    // cmp instructions, logical or return instructions). These are usually
+    // nop on most sane targets.
+    if (isa<CmpInst>(CI->getOperand(0)))
+      return true;
+  }
+
+  return false;
+}
+
 /// analyzeBasicBlock - Fill in the current structure with information gleaned
 /// from the specified block.
 void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB,
@@ -58,27 +104,11 @@ void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB,
   unsigned NumInstsBeforeThisBB = NumInsts;
   for (BasicBlock::const_iterator II = BB->begin(), E = BB->end();
        II != E; ++II) {
-    if (isa<PHINode>(II)) continue;           // PHI nodes don't count.
+    if (isInstructionFree(II, TD))
+      continue;
 
     // Special handling for calls.
     if (isa<CallInst>(II) || isa<InvokeInst>(II)) {
-      if (const IntrinsicInst *IntrinsicI = dyn_cast<IntrinsicInst>(II)) {
-        switch (IntrinsicI->getIntrinsicID()) {
-        default: break;
-        case Intrinsic::dbg_declare:
-        case Intrinsic::dbg_value:
-        case Intrinsic::invariant_start:
-        case Intrinsic::invariant_end:
-        case Intrinsic::lifetime_start:
-        case Intrinsic::lifetime_end:
-        case Intrinsic::objectsize:
-        case Intrinsic::ptr_annotation:
-        case Intrinsic::var_annotation:
-          // These intrinsics don't count as size.
-          continue;
-        }
-      }
-
       ImmutableCallSite CS(cast<Instruction>(II));
 
       if (const Function *F = CS.getCalledFunction()) {
@@ -115,28 +145,6 @@ void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB,
     if (isa<ExtractElementInst>(II) || II->getType()->isVectorTy())
       ++NumVectorInsts;
 
-    if (const CastInst *CI = dyn_cast<CastInst>(II)) {
-      // Noop casts, including ptr <-> int, don't count.
-      if (CI->isLosslessCast() || isa<IntToPtrInst>(CI) ||
-          isa<PtrToIntInst>(CI))
-        continue;
-      // trunc to a native type is free (assuming the target has compare and
-      // shift-right of the same width).
-      if (isa<TruncInst>(CI) && TD &&
-          TD->isLegalInteger(TD->getTypeSizeInBits(CI->getType())))
-        continue;
-      // Result of a cmp instruction is often extended (to be used by other
-      // cmp instructions, logical or return instructions). These are usually
-      // nop on most sane targets.
-      if (isa<CmpInst>(CI->getOperand(0)))
-        continue;
-    } else if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(II)){
-      // If a GEP has all constant indices, it will probably be folded with
-      // a load/store.
-      if (GEPI->hasAllConstantIndices())
-        continue;
-    }
-
     ++NumInsts;
   }
diff --git a/lib/Analysis/DIBuilder.cpp b/lib/Analysis/DIBuilder.cpp
index f0bdc48..85913b1 100644
--- a/lib/Analysis/DIBuilder.cpp
+++ b/lib/Analysis/DIBuilder.cpp
@@ -17,6 +17,7 @@
 #include "llvm/IntrinsicInst.h"
 #include "llvm/Module.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/Dwarf.h"
 using namespace llvm;
 
@@ -385,16 +386,21 @@ DIType DIBuilder::createObjCIVar(StringRef Name,
 /// createObjCProperty - Create debugging information entry for Objective-C
 /// property.
-DIObjCProperty DIBuilder::createObjCProperty(StringRef Name,
+DIObjCProperty DIBuilder::createObjCProperty(StringRef Name,
+                                             DIFile File, unsigned LineNumber,
                                              StringRef GetterName,
                                              StringRef SetterName,
-                                             unsigned PropertyAttributes) {
+                                             unsigned PropertyAttributes,
+                                             DIType Ty) {
   Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_APPLE_Property),
+    GetTagConstant(VMContext, dwarf::DW_TAG_APPLE_property),
     MDString::get(VMContext, Name),
+    File,
+    ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
     MDString::get(VMContext, GetterName),
     MDString::get(VMContext, SetterName),
-    ConstantInt::get(Type::getInt32Ty(VMContext), PropertyAttributes)
+    ConstantInt::get(Type::getInt32Ty(VMContext), PropertyAttributes),
+    Ty
   };
   return DIObjCProperty(MDNode::get(VMContext, Elts));
 }
@@ -820,6 +826,7 @@ DISubprogram DIBuilder::createFunction(DIDescriptor Context,
                                        DIFile File, unsigned LineNo,
                                        DIType Ty,
                                        bool isLocalToUnit, bool isDefinition,
+                                       unsigned ScopeLine,
                                        unsigned Flags, bool isOptimized,
                                        Function *Fn,
                                        MDNode *TParams,
@@ -849,7 +856,8 @@ DISubprogram DIBuilder::createFunction(DIDescriptor Context,
     Fn,
     TParams,
     Decl,
-    THolder
+    THolder,
+    ConstantInt::get(Type::getInt32Ty(VMContext), ScopeLine)
   };
   MDNode *Node = MDNode::get(VMContext, Elts);
@@ -897,7 +905,9 @@ DISubprogram DIBuilder::createMethod(DIDescriptor Context,
     Fn,
     TParam,
     llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
-    THolder
+    THolder,
+    // FIXME: Do we want to use different scope lines?
+    ConstantInt::get(Type::getInt32Ty(VMContext), LineNo)
   };
   MDNode *Node = MDNode::get(VMContext, Elts);
   return DISubprogram(Node);
diff --git a/lib/Analysis/DebugInfo.cpp b/lib/Analysis/DebugInfo.cpp
index e30c0a9..f61a8f3 100644
--- a/lib/Analysis/DebugInfo.cpp
+++ b/lib/Analysis/DebugInfo.cpp
@@ -291,7 +291,7 @@ bool DIDescriptor::isEnumerator() const {
 /// isObjCProperty - Return true if the specified tag is DW_TAG
 bool DIDescriptor::isObjCProperty() const {
-  return DbgNode && getTag() == dwarf::DW_TAG_APPLE_Property;
+  return DbgNode && getTag() == dwarf::DW_TAG_APPLE_property;
 }
 //===----------------------------------------------------------------------===//
 // Simple Descriptor Constructors and other Methods
@@ -377,6 +377,19 @@ bool DICompileUnit::Verify() const {
   return true;
 }
 
+/// Verify - Verify that an ObjC property is well formed.
+bool DIObjCProperty::Verify() const {
+  if (!DbgNode)
+    return false;
+  unsigned Tag = getTag();
+  if (Tag != dwarf::DW_TAG_APPLE_property) return false;
+  DIType Ty = getType();
+  if (!Ty.Verify()) return false;
+
+  // Don't worry about the rest of the strings for now.
+  return true;
+}
+
 /// Verify - Verify that a type descriptor is well formed.
 bool DIType::Verify() const {
   if (!DbgNode)
@@ -774,6 +787,9 @@ void DISubprogram::print(raw_ostream &OS) const {
   if (isDefinition())
     OS << " [def] ";
 
+  if (getScopeLineNumber() != getLineNumber())
+    OS << " [Scope: " << getScopeLineNumber() << "] ";
+
   OS << "\n";
 }
diff --git a/lib/Analysis/IPA/GlobalsModRef.cpp b/lib/Analysis/IPA/GlobalsModRef.cpp
index b226d66..c1d8e3e 100644
--- a/lib/Analysis/IPA/GlobalsModRef.cpp
+++ b/lib/Analysis/IPA/GlobalsModRef.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Instructions.h"
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
+#include "llvm/IntrinsicInst.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
@@ -467,6 +468,11 @@ void GlobalsModRef::AnalyzeCallGraph(CallGraph &CG, Module &M) {
         } else if (isMalloc(&cast<Instruction>(*II)) ||
                    isFreeCall(&cast<Instruction>(*II))) {
           FunctionEffect |= ModRef;
+        } else if (IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(&*II)) {
+          // The callgraph doesn't include intrinsic calls.
+          Function *Callee = Intrinsic->getCalledFunction();
+          ModRefBehavior Behaviour = AliasAnalysis::getModRefBehavior(Callee);
+          FunctionEffect |= (Behaviour & ModRef);
         }
 
     if ((FunctionEffect & Mod) == 0)
diff --git a/lib/Analysis/IVUsers.cpp b/lib/Analysis/IVUsers.cpp
index 463584d..b80966b 100644
--- a/lib/Analysis/IVUsers.cpp
+++ b/lib/Analysis/IVUsers.cpp
@@ -107,11 +107,11 @@ static bool isSimplifiedLoopNest(BasicBlock *BB, const DominatorTree *DT,
   return true;
 }
 
-/// AddUsersIfInteresting - Inspect the specified instruction.  If it is a
+/// AddUsersImpl - Inspect the specified instruction.  If it is a
 /// reducible SCEV, recursively add its users to the IVUsesByStride set and
 /// return true.  Otherwise, return false.
-bool IVUsers::AddUsersIfInteresting(Instruction *I,
-                                    SmallPtrSet<Loop*,16> &SimpleLoopNests) {
+bool IVUsers::AddUsersImpl(Instruction *I,
+                           SmallPtrSet<Loop*,16> &SimpleLoopNests) {
   // Add this IV user to the Processed set before returning false to ensure that
   // all IV users are members of the set. See IVUsers::isIVUserOrOperand.
   if (!Processed.insert(I))
@@ -167,13 +167,12 @@ bool IVUsers::AddUsersIfInteresting(Instruction *I,
     bool AddUserToIVUsers = false;
     if (LI->getLoopFor(User->getParent()) != L) {
       if (isa<PHINode>(User) || Processed.count(User) ||
-          !AddUsersIfInteresting(User, SimpleLoopNests)) {
+          !AddUsersImpl(User, SimpleLoopNests)) {
         DEBUG(dbgs() << "FOUND USER in other loop: " << *User << '\n'
                      << "   OF SCEV: " << *ISE << '\n');
         AddUserToIVUsers = true;
       }
-    } else if (Processed.count(User)
-               || !AddUsersIfInteresting(User, SimpleLoopNests)) {
+    } else if (Processed.count(User) || !AddUsersImpl(User, SimpleLoopNests)) {
       DEBUG(dbgs() << "FOUND USER: " << *User << '\n'
                    << "   OF SCEV: " << *ISE << '\n');
       AddUserToIVUsers = true;
@@ -197,6 +196,15 @@ bool IVUsers::AddUsersIfInteresting(Instruction *I,
   return true;
 }
 
+bool IVUsers::AddUsersIfInteresting(Instruction *I) {
+  // SCEVExpander can only handle users that are dominated by simplified loop
+  // entries. Keep track of all loops that are only dominated by other simple
+  // loops so we don't traverse the domtree for each user.
+  SmallPtrSet<Loop*,16> SimpleLoopNests;
+
+  return AddUsersImpl(I, SimpleLoopNests);
+}
+
 IVStrideUse &IVUsers::AddUser(Instruction *User, Value *Operand) {
   IVUses.push_back(new IVStrideUse(this, User, Operand));
   return IVUses.back();
@@ -222,16 +230,11 @@ bool IVUsers::runOnLoop(Loop *l, LPPassManager &LPM) {
   SE = &getAnalysis<ScalarEvolution>();
   TD = getAnalysisIfAvailable<TargetData>();
 
-  // SCEVExpander can only handle users that are dominated by simplified loop
-  // entries. Keep track of all loops that are only dominated by other simple
-  // loops so we don't traverse the domtree for each user.
-  SmallPtrSet<Loop*,16> SimpleLoopNests;
-
   // Find all uses of induction variables in this loop, and categorize
   // them by stride.  Start by finding all of the PHI nodes in the header for
   // this loop.  If they are induction variables, inspect their uses.
   for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I)
-    (void)AddUsersIfInteresting(I, SimpleLoopNests);
+    (void)AddUsersIfInteresting(I);
 
   return false;
 }
diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp
index dedbfeb..3e3d2ab 100644
--- a/lib/Analysis/InlineCost.cpp
+++ b/lib/Analysis/InlineCost.cpp
@@ -11,659 +11,1012 @@
 //
 //===----------------------------------------------------------------------===//
 
+#define DEBUG_TYPE "inline-cost"
 #include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Support/CallSite.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/InstVisitor.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/CallingConv.h"
 #include "llvm/IntrinsicInst.h"
+#include "llvm/Operator.h"
+#include "llvm/GlobalAlias.h"
 #include "llvm/Target/TargetData.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
 
 using namespace llvm;
 
-unsigned InlineCostAnalyzer::FunctionInfo::countCodeReductionForConstant(
-    const CodeMetrics &Metrics, Value *V) {
-  unsigned Reduction = 0;
-  SmallVector<Value *, 4> Worklist;
-  Worklist.push_back(V);
-  do {
-    Value *V = Worklist.pop_back_val();
-    for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;++UI){
-      User *U = *UI;
-      if (isa<BranchInst>(U) || isa<SwitchInst>(U)) {
-        // We will be able to eliminate all but one of the successors.
-        const TerminatorInst &TI = cast<TerminatorInst>(*U);
-        const unsigned NumSucc = TI.getNumSuccessors();
-        unsigned Instrs = 0;
-        for (unsigned I = 0; I != NumSucc; ++I)
-          Instrs += Metrics.NumBBInsts.lookup(TI.getSuccessor(I));
-        // We don't know which blocks will be eliminated, so use the average size.
-        Reduction += InlineConstants::InstrCost*Instrs*(NumSucc-1)/NumSucc;
-        continue;
-      }
+STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
+
+namespace {
+
+class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
+  typedef InstVisitor<CallAnalyzer, bool> Base;
+  friend class InstVisitor<CallAnalyzer, bool>;
+
+  // TargetData if available, or null.
+  const TargetData *const TD;
+
+  // The called function.
+  Function &F;
+
+  int Threshold;
+  int Cost;
+  const bool AlwaysInline;
+
+  bool IsRecursive;
+  bool ExposesReturnsTwice;
+  bool HasDynamicAlloca;
+  unsigned NumInstructions, NumVectorInstructions;
+  int FiftyPercentVectorBonus, TenPercentVectorBonus;
+  int VectorBonus;
+
+  // While we walk the potentially-inlined instructions, we build up and
+  // maintain a mapping of simplified values specific to this callsite. The
+  // idea is to propagate any special information we have about arguments to
+  // this call through the inlinable section of the function, and account for
+  // likely simplifications post-inlining. The most important aspect we track
+  // is CFG altering simplifications -- when we prove a basic block dead, that
+  // can cause dramatic shifts in the cost of inlining a function.
+  DenseMap<Value *, Constant *> SimplifiedValues;
+
+  // Keep track of the values which map back (through function arguments) to
+  // allocas on the caller stack which could be simplified through SROA.
+  DenseMap<Value *, Value *> SROAArgValues;
+
+  // The mapping of caller Alloca values to their accumulated cost savings. If
+  // we have to disable SROA for one of the allocas, this tells us how much
+  // cost must be added.
+  DenseMap<Value *, int> SROAArgCosts;
+
+  // Keep track of values which map to a pointer base and constant offset.
+  DenseMap<Value *, std::pair<Value *, APInt> > ConstantOffsetPtrs;
+
+  // Custom simplification helper routines.
+  bool isAllocaDerivedArg(Value *V);
+  bool lookupSROAArgAndCost(Value *V, Value *&Arg,
+                            DenseMap<Value *, int>::iterator &CostIt);
+  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
+  void disableSROA(Value *V);
+  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
+                          int InstructionCost);
+  bool handleSROACandidate(bool IsSROAValid,
+                           DenseMap<Value *, int>::iterator CostIt,
+                           int InstructionCost);
+  bool isGEPOffsetConstant(GetElementPtrInst &GEP);
+  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
+  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
+
+  // Custom analysis routines.
+  bool analyzeBlock(BasicBlock *BB);
+
+  // Disable several entry points to the visitor so we don't accidentally use
+  // them by declaring but not defining them here.
+  void visit(Module *);     void visit(Module &);
+  void visit(Function *);   void visit(Function &);
+  void visit(BasicBlock *); void visit(BasicBlock &);
+
+  // Provide base case for our instruction visit.
+  bool visitInstruction(Instruction &I);
+
+  // Our visit overrides.
+  bool visitAlloca(AllocaInst &I);
+  bool visitPHI(PHINode &I);
+  bool visitGetElementPtr(GetElementPtrInst &I);
+  bool visitBitCast(BitCastInst &I);
+  bool visitPtrToInt(PtrToIntInst &I);
+  bool visitIntToPtr(IntToPtrInst &I);
+  bool visitCastInst(CastInst &I);
+  bool visitUnaryInstruction(UnaryInstruction &I);
+  bool visitICmp(ICmpInst &I);
+  bool visitSub(BinaryOperator &I);
+  bool visitBinaryOperator(BinaryOperator &I);
+  bool visitLoad(LoadInst &I);
+  bool visitStore(StoreInst &I);
+  bool visitCallSite(CallSite CS);
+
+public:
+  CallAnalyzer(const TargetData *TD, Function &Callee, int Threshold)
+    : TD(TD), F(Callee), Threshold(Threshold), Cost(0),
+      AlwaysInline(F.hasFnAttr(Attribute::AlwaysInline)),
+      IsRecursive(false), ExposesReturnsTwice(false), HasDynamicAlloca(false),
+      NumInstructions(0), NumVectorInstructions(0),
+      FiftyPercentVectorBonus(0), TenPercentVectorBonus(0), VectorBonus(0),
+      NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
+      NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
+      NumInstructionsSimplified(0), SROACostSavings(0), SROACostSavingsLost(0) {
+  }
 
-      // Figure out if this instruction will be removed due to simple constant
-      // propagation.
-      Instruction &Inst = cast<Instruction>(*U);
-
-      // We can't constant propagate instructions which have effects or
-      // read memory.
-      //
-      // FIXME: It would be nice to capture the fact that a load from a
-      // pointer-to-constant-global is actually a *really* good thing to zap.
-      // Unfortunately, we don't know the pointer that may get propagated here,
-      // so we can't make this decision.
-      if (Inst.mayReadFromMemory() || Inst.mayHaveSideEffects() ||
-          isa<AllocaInst>(Inst))
-        continue;
+  bool analyzeCall(CallSite CS);
 
-      bool AllOperandsConstant = true;
-      for (unsigned i = 0, e = Inst.getNumOperands(); i != e; ++i)
-        if (!isa<Constant>(Inst.getOperand(i)) && Inst.getOperand(i) != V) {
-          AllOperandsConstant = false;
-          break;
-        }
-      if (!AllOperandsConstant)
-        continue;
+  int getThreshold() { return Threshold; }
+  int getCost() { return Cost; }
 
-      // We will get to remove this instruction...
-      Reduction += InlineConstants::InstrCost;
+  // Keep a bunch of stats about the cost savings found so we can print them
+  // out when debugging.
+  unsigned NumConstantArgs;
+  unsigned NumConstantOffsetPtrArgs;
+  unsigned NumAllocaArgs;
+  unsigned NumConstantPtrCmps;
+  unsigned NumConstantPtrDiffs;
+  unsigned NumInstructionsSimplified;
+  unsigned SROACostSavings;
+  unsigned SROACostSavingsLost;
 
-      // And any other instructions that use it which become constants
-      // themselves.
-      Worklist.push_back(&Inst);
-    }
-  } while (!Worklist.empty());
-  return Reduction;
-}
+  void dump();
+};
 
-static unsigned countCodeReductionForAllocaICmp(const CodeMetrics &Metrics,
-                                                ICmpInst *ICI) {
-  unsigned Reduction = 0;
+} // namespace
 
-  // Bail if this is comparing against a non-constant; there is nothing we can
-  // do there.
-  if (!isa<Constant>(ICI->getOperand(1)))
-    return Reduction;
+/// \brief Test whether the given value is an Alloca-derived function argument.
+bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
+  return SROAArgValues.count(V);
+}
 
-  // An icmp pred (alloca, C) becomes true if the predicate is true when
-  // equal and false otherwise.
-  bool Result = ICI->isTrueWhenEqual();
+/// \brief Lookup the SROA-candidate argument and cost iterator which V maps to.
+/// Returns false if V does not map to a SROA-candidate.
+bool CallAnalyzer::lookupSROAArgAndCost(
+    Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
+  if (SROAArgValues.empty() || SROAArgCosts.empty())
+    return false;
 
-  SmallVector<Instruction *, 4> Worklist;
-  Worklist.push_back(ICI);
-  do {
-    Instruction *U = Worklist.pop_back_val();
-    Reduction += InlineConstants::InstrCost;
-    for (Value::use_iterator UI = U->use_begin(), UE = U->use_end();
-         UI != UE; ++UI) {
-      Instruction *I = dyn_cast<Instruction>(*UI);
-      if (!I || I->mayHaveSideEffects()) continue;
-      if (I->getNumOperands() == 1)
-        Worklist.push_back(I);
-      if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
-        // If BO produces the same value as U, then the other operand is
-        // irrelevant and we can put it into the Worklist to continue
-        // deleting dead instructions. If BO produces the same value as the
-        // other operand, we can delete BO but that's it.
-        if (Result == true) {
-          if (BO->getOpcode() == Instruction::Or)
-            Worklist.push_back(I);
-          if (BO->getOpcode() == Instruction::And)
-            Reduction += InlineConstants::InstrCost;
-        } else {
-          if (BO->getOpcode() == Instruction::Or ||
-              BO->getOpcode() == Instruction::Xor)
-            Reduction += InlineConstants::InstrCost;
-          if (BO->getOpcode() == Instruction::And)
-            Worklist.push_back(I);
-        }
-      }
-      if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
-        BasicBlock *BB = BI->getSuccessor(Result ? 0 : 1);
-        if (BB->getSinglePredecessor())
-          Reduction
-            += InlineConstants::InstrCost * Metrics.NumBBInsts.lookup(BB);
-      }
-    }
-  } while (!Worklist.empty());
+  DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
+  if (ArgIt == SROAArgValues.end())
+    return false;
 
-  return Reduction;
+  Arg = ArgIt->second;
+  CostIt = SROAArgCosts.find(Arg);
+  return CostIt != SROAArgCosts.end();
 }
 
-/// \brief Compute the reduction possible for a given instruction if we are able
-/// to SROA an alloca.
+/// \brief Disable SROA for the candidate marked by this cost iterator.
 ///
-/// The reduction for this instruction is added to the SROAReduction output
-/// parameter. Returns false if this instruction is expected to defeat SROA in
-/// general.
-static bool countCodeReductionForSROAInst(Instruction *I,
-                                          SmallVectorImpl<Value *> &Worklist,
-                                          unsigned &SROAReduction) {
-  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
-    if (!LI->isSimple())
-      return false;
-    SROAReduction += InlineConstants::InstrCost;
+/// This marks the candidate as no longer viable for SROA, and adds the cost
+/// savings associated with it back into the inline cost measurement.
+void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
+  // If we're no longer able to perform SROA we need to undo its cost savings
+  // and prevent subsequent analysis.
+  Cost += CostIt->second;
+  SROACostSavings -= CostIt->second;
+  SROACostSavingsLost += CostIt->second;
+  SROAArgCosts.erase(CostIt);
+}
+
+/// \brief If 'V' maps to a SROA candidate, disable SROA for it.
+void CallAnalyzer::disableSROA(Value *V) {
+  Value *SROAArg;
+  DenseMap<Value *, int>::iterator CostIt;
+  if (lookupSROAArgAndCost(V, SROAArg, CostIt))
+    disableSROA(CostIt);
+}
+
+/// \brief Accumulate the given cost for a particular SROA candidate.
+void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
+                                      int InstructionCost) {
+  CostIt->second += InstructionCost;
+  SROACostSavings += InstructionCost;
+}
+
+/// \brief Helper for the common pattern of handling a SROA candidate.
+/// Either accumulates the cost savings if the SROA remains valid, or disables
+/// SROA for the candidate.
+bool CallAnalyzer::handleSROACandidate(bool IsSROAValid,
+                                       DenseMap<Value *, int>::iterator CostIt,
+                                       int InstructionCost) {
+  if (IsSROAValid) {
+    accumulateSROACost(CostIt, InstructionCost);
     return true;
   }
 
-  if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
-    if (!SI->isSimple())
+  disableSROA(CostIt);
+  return false;
+}
+
+/// \brief Check whether a GEP's indices are all constant.
+///
+/// Respects any simplified values known during the analysis of this callsite.
+bool CallAnalyzer::isGEPOffsetConstant(GetElementPtrInst &GEP) {
+  for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
+    if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
       return false;
-    SROAReduction += InlineConstants::InstrCost;
-    return true;
-  }
 
+  return true;
+}
+
+/// \brief Accumulate a constant GEP offset into an APInt if possible.
+///
+/// Returns false if unable to compute the offset for any reason. Respects any
+/// simplified values known during the analysis of this callsite.
+bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
+  if (!TD)
+    return false;
+
+  unsigned IntPtrWidth = TD->getPointerSizeInBits();
+  assert(IntPtrWidth == Offset.getBitWidth());
+
+  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
+       GTI != GTE; ++GTI) {
+    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
+    if (!OpC)
+      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
+        OpC = dyn_cast<ConstantInt>(SimpleOp);
+    if (!OpC)
-  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
-    // If the GEP has variable indices, we won't be able to do much with it.
-    if (!GEP->hasAllConstantIndices())
       return false;
-    // A non-zero GEP will likely become a mask operation after SROA.
-    if (GEP->hasAllZeroIndices())
-      SROAReduction += InlineConstants::InstrCost;
-    Worklist.push_back(GEP);
-    return true;
-  }
+    if (OpC->isZero()) continue;
 
-  if (BitCastInst *BCI = dyn_cast<BitCastInst>(I)) {
-    // Track pointer through bitcasts.
-    Worklist.push_back(BCI);
-    SROAReduction += InlineConstants::InstrCost;
-    return true;
+    // Handle a struct index, which adds its field offset to the pointer.
+    if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+      unsigned ElementIdx = OpC->getZExtValue();
+      const StructLayout *SL = TD->getStructLayout(STy);
+      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
+      continue;
+    }
+
+    APInt TypeSize(IntPtrWidth, TD->getTypeAllocSize(GTI.getIndexedType()));
+    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
   }
+  return true;
+}
+
+bool CallAnalyzer::visitAlloca(AllocaInst &I) {
+  // FIXME: Check whether inlining will turn a dynamic alloca into a static
+  // alloca, and handle that case.
+
+  // We will happily inline static alloca instructions or dynamic alloca
+  // instructions in always-inline situations.
+  if (AlwaysInline || I.isStaticAlloca())
+    return Base::visitAlloca(I);
+
+  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
+  // a variety of reasons, and so we would like to not inline them into
+  // functions which don't currently have a dynamic alloca. This simply
+  // disables inlining altogether in the presence of a dynamic alloca.
+  HasDynamicAlloca = true;
+  return false;
+}
 
-  // We just look for non-constant operands to ICmp instructions as those will
-  // defeat SROA. The actual reduction for these happens even without SROA.
-  if (ICmpInst *ICI = dyn_cast<ICmpInst>(I))
-    return isa<Constant>(ICI->getOperand(1));
-
-  if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
-    // SROA can handle a select of alloca iff all uses of the alloca are
-    // loads, and dereferenceable. We assume it's dereferenceable since
-    // we're told the input is an alloca.
-    for (Value::use_iterator UI = SI->use_begin(), UE = SI->use_end();
-         UI != UE; ++UI) {
-      LoadInst *LI = dyn_cast<LoadInst>(*UI);
-      if (LI == 0 || !LI->isSimple())
+bool CallAnalyzer::visitPHI(PHINode &I) {
+  // FIXME: We should potentially be tracking values through phi nodes,
+  // especially when they collapse to a single value due to deleted CFG edges
+  // during inlining.
+
+  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
+  // though we don't want to propagate its bonuses. The idea is to disable
+  // SROA if it *might* be used in an inappropriate manner.
+
+  // Phi nodes are always zero-cost.
+  return true;
+}
+
+bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
+  Value *SROAArg;
+  DenseMap<Value *, int>::iterator CostIt;
+  bool SROACandidate = lookupSROAArgAndCost(I.getPointerOperand(),
+                                            SROAArg, CostIt);
+
+  // Try to fold GEPs of constant-offset call site argument pointers. This
+  // requires target data and inbounds GEPs.
+  if (TD && I.isInBounds()) {
+    // Check if we have a base + offset for the pointer.
+    Value *Ptr = I.getPointerOperand();
+    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Ptr);
+    if (BaseAndOffset.first) {
+      // Check if the offset of this GEP is constant, and if so accumulate it
+      // into Offset.
+      if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second)) {
+        // Non-constant GEPs aren't folded, and disable SROA.
+        if (SROACandidate)
+          disableSROA(CostIt);
         return false;
+      }
+
+      // Add the result as a new mapping to Base + Offset.
+      ConstantOffsetPtrs[&I] = BaseAndOffset;
+
+      // Also handle SROA candidates here, we already know that the GEP is
+      // all-constant indexed.
+      if (SROACandidate)
+        SROAArgValues[&I] = SROAArg;
+
+      return true;
     }
-    // We don't know whether we'll be deleting the rest of the chain of
-    // instructions from the SelectInst on, because we don't know whether
-    // the other side of the select is also an alloca or not.
+  }
+
+  if (isGEPOffsetConstant(I)) {
+    if (SROACandidate)
+      SROAArgValues[&I] = SROAArg;
+
+    // Constant GEPs are modeled as free.
     return true;
   }
 
-  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
-    switch (II->getIntrinsicID()) {
-    default:
-      return false;
-    case Intrinsic::memset:
-    case Intrinsic::memcpy:
-    case Intrinsic::memmove:
-    case Intrinsic::lifetime_start:
-    case Intrinsic::lifetime_end:
-      // SROA can usually chew through these intrinsics.
-      SROAReduction += InlineConstants::InstrCost;
+  // Variable GEPs will require math and will disable SROA.
+  if (SROACandidate)
+    disableSROA(CostIt);
+  return false;
+}
+
+bool CallAnalyzer::visitBitCast(BitCastInst &I) {
+  // Propagate constants through bitcasts.
+  if (Constant *COp = dyn_cast<Constant>(I.getOperand(0)))
+    if (Constant *C = ConstantExpr::getBitCast(COp, I.getType())) {
+      SimplifiedValues[&I] = C;
      return true;
    }
+
+  // Track base/offsets through casts
+  std::pair<Value *, APInt> BaseAndOffset
+    = ConstantOffsetPtrs.lookup(I.getOperand(0));
+  // Casts don't change the offset, just wrap it up.
+  if (BaseAndOffset.first)
+    ConstantOffsetPtrs[&I] = BaseAndOffset;
+
+  // Also look for SROA candidates here.
+  Value *SROAArg;
+  DenseMap<Value *, int>::iterator CostIt;
+  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
+    SROAArgValues[&I] = SROAArg;
+
+  // Bitcasts are always zero cost.
+  return true;
+}
+
+bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
+  // Propagate constants through ptrtoint.
+  if (Constant *COp = dyn_cast<Constant>(I.getOperand(0)))
+    if (Constant *C = ConstantExpr::getPtrToInt(COp, I.getType())) {
+      SimplifiedValues[&I] = C;
+      return true;
+    }
+
+  // Track base/offset pairs when converted to a plain integer provided the
+  // integer is large enough to represent the pointer.
+  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
+  if (TD && IntegerSize >= TD->getPointerSizeInBits()) {
+    std::pair<Value *, APInt> BaseAndOffset
+      = ConstantOffsetPtrs.lookup(I.getOperand(0));
+    if (BaseAndOffset.first)
+      ConstantOffsetPtrs[&I] = BaseAndOffset;
+  }
 
-  // If there is some other strange instruction, we're not going to be
-  // able to do much if we inline this.
-  return false;
+  // This is really weird. Technically, ptrtoint will disable SROA. However,
+  // unless that ptrtoint is *used* somewhere in the live basic blocks after
+  // inlining, it will be nuked, and SROA should proceed. All of the uses which
+  // would block SROA would also block SROA if applied directly to a pointer,
+  // and so we can just add the integer in here. The only places where SROA is
+  // preserved either cannot fire on an integer, or won't in-and-of themselves
+  // disable SROA (ext) w/o some later use that we would see and disable.
+  Value *SROAArg;
+  DenseMap<Value *, int>::iterator CostIt;
+  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
+    SROAArgValues[&I] = SROAArg;
+
+  // A ptrtoint cast is free so long as the result is large enough to store the
+  // pointer, and a legal integer type.
+  return TD && TD->isLegalInteger(IntegerSize) &&
+    IntegerSize >= TD->getPointerSizeInBits();
 }
 
-unsigned InlineCostAnalyzer::FunctionInfo::countCodeReductionForAlloca(
-    const CodeMetrics &Metrics, Value *V) {
-  if (!V->getType()->isPointerTy()) return 0;  // Not a pointer
-  unsigned Reduction = 0;
-  unsigned SROAReduction = 0;
-  bool CanSROAAlloca = true;
+bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
+  // Propagate constants through inttoptr.
+  if (Constant *COp = dyn_cast<Constant>(I.getOperand(0)))
+    if (Constant *C = ConstantExpr::getIntToPtr(COp, I.getType())) {
+      SimplifiedValues[&I] = C;
+      return true;
+    }
 
-  SmallVector<Value *, 4> Worklist;
-  Worklist.push_back(V);
-  do {
-    Value *V = Worklist.pop_back_val();
-    for (Value::use_iterator UI = V->use_begin(), E = V->use_end();
-         UI != E; ++UI){
-      Instruction *I = cast<Instruction>(*UI);
+  // Track base/offset pairs when round-tripped through a pointer without
+  // modifications provided the integer is not too large.
+  Value *Op = I.getOperand(0);
+  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
+  if (TD && IntegerSize <= TD->getPointerSizeInBits()) {
+    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
+    if (BaseAndOffset.first)
+      ConstantOffsetPtrs[&I] = BaseAndOffset;
+  }
 
-      if (ICmpInst *ICI = dyn_cast<ICmpInst>(I))
-        Reduction += countCodeReductionForAllocaICmp(Metrics, ICI);
+  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
+  Value *SROAArg;
+  DenseMap<Value *, int>::iterator CostIt;
+  if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
+    SROAArgValues[&I] = SROAArg;
 
-      if (CanSROAAlloca)
-        CanSROAAlloca = countCodeReductionForSROAInst(I, Worklist,
-                                                      SROAReduction);
+  // An inttoptr cast is free so long as the input is a legal integer type
+  // which doesn't contain values outside the range of a pointer.
+  return TD && TD->isLegalInteger(IntegerSize) &&
+    IntegerSize <= TD->getPointerSizeInBits();
+}
+
+bool CallAnalyzer::visitCastInst(CastInst &I) {
+  // Propagate constants through casts.
+  if (Constant *COp = dyn_cast<Constant>(I.getOperand(0)))
+    if (Constant *C = ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) {
+      SimplifiedValues[&I] = C;
+      return true;
     }
-  } while (!Worklist.empty());
 
-  return Reduction + (CanSROAAlloca ? SROAReduction : 0);
+  // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
+  disableSROA(I.getOperand(0));
+
+  // No-op casts don't have any cost.
+  if (I.isLosslessCast())
+    return true;
+
+  // trunc to a native type is free (assuming the target has compare and
+  // shift-right of the same width).
+  if (TD && isa<TruncInst>(I) &&
+      TD->isLegalInteger(TD->getTypeSizeInBits(I.getType())))
+    return true;
+
+  // Result of a cmp instruction is often extended (to be used by other
+  // cmp instructions, logical or return instructions). These are usually
+  // no-ops on most sane targets.
+  if (isa<CmpInst>(I.getOperand(0)))
+    return true;
+
+  // Assume the rest of the casts require work.
+  return false;
 }
 
-void InlineCostAnalyzer::FunctionInfo::countCodeReductionForPointerPair(
-    const CodeMetrics &Metrics, DenseMap<Value *, unsigned> &PointerArgs,
-    Value *V, unsigned ArgIdx) {
-  SmallVector<Value *, 4> Worklist;
-  Worklist.push_back(V);
-  do {
-    Value *V = Worklist.pop_back_val();
-    for (Value::use_iterator UI = V->use_begin(), E = V->use_end();
-         UI != E; ++UI){
-      Instruction *I = cast<Instruction>(*UI);
-
-      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
-        // If the GEP has variable indices, we won't be able to do much with it.
-        if (!GEP->hasAllConstantIndices())
-          continue;
-        // Unless the GEP is in-bounds, some comparisons will be non-constant.
-        // Fortunately, the real-world cases where this occurs uses in-bounds
-        // GEPs, and so we restrict the optimization to them here.
-        if (!GEP->isInBounds())
-          continue;
+bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
+  Value *Operand = I.getOperand(0);
+  Constant *Ops[1] = { dyn_cast<Constant>(Operand) };
+  if (Ops[0] || (Ops[0] = SimplifiedValues.lookup(Operand)))
+    if (Constant *C = ConstantFoldInstOperands(I.getOpcode(), I.getType(),
+                                               Ops, TD)) {
+      SimplifiedValues[&I] = C;
+      return true;
+    }
 
-        // Constant indices just change the constant offset. Add the resulting
-        // value both to our worklist for this argument, and to the set of
-        // viable paired values with future arguments.
-        PointerArgs[GEP] = ArgIdx;
-        Worklist.push_back(GEP);
-        continue;
+  // Disable any SROA on the argument to arbitrary unary operators.
+  disableSROA(Operand);
+
+  return false;
+}
+
+bool CallAnalyzer::visitICmp(ICmpInst &I) {
+  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+  // First try to handle simplified comparisons.
+  if (!isa<Constant>(LHS))
+    if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
+      LHS = SimpleLHS;
+  if (!isa<Constant>(RHS))
+    if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
+      RHS = SimpleRHS;
+  if (Constant *CLHS = dyn_cast<Constant>(LHS))
+    if (Constant *CRHS = dyn_cast<Constant>(RHS))
+      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
+        SimplifiedValues[&I] = C;
+        return true;
      }
 
-      // Track pointer through casts. Even when the result is not a pointer, it
-      // remains a constant relative to constants derived from other constant
-      // pointers.
-      if (CastInst *CI = dyn_cast<CastInst>(I)) {
-        PointerArgs[CI] = ArgIdx;
-        Worklist.push_back(CI);
-        continue;
+  // Otherwise look for a comparison between constant offset pointers with
+  // a common base.
+  Value *LHSBase, *RHSBase;
+  APInt LHSOffset, RHSOffset;
+  llvm::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
+  if (LHSBase) {
+    llvm::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
+    if (RHSBase && LHSBase == RHSBase) {
+      // We have common bases, fold the icmp to a constant based on the
+      // offsets.
+      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
+      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
+      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
+        SimplifiedValues[&I] = C;
+        ++NumConstantPtrCmps;
+        return true;
      }
+    }
+  }
 
-      // There are two instructions which produce a strict constant value when
-      // applied to two related pointer values. Ignore everything else.
-      if (!isa<ICmpInst>(I) && I->getOpcode() != Instruction::Sub)
-        continue;
-      assert(I->getNumOperands() == 2);
-
-      // Ensure that the two operands are in our set of potentially paired
-      // pointers (or are derived from them).
-      Value *OtherArg = I->getOperand(0);
-      if (OtherArg == V)
-        OtherArg = I->getOperand(1);
-      DenseMap<Value *, unsigned>::const_iterator ArgIt
-        = PointerArgs.find(OtherArg);
-      if (ArgIt == PointerArgs.end())
-        continue;
-      std::pair<unsigned, unsigned> ArgPair(ArgIt->second, ArgIdx);
-      if (ArgPair.first > ArgPair.second)
-        std::swap(ArgPair.first, ArgPair.second);
+  // If the comparison is an equality comparison with null, we can simplify it
+  // for any alloca-derived argument.
+  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)))
+    if (isAllocaDerivedArg(I.getOperand(0))) {
+      // We can actually predict the result of comparisons between an
+      // alloca-derived value and null. Note that this fires regardless of
+      // SROA firing.
+      bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
+      SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
+                                        : ConstantInt::getFalse(I.getType());
+      return true;
+    }
 
-      PointerArgPairWeights[ArgPair]
-        += countCodeReductionForConstant(Metrics, I);
+  // Finally check for SROA candidates in comparisons.
+  Value *SROAArg;
+  DenseMap<Value *, int>::iterator CostIt;
+  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
+    if (isa<ConstantPointerNull>(I.getOperand(1))) {
+      accumulateSROACost(CostIt, InlineConstants::InstrCost);
+      return true;
     }
-  } while (!Worklist.empty());
-}
 
-/// analyzeFunction - Fill in the current structure with information gleaned
-/// from the specified function.
-void InlineCostAnalyzer::FunctionInfo::analyzeFunction(Function *F,
-                                                       const TargetData *TD) {
-  Metrics.analyzeFunction(F, TD);
-
-  // A function with exactly one return has it removed during the inlining
-  // process (see InlineFunction), so don't count it.
-  // FIXME: This knowledge should really be encoded outside of FunctionInfo.
-  if (Metrics.NumRets==1)
-    --Metrics.NumInsts;
-
-  ArgumentWeights.reserve(F->arg_size());
-  DenseMap<Value *, unsigned> PointerArgs;
-  unsigned ArgIdx = 0;
-  for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
-       ++I, ++ArgIdx) {
-    // Count how much code can be eliminated if one of the arguments is
-    // a constant or an alloca.
-    ArgumentWeights.push_back(ArgInfo(countCodeReductionForConstant(Metrics, I),
-                                      countCodeReductionForAlloca(Metrics, I)));
-
-    // If the argument is a pointer, also check for pairs of pointers where
-    // knowing a fixed offset between them allows simplification. This pattern
-    // arises mostly due to STL algorithm patterns where pointers are used as
-    // random access iterators.
-    if (!I->getType()->isPointerTy())
-      continue;
-    PointerArgs[I] = ArgIdx;
-    countCodeReductionForPointerPair(Metrics, PointerArgs, I, ArgIdx);
  }
-}
 
-/// NeverInline - returns true if the function should never be inlined into
-/// any caller
-bool InlineCostAnalyzer::FunctionInfo::NeverInline() {
-  return (Metrics.exposesReturnsTwice || Metrics.isRecursive ||
-          Metrics.containsIndirectBr);
+  return false;
}

-// ConstantFunctionBonus - Figure out how much of a bonus we can get for
-// possibly devirtualizing a function. We'll subtract the size of the function
-// we may wish to inline from the indirect call bonus providing a limit on
-// growth. Leave an upper limit of 0 for the bonus - we don't want to penalize
-// inlining because we decide we don't want to give a bonus for
-// devirtualizing.
-int InlineCostAnalyzer::ConstantFunctionBonus(CallSite CS, Constant *C) {
+bool CallAnalyzer::visitSub(BinaryOperator &I) {
+  // Try to handle a special case: we can fold computing the difference of two
+  // constant-related pointers.
+  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+  Value *LHSBase, *RHSBase;
+  APInt LHSOffset, RHSOffset;
+  llvm::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
+  if (LHSBase) {
+    llvm::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
+    if (RHSBase && LHSBase == RHSBase) {
+      // We have common bases, fold the subtract to a constant based on the
+      // offsets.
+      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
+      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
+      if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
+        SimplifiedValues[&I] = C;
+        ++NumConstantPtrDiffs;
+        return true;
+      }
+    }
+  }
+
+  // Otherwise, fall back to the generic logic for simplifying and handling
+  // instructions.
+  return Base::visitSub(I);
+}
 
-  // This could just be NULL.
-  if (!C) return 0;
+bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
+  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+  if (!isa<Constant>(LHS))
+    if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
+      LHS = SimpleLHS;
+  if (!isa<Constant>(RHS))
+    if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
+      RHS = SimpleRHS;
+  Value *SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, TD);
+  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {
+    SimplifiedValues[&I] = C;
+    return true;
+  }
 
-  Function *F = dyn_cast<Function>(C);
-  if (!F) return 0;
+  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
+  disableSROA(LHS);
+  disableSROA(RHS);
 
-  int Bonus = InlineConstants::IndirectCallBonus + getInlineSize(CS, F);
-  return (Bonus > 0) ? 0 : Bonus;
-}
+  return false;
+}
 
-// CountBonusForConstant - Figure out an approximation for how much per-call
-// performance boost we can expect if the specified value is constant.
-int InlineCostAnalyzer::CountBonusForConstant(Value *V, Constant *C) {
-  unsigned Bonus = 0;
-  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;++UI){
-    User *U = *UI;
-    if (CallInst *CI = dyn_cast<CallInst>(U)) {
-      // Turning an indirect call into a direct call is a BIG win
-      if (CI->getCalledValue() == V)
-        Bonus += ConstantFunctionBonus(CallSite(CI), C);
-    } else if (InvokeInst *II = dyn_cast<InvokeInst>(U)) {
-      // Turning an indirect call into a direct call is a BIG win
-      if (II->getCalledValue() == V)
-        Bonus += ConstantFunctionBonus(CallSite(II), C);
+bool CallAnalyzer::visitLoad(LoadInst &I) {
+  Value *SROAArg;
+  DenseMap<Value *, int>::iterator CostIt;
+  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
+    if (I.isSimple()) {
+      accumulateSROACost(CostIt, InlineConstants::InstrCost);
+      return true;
     }
-    // FIXME: Eliminating conditional branches and switches should
-    // also yield a per-call performance boost.
-    else {
-      // Figure out the bonuses that will accrue due to simple constant
-      // propagation.
-      Instruction &Inst = cast<Instruction>(*U);
-
-      // We can't constant propagate instructions which have effects or
-      // read memory.
-      //
-      // FIXME: It would be nice to capture the fact that a load from a
-      // pointer-to-constant-global is actually a *really* good thing to zap.
-      // Unfortunately, we don't know the pointer that may get propagated here,
-      // so we can't make this decision.
-      if (Inst.mayReadFromMemory() || Inst.mayHaveSideEffects() ||
-          isa<AllocaInst>(Inst))
-        continue;
 
-      bool AllOperandsConstant = true;
-      for (unsigned i = 0, e = Inst.getNumOperands(); i != e; ++i)
-        if (!isa<Constant>(Inst.getOperand(i)) && Inst.getOperand(i) != V) {
-          AllOperandsConstant = false;
-          break;
-        }
+    disableSROA(CostIt);
+  }
 
-      if (AllOperandsConstant)
-        Bonus += CountBonusForConstant(&Inst);
+  return false;
+}
+
+bool CallAnalyzer::visitStore(StoreInst &I) {
+  Value *SROAArg;
+  DenseMap<Value *, int>::iterator CostIt;
+  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
+    if (I.isSimple()) {
+      accumulateSROACost(CostIt, InlineConstants::InstrCost);
+      return true;
    }
+
+    disableSROA(CostIt);
  }

-  return Bonus;
+  return false;
}

-int InlineCostAnalyzer::getInlineSize(CallSite CS, Function *Callee) {
-  // Get information about the callee.
-  FunctionInfo *CalleeFI = &CachedFunctionInfo[Callee];
-
-  // If we haven't calculated this information yet, do so now.
-  if (CalleeFI->Metrics.NumBlocks == 0)
-    CalleeFI->analyzeFunction(Callee, TD);
-
-  // InlineCost - This value measures how good of an inline candidate this call
-  // site is to inline. A lower inline cost make is more likely for the call to
-  // be inlined. This value may go negative.
-  //
-  int InlineCost = 0;
-
-  // Compute any size reductions we can expect due to arguments being passed into
-  // the function.
-  //
-  unsigned ArgNo = 0;
-  CallSite::arg_iterator I = CS.arg_begin();
-  for (Function::arg_iterator FI = Callee->arg_begin(), FE = Callee->arg_end();
-       FI != FE; ++I, ++FI, ++ArgNo) {
-
-    // If an alloca is passed in, inlining this function is likely to allow
-    // significant future optimization possibilities (like scalar promotion, and
-    // scalarization), so encourage the inlining of the function.
-    //
-    if (isa<AllocaInst>(I))
-      InlineCost -= CalleeFI->ArgumentWeights[ArgNo].AllocaWeight;
-
-    // If this is a constant being passed into the function, use the argument
-    // weights calculated for the callee to determine how much will be folded
-    // away with this information.
-    else if (isa<Constant>(I))
-      InlineCost -= CalleeFI->ArgumentWeights[ArgNo].ConstantWeight;
+bool CallAnalyzer::visitCallSite(CallSite CS) {
+  if (CS.isCall() && cast<CallInst>(CS.getInstruction())->canReturnTwice() &&
+      !F.hasFnAttr(Attribute::ReturnsTwice)) {
+    // This aborts the entire analysis.
+    ExposesReturnsTwice = true;
+    return false;
+  }
+
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
+    switch (II->getIntrinsicID()) {
+    default:
+      return Base::visitCallSite(CS);
+
+    case Intrinsic::dbg_declare:
+    case Intrinsic::dbg_value:
+    case Intrinsic::invariant_start:
+    case Intrinsic::invariant_end:
+    case Intrinsic::lifetime_start:
+    case Intrinsic::lifetime_end:
+    case Intrinsic::memset:
+    case Intrinsic::memcpy:
+    case Intrinsic::memmove:
+    case Intrinsic::objectsize:
+    case Intrinsic::ptr_annotation:
+    case Intrinsic::var_annotation:
+      // SROA can usually chew through these intrinsics and they have no cost
+      // so don't pay the price of analyzing them in detail.
+      return true;
+    }
  }

-  const DenseMap<std::pair<unsigned, unsigned>, unsigned> &ArgPairWeights
-    = CalleeFI->PointerArgPairWeights;
-  for (DenseMap<std::pair<unsigned, unsigned>, unsigned>::const_iterator I
-         = ArgPairWeights.begin(), E = ArgPairWeights.end();
-       I != E; ++I)
-    if (CS.getArgument(I->first.first)->stripInBoundsConstantOffsets() ==
-        CS.getArgument(I->first.second)->stripInBoundsConstantOffsets())
-      InlineCost -= I->second;
+  if (Function *F = CS.getCalledFunction()) {
+    if (F == CS.getInstruction()->getParent()->getParent()) {
+      // This flag will fully abort the analysis, so don't bother with anything
+      // else.
+      IsRecursive = true;
+      return false;
+    }
 
-  // Each argument passed in has a cost at both the caller and the callee
-  // sides. Measurements show that each argument costs about the same as an
-  // instruction.
-  InlineCost -= (CS.arg_size() * InlineConstants::InstrCost);
+    if (!callIsSmall(F)) {
+      // We account for the average 1 instruction per call argument setup
+      // here.
+      Cost += CS.arg_size() * InlineConstants::InstrCost;
 
-  // Now that we have considered all of the factors that make the call site more
-  // likely to be inlined, look at factors that make us not want to inline it.
+      // Everything other than inline ASM will also have a significant cost
+      // merely from making the call.
+      if (!isa<InlineAsm>(CS.getCalledValue()))
+        Cost += InlineConstants::CallPenalty;
+    }
 
-  // Calls usually take a long time, so they make the inlining gain smaller.
-  InlineCost += CalleeFI->Metrics.NumCalls * InlineConstants::CallPenalty;
+    return Base::visitCallSite(CS);
+  }
 
-  // Look at the size of the callee. Each instruction counts as 5.
-  InlineCost += CalleeFI->Metrics.NumInsts * InlineConstants::InstrCost;
+  // Otherwise we're in a very special case -- an indirect function call. See
+  // if we can be particularly clever about this.
+  Value *Callee = CS.getCalledValue();
+
+  // First, pay the price of the argument setup. We account for the average
+  // 1 instruction per call argument setup here.
+  Cost += CS.arg_size() * InlineConstants::InstrCost;
+
+  // Next, check if this happens to be an indirect function call to a known
+  // function in this inline context. If not, we've done all we can.
+  Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
+  if (!F)
+    return Base::visitCallSite(CS);
+
+  // If we have a constant that we are calling as a function, we can peer
+  // through it and see the function target. This happens not infrequently
+  // during devirtualization and so we want to give it a hefty bonus for
+  // inlining, but cap that bonus in the event that inlining wouldn't pan
+  // out. Pretend to inline the function, with a custom threshold.
+  CallAnalyzer CA(TD, *F, InlineConstants::IndirectCallThreshold);
+  if (CA.analyzeCall(CS)) {
+    // We were able to inline the indirect call! Subtract the cost from the
+    // bonus we want to apply, but don't go below zero.
+    Cost -= std::max(0, InlineConstants::IndirectCallThreshold - CA.getCost());
+  }
 
-  return InlineCost;
+  return Base::visitCallSite(CS);
}

-int InlineCostAnalyzer::getInlineBonuses(CallSite CS, Function *Callee) {
-  // Get information about the callee.
-  FunctionInfo *CalleeFI = &CachedFunctionInfo[Callee];
-
-  // If we haven't calculated this information yet, do so now.
-  if (CalleeFI->Metrics.NumBlocks == 0)
-    CalleeFI->analyzeFunction(Callee, TD);
-
-  bool isDirectCall = CS.getCalledFunction() == Callee;
-  Instruction *TheCall = CS.getInstruction();
-  int Bonus = 0;
-
-  // If there is only one call of the function, and it has internal linkage,
-  // make it almost guaranteed to be inlined.
-  //
-  if (Callee->hasLocalLinkage() && Callee->hasOneUse() && isDirectCall)
-    Bonus += InlineConstants::LastCallToStaticBonus;
-
-  // If the instruction after the call, or if the normal destination of the
-  // invoke is an unreachable instruction, the function is noreturn. As such,
-  // there is little point in inlining this.
-  if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall)) {
-    if (isa<UnreachableInst>(II->getNormalDest()->begin()))
-      Bonus += InlineConstants::NoreturnPenalty;
-  } else if (isa<UnreachableInst>(++BasicBlock::iterator(TheCall)))
-    Bonus += InlineConstants::NoreturnPenalty;
-
-  // If this function uses the coldcc calling convention, prefer not to inline
-  // it.
-  if (Callee->getCallingConv() == CallingConv::Cold)
-    Bonus += InlineConstants::ColdccPenalty;
-
-  // Add to the inline quality for properties that make the call valuable to
-  // inline. This includes factors that indicate that the result of inlining
-  // the function will be optimizable. Currently this just looks at arguments
-  // passed into the function.
-  //
-  CallSite::arg_iterator I = CS.arg_begin();
-  for (Function::arg_iterator FI = Callee->arg_begin(), FE = Callee->arg_end();
-       FI != FE; ++I, ++FI)
-    // Compute any constant bonus due to inlining we want to give here.
-    if (isa<Constant>(I))
-      Bonus += CountBonusForConstant(FI, cast<Constant>(I));
-
-  return Bonus;
+bool CallAnalyzer::visitInstruction(Instruction &I) {
+  // We found something we don't understand or can't handle. Mark any SROA-able
+  // values in the operand list as no longer viable.
+  for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
+    disableSROA(*OI);
+
+  return false;
}

-// getInlineCost - The heuristic used to determine if we should inline the
-// function call or not.
-//
-InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS) {
-  return getInlineCost(CS, CS.getCalledFunction());
+
+/// \brief Analyze a basic block for its contribution to the inline cost.
+///
+/// This method walks the analyzer over every instruction in the given basic
+/// block and accounts for their cost during inlining at this callsite. It
It +/// aborts early if the threshold has been exceeded or an impossible to inline +/// construct has been detected. It returns false if inlining is no longer +/// viable, and true if inlining remains viable. +bool CallAnalyzer::analyzeBlock(BasicBlock *BB) { + for (BasicBlock::iterator I = BB->begin(), E = llvm::prior(BB->end()); + I != E; ++I) { + ++NumInstructions; + if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy()) + ++NumVectorInstructions; + + // If the instruction simplified to a constant, there is no cost to this + // instruction. Visit the instructions using our InstVisitor to account for + // all of the per-instruction logic. The visit tree returns true if we + // consumed the instruction in any way, and false if the instruction's base + // cost should count against inlining. + if (Base::visit(I)) + ++NumInstructionsSimplified; + else + Cost += InlineConstants::InstrCost; + + // If the visit this instruction detected an uninlinable pattern, abort. + if (IsRecursive || ExposesReturnsTwice || HasDynamicAlloca) + return false; + + if (NumVectorInstructions > NumInstructions/2) + VectorBonus = FiftyPercentVectorBonus; + else if (NumVectorInstructions > NumInstructions/10) + VectorBonus = TenPercentVectorBonus; + else + VectorBonus = 0; + + // Check if we've past the threshold so we don't spin in huge basic + // blocks that will never inline. + if (!AlwaysInline && Cost > (Threshold + VectorBonus)) + return false; + } + + return true; } -InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS, Function *Callee) { - Instruction *TheCall = CS.getInstruction(); - Function *Caller = TheCall->getParent()->getParent(); +/// \brief Compute the base pointer and cumulative constant offsets for V. +/// +/// This strips all constant offsets off of V, leaving it the base pointer, and +/// accumulates the total constant offset applied in the returned constant. It +/// returns 0 if V is not a pointer, and returns the constant '0' if there are +/// no constant offsets applied. +ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) { + if (!TD || !V->getType()->isPointerTy()) + return 0; + + unsigned IntPtrWidth = TD->getPointerSizeInBits(); + APInt Offset = APInt::getNullValue(IntPtrWidth); + + // Even though we don't look through PHI nodes, we could be called on an + // instruction in an unreachable block, which may be on a cycle. + SmallPtrSet<Value *, 4> Visited; + Visited.insert(V); + do { + if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) { + if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset)) + return 0; + V = GEP->getPointerOperand(); + } else if (Operator::getOpcode(V) == Instruction::BitCast) { + V = cast<Operator>(V)->getOperand(0); + } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) { + if (GA->mayBeOverridden()) + break; + V = GA->getAliasee(); + } else { + break; + } + assert(V->getType()->isPointerTy() && "Unexpected operand type!"); + } while (Visited.insert(V)); - // Don't inline functions which can be redefined at link-time to mean - // something else. Don't inline functions marked noinline or call sites - // marked noinline. - if (Callee->mayBeOverridden() || Callee->hasFnAttr(Attribute::NoInline) || - CS.isNoInline()) - return llvm::InlineCost::getNever(); + Type *IntPtrTy = TD->getIntPtrType(V->getContext()); + return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset)); +} - // Get information about the callee. - FunctionInfo *CalleeFI = &CachedFunctionInfo[Callee]; +/// \brief Analyze a call site for potential inlining. 
+///
+/// Returns true if inlining this call is viable, and false if it is not
+/// viable. It computes the cost and adjusts the threshold based on numerous
+/// factors and heuristics. If this method returns false but the computed cost
+/// is below the computed threshold, then inlining was forcibly disabled by
+/// some artifact of the routine.
+bool CallAnalyzer::analyzeCall(CallSite CS) {
+  ++NumCallsAnalyzed;
+
+  // Track whether the post-inlining function would have more than one basic
+  // block. A single basic block is often intended for inlining. Balloon the
+  // threshold by 50% until we pass the single-BB phase.
+  bool SingleBB = true;
+  int SingleBBBonus = Threshold / 2;
+  Threshold += SingleBBBonus;
+
+  // Unless we are always-inlining, perform some tweaks to the cost and
+  // threshold based on the direct callsite information.
+  if (!AlwaysInline) {
+    // We want to more aggressively inline vector-dense kernels, so up the
+    // threshold, and we'll lower it if the % of vector instructions gets too
+    // low.
+    assert(NumInstructions == 0);
+    assert(NumVectorInstructions == 0);
+    FiftyPercentVectorBonus = Threshold;
+    TenPercentVectorBonus = Threshold / 2;
+
+    // Subtract off one instruction per call argument as those will be free after
+    // inlining.
+    Cost -= CS.arg_size() * InlineConstants::InstrCost;
+
+    // If there is only one call of the function, and it has internal linkage,
+    // the cost of inlining it drops dramatically.
+    if (F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction())
+      Cost += InlineConstants::LastCallToStaticBonus;
+
+    // If the instruction after the call, or if the normal destination of the
+    // invoke is an unreachable instruction, the function is noreturn. As such,
+    // there is little point in inlining this unless there is literally zero cost.
+    if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) {
+      if (isa<UnreachableInst>(II->getNormalDest()->begin()))
+        Threshold = 1;
+    } else if (isa<UnreachableInst>(++BasicBlock::iterator(CS.getInstruction())))
+      Threshold = 1;
+
+    // If this function uses the coldcc calling convention, prefer not to inline
+    // it.
+    if (F.getCallingConv() == CallingConv::Cold)
+      Cost += InlineConstants::ColdccPenalty;
+
+    // Check if we're done. This can happen due to bonuses and penalties.
+    if (Cost > Threshold)
+      return false;
+  }

-  if (CalleeFI->Metrics.NumBlocks == 0)
-    CalleeFI->analyzeFunction(Callee, TD);
+  if (F.empty())
+    return true;

-  // If we should never inline this, return a huge cost.
-  if (CalleeFI->NeverInline())
-    return InlineCost::getNever();
+  // Track whether we've seen a return instruction. The first return
+  // instruction is free, as at least one will usually disappear in inlining.
+  bool HasReturn = false;
+
+  // Populate our simplified values by mapping from function arguments to call
+  // arguments with known important simplifications.
+  CallSite::arg_iterator CAI = CS.arg_begin();
+  for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
+       FAI != FAE; ++FAI, ++CAI) {
+    assert(CAI != CS.arg_end());
+    if (Constant *C = dyn_cast<Constant>(CAI))
+      SimplifiedValues[FAI] = C;
+
+    Value *PtrArg = *CAI;
+    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
+      ConstantOffsetPtrs[FAI] = std::make_pair(PtrArg, C->getValue());
+
+      // We can SROA any pointer arguments derived from alloca instructions.
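[Aside: an illustrative example of the argument mapping above, not part of this commit; the IR names and the 4-byte i32 size are assumptions.]

    // Given a call site such as:
    //   %a = alloca [8 x i32]
    //   %p = getelementptr inbounds [8 x i32]* %a, i64 0, i64 2
    //   call void @f(i32* %p)
    // stripAndComputeInBoundsConstantOffsets(%p) strips the inbounds GEP and
    // yields the base %a plus a constant offset of 8 bytes (2 * 4). The formal
    // argument therefore lands in ConstantOffsetPtrs, and because the stripped
    // base is an alloca it is also recorded as an SROA candidate just below.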
+      if (isa<AllocaInst>(PtrArg)) {
+        SROAArgValues[FAI] = PtrArg;
+        SROAArgCosts[PtrArg] = 0;
+      }
+    }
+  }
+  NumConstantArgs = SimplifiedValues.size();
+  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
+  NumAllocaArgs = SROAArgValues.size();
+
+  // The worklist of live basic blocks in the callee *after* inlining. We avoid
+  // adding basic blocks of the callee which can be proven to be dead for this
+  // particular call site in order to get more accurate cost estimates. This
+  // requires a somewhat heavyweight iteration pattern: we need to walk the
+  // basic blocks in a breadth-first order as we insert live successors. To
+  // accomplish this, prioritizing for small iterations because we exit after
+  // crossing our threshold, we use a small-size optimized SetVector.
+  typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
+                    SmallPtrSet<BasicBlock *, 16> > BBSetVector;
+  BBSetVector BBWorklist;
+  BBWorklist.insert(&F.getEntryBlock());
+  // Note that we *must not* cache the size, this loop grows the worklist.
+  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
+    // Bail out the moment we cross the threshold. This means we'll under-count
+    // the cost, but only when undercounting doesn't matter.
+    if (!AlwaysInline && Cost > (Threshold + VectorBonus))
+      break;
+
+    BasicBlock *BB = BBWorklist[Idx];
+    if (BB->empty())
+      continue;

-  // FIXME: It would be nice to kill off CalleeFI->NeverInline. Then we
-  // could move this up and avoid computing the FunctionInfo for
-  // things we are going to just return always inline for. This
-  // requires handling setjmp somewhere else, however.
-  if (!Callee->isDeclaration() && Callee->hasFnAttr(Attribute::AlwaysInline))
-    return InlineCost::getAlways();
+    // Handle the terminator cost here where we can track returns and other
+    // function-wide constructs.
+    TerminatorInst *TI = BB->getTerminator();
+
+    // We never want to inline functions that contain an indirectbr. Inlining
+    // one would be incorrect because all the blockaddresses (in static global
+    // initializers for example) would be referring to the original function,
+    // and this indirect jump would jump from the inlined copy of the function
+    // into the original function, which is extremely undefined behavior.
+    // FIXME: This logic isn't really right; we can safely inline functions
+    // with indirectbr's as long as no other function or global references the
+    // blockaddress of a block within the current function. And as a QOI issue,
+    // if someone is using a blockaddress without an indirectbr, and that
+    // reference somehow ends up in another function or global, we probably
+    // don't want to inline this function.
+    if (isa<IndirectBrInst>(TI))
+      return false;

-  if (CalleeFI->Metrics.usesDynamicAlloca) {
-    // Get information about the caller.
-    FunctionInfo &CallerFI = CachedFunctionInfo[Caller];
+    if (!HasReturn && isa<ReturnInst>(TI))
+      HasReturn = true;
+    else
+      Cost += InlineConstants::InstrCost;

-    // If we haven't calculated this information yet, do so now.
-    if (CallerFI.Metrics.NumBlocks == 0) {
-      CallerFI.analyzeFunction(Caller, TD);
+    // Analyze the cost of this block. If we blow through the threshold, this
+    // returns false, and we can bail out.
+    if (!analyzeBlock(BB)) {
+      if (IsRecursive || ExposesReturnsTwice || HasDynamicAlloca)
+        return false;
+      break;
+    }

-      // Recompute the CalleeFI pointer, getting Caller could have invalidated
-      // it.
-      CalleeFI = &CachedFunctionInfo[Callee];
+    // Add in the live successors by first checking whether we have a terminator
+    // that may be simplified based on the values simplified by this call.
+    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+      if (BI->isConditional()) {
+        Value *Cond = BI->getCondition();
+        if (ConstantInt *SimpleCond
+              = dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
+          BBWorklist.insert(BI->getSuccessor(SimpleCond->isZero() ? 1 : 0));
+          continue;
+        }
+      }
+    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+      Value *Cond = SI->getCondition();
+      if (ConstantInt *SimpleCond
+            = dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
+        BBWorklist.insert(SI->findCaseValue(SimpleCond).getCaseSuccessor());
+        continue;
+      }
    }

-    // Don't inline a callee with dynamic alloca into a caller without them.
-    // Functions containing dynamic alloca's are inefficient in various ways;
-    // don't create more inefficiency.
-    if (!CallerFI.Metrics.usesDynamicAlloca)
-      return InlineCost::getNever();
+    // If we're unable to select a particular successor, just count all of
+    // them.
+    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize; ++TIdx)
+      BBWorklist.insert(TI->getSuccessor(TIdx));
+
+    // If we had any successors at this point, then post-inlining is likely to
+    // have them as well. Note that we assume any basic blocks which existed
+    // due to branches or switches which folded above will also fold after
+    // inlining.
+    if (SingleBB && TI->getNumSuccessors() > 1) {
+      // Take off the bonus we applied to the threshold.
+      Threshold -= SingleBBBonus;
+      SingleBB = false;
+    }
+  }

-  // InlineCost - This value measures how good of an inline candidate this call
-  // site is to inline. A lower inline cost make is more likely for the call to
-  // be inlined. This value may go negative due to the fact that bonuses
-  // are negative numbers.
-  //
-  int InlineCost = getInlineSize(CS, Callee) + getInlineBonuses(CS, Callee);
-  return llvm::InlineCost::get(InlineCost);
-}
+  Threshold += VectorBonus;

-// getInlineFudgeFactor - Return a > 1.0 factor if the inliner should use a
-// higher threshold to determine if the function call should be inlined.
-float InlineCostAnalyzer::getInlineFudgeFactor(CallSite CS) {
-  Function *Callee = CS.getCalledFunction();
-
-  // Get information about the callee.
-  FunctionInfo &CalleeFI = CachedFunctionInfo[Callee];
-
-  // If we haven't calculated this information yet, do so now.
-  if (CalleeFI.Metrics.NumBlocks == 0)
-    CalleeFI.analyzeFunction(Callee, TD);
-
-  float Factor = 1.0f;
-  // Single BB functions are often written to be inlined.
-  if (CalleeFI.Metrics.NumBlocks == 1)
-    Factor += 0.5f;
-
-  // Be more aggressive if the function contains a good chunk (if it mades up
-  // at least 10% of the instructions) of vector instructions.
-  if (CalleeFI.Metrics.NumVectorInsts > CalleeFI.Metrics.NumInsts/2)
-    Factor += 2.0f;
-  else if (CalleeFI.Metrics.NumVectorInsts > CalleeFI.Metrics.NumInsts/10)
-    Factor += 1.5f;
-  return Factor;
+  return AlwaysInline || Cost < Threshold;
 }

-/// growCachedCostInfo - update the cached cost info for Caller after Callee has
-/// been inlined.
-void
-InlineCostAnalyzer::growCachedCostInfo(Function *Caller, Function *Callee) {
-  CodeMetrics &CallerMetrics = CachedFunctionInfo[Caller].Metrics;
+/// \brief Dump stats about this call's analysis.
+void CallAnalyzer::dump() { +#define DEBUG_PRINT_STAT(x) llvm::dbgs() << " " #x ": " << x << "\n" + DEBUG_PRINT_STAT(NumConstantArgs); + DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs); + DEBUG_PRINT_STAT(NumAllocaArgs); + DEBUG_PRINT_STAT(NumConstantPtrCmps); + DEBUG_PRINT_STAT(NumConstantPtrDiffs); + DEBUG_PRINT_STAT(NumInstructionsSimplified); + DEBUG_PRINT_STAT(SROACostSavings); + DEBUG_PRINT_STAT(SROACostSavingsLost); +#undef DEBUG_PRINT_STAT +} - // For small functions we prefer to recalculate the cost for better accuracy. - if (CallerMetrics.NumBlocks < 10 && CallerMetrics.NumInsts < 1000) { - resetCachedCostInfo(Caller); - return; - } +InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS, int Threshold) { + return getInlineCost(CS, CS.getCalledFunction(), Threshold); +} - // For large functions, we can save a lot of computation time by skipping - // recalculations. - if (CallerMetrics.NumCalls > 0) - --CallerMetrics.NumCalls; +InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS, Function *Callee, + int Threshold) { + // Don't inline functions which can be redefined at link-time to mean + // something else. Don't inline functions marked noinline or call sites + // marked noinline. + if (!Callee || Callee->mayBeOverridden() || + Callee->hasFnAttr(Attribute::NoInline) || CS.isNoInline()) + return llvm::InlineCost::getNever(); - if (Callee == 0) return; + DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() << "...\n"); - CodeMetrics &CalleeMetrics = CachedFunctionInfo[Callee].Metrics; + CallAnalyzer CA(TD, *Callee, Threshold); + bool ShouldInline = CA.analyzeCall(CS); - // If we don't have metrics for the callee, don't recalculate them just to - // update an approximation in the caller. Instead, just recalculate the - // caller info from scratch. - if (CalleeMetrics.NumBlocks == 0) { - resetCachedCostInfo(Caller); - return; - } + DEBUG(CA.dump()); - // Since CalleeMetrics were already calculated, we know that the CallerMetrics - // reference isn't invalidated: both were in the DenseMap. - CallerMetrics.usesDynamicAlloca |= CalleeMetrics.usesDynamicAlloca; - - // FIXME: If any of these three are true for the callee, the callee was - // not inlined into the caller, so I think they're redundant here. - CallerMetrics.exposesReturnsTwice |= CalleeMetrics.exposesReturnsTwice; - CallerMetrics.isRecursive |= CalleeMetrics.isRecursive; - CallerMetrics.containsIndirectBr |= CalleeMetrics.containsIndirectBr; - - CallerMetrics.NumInsts += CalleeMetrics.NumInsts; - CallerMetrics.NumBlocks += CalleeMetrics.NumBlocks; - CallerMetrics.NumCalls += CalleeMetrics.NumCalls; - CallerMetrics.NumVectorInsts += CalleeMetrics.NumVectorInsts; - CallerMetrics.NumRets += CalleeMetrics.NumRets; - - // analyzeBasicBlock counts each function argument as an inst. - if (CallerMetrics.NumInsts >= Callee->arg_size()) - CallerMetrics.NumInsts -= Callee->arg_size(); - else - CallerMetrics.NumInsts = 0; - - // We are not updating the argument weights. We have already determined that - // Caller is a fairly large function, so we accept the loss of precision. -} + // Check if there was a reason to force inlining or no inlining. 
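[Aside: a sketch of the decision table implied by the checks below; it restates the code, added here only for clarity.]

    // ShouldInline | Cost vs. Threshold | Result
    // false        | Cost <  Threshold  | InlineCost::getNever(): the analysis
    //              |                    | was vetoed (indirectbr, recursion,
    //              |                    | returns_twice, dynamic alloca)
    // true         | Cost >= Threshold  | InlineCost::getAlways(): inlining was
    //              |                    | forced despite the cost
    // otherwise    |                    | InlineCost::get(Cost, Threshold): a
    //              |                    | variable cost for the caller to weigh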
+  if (!ShouldInline && CA.getCost() < CA.getThreshold())
+    return InlineCost::getNever();
+  if (ShouldInline && CA.getCost() >= CA.getThreshold())
+    return InlineCost::getAlways();

-/// clear - empty the cache of inline costs
-void InlineCostAnalyzer::clear() {
-  CachedFunctionInfo.clear();
+  return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
 }
diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index 72e33d1..16e7a72 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -21,6 +21,7 @@
 #include "llvm/GlobalAlias.h"
 #include "llvm/Operator.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/ConstantFolding.h"
@@ -709,7 +710,7 @@ static Constant *stripAndComputeConstantOffsets(const TargetData &TD,
   Visited.insert(V);
   do {
     if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
-      if (!accumulateGEPOffset(TD, GEP, Offset))
+      if (!GEP->isInBounds() || !accumulateGEPOffset(TD, GEP, Offset))
         break;
       V = GEP->getPointerOperand();
     } else if (Operator::getOpcode(V) == Instruction::BitCast) {
@@ -1590,6 +1591,45 @@ static Value *ExtractEquivalentCondition(Value *V, CmpInst::Predicate Pred,
   return 0;
 }

+static Constant *computePointerICmp(const TargetData &TD,
+                                    CmpInst::Predicate Pred,
+                                    Value *LHS, Value *RHS) {
+  // We can only fold certain predicates on pointer comparisons.
+  switch (Pred) {
+  default:
+    return 0;
+
+  // Equality comparisons are easy to fold.
+  case CmpInst::ICMP_EQ:
+  case CmpInst::ICMP_NE:
+    break;
+
+  // We can only handle unsigned relational comparisons because 'inbounds' on
+  // a GEP only protects against unsigned wrapping.
+  case CmpInst::ICMP_UGT:
+  case CmpInst::ICMP_UGE:
+  case CmpInst::ICMP_ULT:
+  case CmpInst::ICMP_ULE:
+    // However, we have to switch them to their signed variants to handle
+    // negative indices from the base pointer.
+    Pred = ICmpInst::getSignedPredicate(Pred);
+    break;
+  }
+
+  Constant *LHSOffset = stripAndComputeConstantOffsets(TD, LHS);
+  if (!LHSOffset)
+    return 0;
+  Constant *RHSOffset = stripAndComputeConstantOffsets(TD, RHS);
+  if (!RHSOffset)
+    return 0;
+
+  // If LHS and RHS are not related via constant offsets to the same base
+  // value, there is nothing we can do here.
+  if (LHS != RHS)
+    return 0;
+
+  return ConstantExpr::getICmp(Pred, LHSOffset, RHSOffset);
+}

 /// SimplifyICmpInst - Given operands for an ICmpInst, see if we can
 /// fold the result. If not, this returns null.
@@ -2310,7 +2350,12 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
       return getFalse(ITy);
   }

-  // Simplify comparisons of GEPs.
+  // Simplify comparisons of related pointers using a powerful, recursive
+  // GEP-walk when we have target data available.
+  if (Q.TD && LHS->getType()->isPointerTy() && RHS->getType()->isPointerTy())
+    if (Constant *C = computePointerICmp(*Q.TD, Pred, LHS, RHS))
+      return C;
+
   if (GetElementPtrInst *GLHS = dyn_cast<GetElementPtrInst>(LHS)) {
     if (GEPOperator *GRHS = dyn_cast<GEPOperator>(RHS)) {
       if (GLHS->getPointerOperand() == GRHS->getPointerOperand() &&
@@ -2818,58 +2863,84 @@ Value *llvm::SimplifyInstruction(Instruction *I, const TargetData *TD,
   return Result == I ? UndefValue::get(I->getType()) : Result;
 }

-/// ReplaceAndSimplifyAllUses - Perform From->replaceAllUsesWith(To) and then
-/// delete the From instruction.
In addition to a basic RAUW, this does a
-/// recursive simplification of the newly formed instructions. This catches
-/// things where one simplification exposes other opportunities. This only
-/// simplifies and deletes scalar operations, it does not change the CFG.
+/// \brief Implementation of recursive simplification through an instruction's
+/// uses.
 ///
-void llvm::ReplaceAndSimplifyAllUses(Instruction *From, Value *To,
-                                     const TargetData *TD,
-                                     const TargetLibraryInfo *TLI,
-                                     const DominatorTree *DT) {
-  assert(From != To && "ReplaceAndSimplifyAllUses(X,X) is not valid!");
-
-  // FromHandle/ToHandle - This keeps a WeakVH on the from/to values so that
-  // we can know if it gets deleted out from under us or replaced in a
-  // recursive simplification.
-  WeakVH FromHandle(From);
-  WeakVH ToHandle(To);
-
-  while (!From->use_empty()) {
-    // Update the instruction to use the new value.
-    Use &TheUse = From->use_begin().getUse();
-    Instruction *User = cast<Instruction>(TheUse.getUser());
-    TheUse = To;
-
-    // Check to see if the instruction can be folded due to the operand
-    // replacement. For example changing (or X, Y) into (or X, -1) can replace
-    // the 'or' with -1.
-    Value *SimplifiedVal;
-    {
-      // Sanity check to make sure 'User' doesn't dangle across
-      // SimplifyInstruction.
-      AssertingVH<> UserHandle(User);
-
-      SimplifiedVal = SimplifyInstruction(User, TD, TLI, DT);
-      if (SimplifiedVal == 0) continue;
-    }
+/// This is the common implementation of the recursive simplification routines.
+/// If we have a pre-simplified value in 'SimpleV', that is forcibly used to
+/// replace the instruction 'I'. Otherwise, we simply add 'I' to the list of
+/// instructions to process and attempt to simplify it using
+/// InstructionSimplify.
+///
+/// This routine returns 'true' only when *it* simplifies something. The passed
+/// in simplified value does not count toward this.
+static bool replaceAndRecursivelySimplifyImpl(Instruction *I, Value *SimpleV,
+                                              const TargetData *TD,
+                                              const TargetLibraryInfo *TLI,
+                                              const DominatorTree *DT) {
+  bool Simplified = false;
+  SmallSetVector<Instruction *, 8> Worklist;
+
+  // If we have an explicit value to collapse to, do that round of the
+  // simplification loop by hand initially.
+  if (SimpleV) {
+    for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE;
+         ++UI)
+      if (*UI != I)
+        Worklist.insert(cast<Instruction>(*UI));
+
+    // Replace the instruction with its simplified value.
+    I->replaceAllUsesWith(SimpleV);
+
+    // Gracefully handle edge cases where the instruction is not wired into any
+    // parent block.
+    if (I->getParent())
+      I->eraseFromParent();
+  } else {
+    Worklist.insert(I);
+  }
+
+  // Note that we must test the size on each iteration, the worklist can grow.
+  for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) {
+    I = Worklist[Idx];
+
+    // See if this instruction simplifies.
+    SimpleV = SimplifyInstruction(I, TD, TLI, DT);
+    if (!SimpleV)
+      continue;
+
+    Simplified = true;

-    // Recursively simplify this user to the new value.
-    ReplaceAndSimplifyAllUses(User, SimplifiedVal, TD, TLI, DT);
-    From = dyn_cast_or_null<Instruction>((Value*)FromHandle);
-    To = ToHandle;
+    // Stash away all the uses of the old instruction so we can check them for
+    // recursive simplifications after a RAUW. This is cheaper than checking all
+    // uses of To on the recursive step in most cases.
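[Aside: a small worked trace of the worklist pattern below; the instructions are made up.]

    // Suppose x = add a, 0; y = or x, x; z = xor y, y.
    //   Visit x: it simplifies to a; stash x's user y, RAUW x -> a, erase x.
    //   Visit y: now y = or a, a, which simplifies to a; stash z, RAUW, erase.
    //   Visit z: now z = xor a, a, which simplifies to 0; no users remain.
    // The SmallSetVector guarantees each user is queued at most once even when
    // several RAUWs rediscover it.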
+ for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE; + ++UI) + Worklist.insert(cast<Instruction>(*UI)); - assert(ToHandle && "To value deleted by recursive simplification?"); + // Replace the instruction with its simplified value. + I->replaceAllUsesWith(SimpleV); - // If the recursive simplification ended up revisiting and deleting - // 'From' then we're done. - if (From == 0) - return; + // Gracefully handle edge cases where the instruction is not wired into any + // parent block. + if (I->getParent()) + I->eraseFromParent(); } + return Simplified; +} - // If 'From' has value handles referring to it, do a real RAUW to update them. - From->replaceAllUsesWith(To); +bool llvm::recursivelySimplifyInstruction(Instruction *I, + const TargetData *TD, + const TargetLibraryInfo *TLI, + const DominatorTree *DT) { + return replaceAndRecursivelySimplifyImpl(I, 0, TD, TLI, DT); +} - From->eraseFromParent(); +bool llvm::replaceAndRecursivelySimplify(Instruction *I, Value *SimpleV, + const TargetData *TD, + const TargetLibraryInfo *TLI, + const DominatorTree *DT) { + assert(I != SimpleV && "replaceAndRecursivelySimplify(X,X) is not valid!"); + assert(SimpleV && "Must provide a simplified value."); + return replaceAndRecursivelySimplifyImpl(I, SimpleV, TD, TLI, DT); } diff --git a/lib/Analysis/Lint.cpp b/lib/Analysis/Lint.cpp index 971065f..83bdf52 100644 --- a/lib/Analysis/Lint.cpp +++ b/lib/Analysis/Lint.cpp @@ -416,9 +416,8 @@ void Lint::visitMemoryReference(Instruction &I, if (Align != 0) { unsigned BitWidth = TD->getTypeSizeInBits(Ptr->getType()); - APInt Mask = APInt::getAllOnesValue(BitWidth), - KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - ComputeMaskedBits(Ptr, Mask, KnownZero, KnownOne, TD); + APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); + ComputeMaskedBits(Ptr, KnownZero, KnownOne, TD); Assert1(!(KnownOne & APInt::getLowBitsSet(BitWidth, Log2_32(Align))), "Undefined behavior: Memory reference address is misaligned", &I); } @@ -476,9 +475,8 @@ static bool isZero(Value *V, TargetData *TD) { if (isa<UndefValue>(V)) return true; unsigned BitWidth = cast<IntegerType>(V->getType())->getBitWidth(); - APInt Mask = APInt::getAllOnesValue(BitWidth), - KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - ComputeMaskedBits(V, Mask, KnownZero, KnownOne, TD); + APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); + ComputeMaskedBits(V, KnownZero, KnownOne, TD); return KnownZero.isAllOnesValue(); } diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp index 858cc64..f7a60a1 100644 --- a/lib/Analysis/LoopInfo.cpp +++ b/lib/Analysis/LoopInfo.cpp @@ -205,6 +205,17 @@ bool Loop::isLoopSimplifyForm() const { return getLoopPreheader() && getLoopLatch() && hasDedicatedExits(); } +/// isSafeToClone - Return true if the loop body is safe to clone in practice. +/// Routines that reform the loop CFG and split edges often fail on indirectbr. +bool Loop::isSafeToClone() const { + // Return false if any loop blocks contain indirectbrs. + for (Loop::block_iterator I = block_begin(), E = block_end(); I != E; ++I) { + if (isa<IndirectBrInst>((*I)->getTerminator())) + return false; + } + return true; +} + /// hasDedicatedExits - Return true if no exit block for the loop /// has a predecessor that is outside the loop. 
bool Loop::hasDedicatedExits() const { diff --git a/lib/Analysis/LoopPass.cpp b/lib/Analysis/LoopPass.cpp index 5ba1f40..aba700a 100644 --- a/lib/Analysis/LoopPass.cpp +++ b/lib/Analysis/LoopPass.cpp @@ -14,10 +14,8 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/LoopPass.h" -#include "llvm/DebugInfoProbe.h" #include "llvm/Assembly/PrintModulePass.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ManagedStatic.h" #include "llvm/Support/Timer.h" using namespace llvm; @@ -54,20 +52,6 @@ char PrintLoopPass::ID = 0; } //===----------------------------------------------------------------------===// -// DebugInfoProbe - -static DebugInfoProbeInfo *TheDebugProbe; -static void createDebugInfoProbe() { - if (TheDebugProbe) return; - - // Constructed the first time this is called. This guarantees that the - // object will be constructed, if -enable-debug-info-probe is set, - // before static globals, thus it will be destroyed before them. - static ManagedStatic<DebugInfoProbeInfo> DIP; - TheDebugProbe = &*DIP; -} - -//===----------------------------------------------------------------------===// // LPPassManager // @@ -195,7 +179,6 @@ void LPPassManager::getAnalysisUsage(AnalysisUsage &Info) const { bool LPPassManager::runOnFunction(Function &F) { LI = &getAnalysis<LoopInfo>(); bool Changed = false; - createDebugInfoProbe(); // Collect inherited analysis from Module level pass manager. populateInheritedAnalysis(TPM->activeStack); @@ -227,21 +210,19 @@ bool LPPassManager::runOnFunction(Function &F) { // Run all passes on the current Loop. for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) { LoopPass *P = getContainedPass(Index); + dumpPassInfo(P, EXECUTION_MSG, ON_LOOP_MSG, CurrentLoop->getHeader()->getName()); dumpRequiredSet(P); initializeAnalysisImpl(P); - if (TheDebugProbe) - TheDebugProbe->initialize(P, F); + { PassManagerPrettyStackEntry X(P, *CurrentLoop->getHeader()); TimeRegion PassTimer(getPassTimer(P)); Changed |= P->runOnLoop(CurrentLoop, *this); } - if (TheDebugProbe) - TheDebugProbe->finalize(P, F); if (Changed) dumpPassInfo(P, MODIFICATION_MSG, ON_LOOP_MSG, diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 0c0ceeb..205227c 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -3261,9 +3261,8 @@ ScalarEvolution::GetMinTrailingZeros(const SCEV *S) { if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) { // For a SCEVUnknown, ask ValueTracking. unsigned BitWidth = getTypeSizeInBits(U->getType()); - APInt Mask = APInt::getAllOnesValue(BitWidth); APInt Zeros(BitWidth, 0), Ones(BitWidth, 0); - ComputeMaskedBits(U->getValue(), Mask, Zeros, Ones); + ComputeMaskedBits(U->getValue(), Zeros, Ones); return Zeros.countTrailingOnes(); } @@ -3401,9 +3400,8 @@ ScalarEvolution::getUnsignedRange(const SCEV *S) { if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) { // For a SCEVUnknown, ask ValueTracking. - APInt Mask = APInt::getAllOnesValue(BitWidth); APInt Zeros(BitWidth, 0), Ones(BitWidth, 0); - ComputeMaskedBits(U->getValue(), Mask, Zeros, Ones, TD); + ComputeMaskedBits(U->getValue(), Zeros, Ones, TD); if (Ones == ~Zeros + 1) return setUnsignedRange(U, ConservativeResult); return setUnsignedRange(U, @@ -3660,9 +3658,8 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) { // knew about to reconstruct a low-bits mask value. 
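[Aside: an illustrative reading of the comment above; the exact rewrite depends on code beyond this excerpt.]

    // e.g. for (and i32 %x, 255), if ComputeMaskedBits proves the top 24 bits
    // of %x are already zero, the constant covers every bit that can be set,
    // and SCEV can model the 'and' as zext(trunc(%x to i8) to i32) rather
    // than as an opaque bitwise operation.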
unsigned LZ = A.countLeadingZeros(); unsigned BitWidth = A.getBitWidth(); - APInt AllOnes = APInt::getAllOnesValue(BitWidth); APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - ComputeMaskedBits(U->getOperand(0), AllOnes, KnownZero, KnownOne, TD); + ComputeMaskedBits(U->getOperand(0), KnownZero, KnownOne, TD); APInt EffectiveMask = APInt::getLowBitsSet(BitWidth, BitWidth - LZ); @@ -4619,6 +4616,10 @@ ScalarEvolution::ComputeLoadConstantCompareExitLimit( Indexes.push_back(0); } + // Loop-invariant loads may be a byproduct of loop optimization. Skip them. + if (!VarIdx) + return getCouldNotCompute(); + // Okay, we know we have a (load (gep GV, 0, X)) comparison with a constant. // Check to see if X is a loop variant variable value now. const SCEV *Idx = getSCEV(VarIdx); @@ -6845,7 +6846,7 @@ ScalarEvolution::computeBlockDisposition(const SCEV *S, const BasicBlock *BB) { return ProperlyDominatesBlock; case scCouldNotCompute: llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!"); - default: + default: llvm_unreachable("Unknown SCEV kind!"); } } diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp index 01e00ca..a430f62 100644 --- a/lib/Analysis/ValueTracking.cpp +++ b/lib/Analysis/ValueTracking.cpp @@ -20,8 +20,10 @@ #include "llvm/GlobalAlias.h" #include "llvm/IntrinsicInst.h" #include "llvm/LLVMContext.h" +#include "llvm/Metadata.h" #include "llvm/Operator.h" #include "llvm/Target/TargetData.h" +#include "llvm/Support/ConstantRange.h" #include "llvm/Support/GetElementPtrTypeIterator.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/PatternMatch.h" @@ -42,7 +44,6 @@ static unsigned getBitWidth(Type *Ty, const TargetData *TD) { } static void ComputeMaskedBitsAddSub(bool Add, Value *Op0, Value *Op1, bool NSW, - const APInt &Mask, APInt &KnownZero, APInt &KnownOne, APInt &KnownZero2, APInt &KnownOne2, const TargetData *TD, unsigned Depth) { @@ -52,11 +53,11 @@ static void ComputeMaskedBitsAddSub(bool Add, Value *Op0, Value *Op1, bool NSW, // than C (i.e. no wrap-around can happen). For example, 20-X is // positive if we can prove that X is >= 0 and < 16. if (!CLHS->getValue().isNegative()) { - unsigned BitWidth = Mask.getBitWidth(); + unsigned BitWidth = KnownZero.getBitWidth(); unsigned NLZ = (CLHS->getValue()+1).countLeadingZeros(); // NLZ can't be BitWidth with no sign bit APInt MaskV = APInt::getHighBitsSet(BitWidth, NLZ+1); - llvm::ComputeMaskedBits(Op1, MaskV, KnownZero2, KnownOne2, TD, Depth+1); + llvm::ComputeMaskedBits(Op1, KnownZero2, KnownOne2, TD, Depth+1); // If all of the MaskV bits are known to be zero, then we know the // output top bits are zero, because we now know that the output is @@ -64,27 +65,25 @@ static void ComputeMaskedBitsAddSub(bool Add, Value *Op0, Value *Op1, bool NSW, if ((KnownZero2 & MaskV) == MaskV) { unsigned NLZ2 = CLHS->getValue().countLeadingZeros(); // Top bits known zero. - KnownZero = APInt::getHighBitsSet(BitWidth, NLZ2) & Mask; + KnownZero = APInt::getHighBitsSet(BitWidth, NLZ2); } } } } - unsigned BitWidth = Mask.getBitWidth(); + unsigned BitWidth = KnownZero.getBitWidth(); // If one of the operands has trailing zeros, then the bits that the // other operand has in those bit positions will be preserved in the // result. For an add, this works with either operand. For a subtract, // this only works if the known zeros are in the right operand. 
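[Aside: a worked i8 example of the trailing-zero rule above.]

    // If Op0 = A << 4 (low four bits known zero) and Op1 = B << 2 (low two
    // bits known zero), then Op0 + Op1 has at least min(4, 2) = 2 known-zero
    // low bits: no carry can reach a position below the lowest possibly-set
    // bit of either operand.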
APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0); - APInt Mask2 = APInt::getLowBitsSet(BitWidth, - BitWidth - Mask.countLeadingZeros()); - llvm::ComputeMaskedBits(Op0, Mask2, LHSKnownZero, LHSKnownOne, TD, Depth+1); + llvm::ComputeMaskedBits(Op0, LHSKnownZero, LHSKnownOne, TD, Depth+1); assert((LHSKnownZero & LHSKnownOne) == 0 && "Bits known to be one AND zero?"); unsigned LHSKnownZeroOut = LHSKnownZero.countTrailingOnes(); - llvm::ComputeMaskedBits(Op1, Mask2, KnownZero2, KnownOne2, TD, Depth+1); + llvm::ComputeMaskedBits(Op1, KnownZero2, KnownOne2, TD, Depth+1); assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); unsigned RHSKnownZeroOut = KnownZero2.countTrailingOnes(); @@ -109,7 +108,7 @@ static void ComputeMaskedBitsAddSub(bool Add, Value *Op0, Value *Op1, bool NSW, } // Are we still trying to solve for the sign bit? - if (Mask.isNegative() && !KnownZero.isNegative() && !KnownOne.isNegative()) { + if (!KnownZero.isNegative() && !KnownOne.isNegative()) { if (NSW) { if (Add) { // Adding two positive numbers can't wrap into negative @@ -131,21 +130,19 @@ static void ComputeMaskedBitsAddSub(bool Add, Value *Op0, Value *Op1, bool NSW, } static void ComputeMaskedBitsMul(Value *Op0, Value *Op1, bool NSW, - const APInt &Mask, APInt &KnownZero, APInt &KnownOne, APInt &KnownZero2, APInt &KnownOne2, const TargetData *TD, unsigned Depth) { - unsigned BitWidth = Mask.getBitWidth(); - APInt Mask2 = APInt::getAllOnesValue(BitWidth); - ComputeMaskedBits(Op1, Mask2, KnownZero, KnownOne, TD, Depth+1); - ComputeMaskedBits(Op0, Mask2, KnownZero2, KnownOne2, TD, Depth+1); + unsigned BitWidth = KnownZero.getBitWidth(); + ComputeMaskedBits(Op1, KnownZero, KnownOne, TD, Depth+1); + ComputeMaskedBits(Op0, KnownZero2, KnownOne2, TD, Depth+1); assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); bool isKnownNegative = false; bool isKnownNonNegative = false; // If the multiplication is known not to overflow, compute the sign bit. - if (Mask.isNegative() && NSW) { + if (NSW) { if (Op0 == Op1) { // The product of a number with itself is non-negative. isKnownNonNegative = true; @@ -182,7 +179,6 @@ static void ComputeMaskedBitsMul(Value *Op0, Value *Op1, bool NSW, LeadZ = std::min(LeadZ, BitWidth); KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ) | APInt::getHighBitsSet(BitWidth, LeadZ); - KnownZero &= Mask; // Only make use of no-wrap flags if we failed to compute the sign bit // directly. This matters if the multiplication always overflows, in @@ -195,10 +191,28 @@ static void ComputeMaskedBitsMul(Value *Op0, Value *Op1, bool NSW, KnownOne.setBit(BitWidth - 1); } -/// ComputeMaskedBits - Determine which of the bits specified in Mask are -/// known to be either zero or one and return them in the KnownZero/KnownOne -/// bit sets. This code only analyzes bits in Mask, in order to short-circuit -/// processing. +void llvm::computeMaskedBitsLoad(const MDNode &Ranges, APInt &KnownZero) { + unsigned BitWidth = KnownZero.getBitWidth(); + unsigned NumRanges = Ranges.getNumOperands() / 2; + assert(NumRanges >= 1); + + // Use the high end of the ranges to find leading zeros. 
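[Aside: a worked example for the loop below; the metadata values are hypothetical.]

    // An i32 load annotated with the !range pairs [0, 256) and [1024, 4096):
    //   pair one: Upper - 1 = 255  has 24 leading zeros;
    //   pair two: Upper - 1 = 4095 has 20 leading zeros.
    // MinLeadingZeros = 20, so KnownZero = 0xFFFFF000: every value the load
    // can produce fits in its low 12 bits.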
+  unsigned MinLeadingZeros = BitWidth;
+  for (unsigned i = 0; i < NumRanges; ++i) {
+    ConstantInt *Lower = cast<ConstantInt>(Ranges.getOperand(2*i + 0));
+    ConstantInt *Upper = cast<ConstantInt>(Ranges.getOperand(2*i + 1));
+    ConstantRange Range(Lower->getValue(), Upper->getValue());
+    if (Range.isWrappedSet())
+      MinLeadingZeros = 0; // -1 has no zeros
+    unsigned LeadingZeros = (Upper->getValue() - 1).countLeadingZeros();
+    MinLeadingZeros = std::min(LeadingZeros, MinLeadingZeros);
+  }
+
+  KnownZero = APInt::getHighBitsSet(BitWidth, MinLeadingZeros);
+}
+/// ComputeMaskedBits - Determine which of the bits are known to be either zero
+/// or one and return them in the KnownZero/KnownOne bit sets.
+///
 /// NOTE: we cannot consider 'undef' to be "IsZero" here. The problem is that
 /// we cannot optimize based on the assumption that it is zero without changing
 /// it to be an explicit zero. If we don't change it to zero, other code could
@@ -208,15 +222,15 @@ static void ComputeMaskedBitsMul(Value *Op0, Value *Op1, bool NSW,
 ///
 /// This function is defined on values with integer type, values with pointer
 /// type (but only if TD is non-null), and vectors of integers. In the case
-/// where V is a vector, the mask, known zero, and known one values are the
+/// where V is a vector, the known zero and known one values are the
 /// same width as the vector element, and the bit is set only if it is true
 /// for all of the elements in the vector.
-void llvm::ComputeMaskedBits(Value *V, const APInt &Mask,
-                             APInt &KnownZero, APInt &KnownOne,
+void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne,
                              const TargetData *TD, unsigned Depth) {
   assert(V && "No Value?");
   assert(Depth <= MaxDepth && "Limit Search Depth");
-  unsigned BitWidth = Mask.getBitWidth();
+  unsigned BitWidth = KnownZero.getBitWidth();
+
   assert((V->getType()->isIntOrIntVectorTy() ||
           V->getType()->getScalarType()->isPointerTy()) &&
          "Not integer or pointer type!");
@@ -230,15 +244,15 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask,

   if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
     // We know all of the bits for a constant!
-    KnownOne = CI->getValue() & Mask;
-    KnownZero = ~KnownOne & Mask;
+    KnownOne = CI->getValue();
+    KnownZero = ~KnownOne;
     return;
   }
   // Null and aggregate-zero are all-zeros.
   if (isa<ConstantPointerNull>(V) ||
       isa<ConstantAggregateZero>(V)) {
     KnownOne.clearAllBits();
-    KnownZero = Mask;
+    KnownZero = APInt::getAllOnesValue(BitWidth);
     return;
   }
   // Handle a constant vector by taking the intersection of the known bits of
@@ -275,8 +289,8 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask,
       }
     }
     if (Align > 0)
-      KnownZero = Mask & APInt::getLowBitsSet(BitWidth,
-                                              CountTrailingZeros_32(Align));
+      KnownZero = APInt::getLowBitsSet(BitWidth,
+                                       CountTrailingZeros_32(Align));
     else
       KnownZero.clearAllBits();
     KnownOne.clearAllBits();
@@ -288,8 +302,7 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask,
     if (GA->mayBeOverridden()) {
       KnownZero.clearAllBits(); KnownOne.clearAllBits();
     } else {
-      ComputeMaskedBits(GA->getAliasee(), Mask, KnownZero, KnownOne,
-                        TD, Depth+1);
+      ComputeMaskedBits(GA->getAliasee(), KnownZero, KnownOne, TD, Depth+1);
     }
     return;
   }
@@ -298,15 +311,15 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask,
     // Get alignment information off byval arguments if specified in the IR.
if (A->hasByValAttr()) if (unsigned Align = A->getParamAlignment()) - KnownZero = Mask & APInt::getLowBitsSet(BitWidth, - CountTrailingZeros_32(Align)); + KnownZero = APInt::getLowBitsSet(BitWidth, + CountTrailingZeros_32(Align)); return; } // Start out not knowing anything. KnownZero.clearAllBits(); KnownOne.clearAllBits(); - if (Depth == MaxDepth || Mask == 0) + if (Depth == MaxDepth) return; // Limit search depth. Operator *I = dyn_cast<Operator>(V); @@ -315,12 +328,14 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask, APInt KnownZero2(KnownZero), KnownOne2(KnownOne); switch (I->getOpcode()) { default: break; + case Instruction::Load: + if (MDNode *MD = cast<LoadInst>(I)->getMetadata(LLVMContext::MD_range)) + computeMaskedBitsLoad(*MD, KnownZero); + return; case Instruction::And: { // If either the LHS or the RHS are Zero, the result is zero. - ComputeMaskedBits(I->getOperand(1), Mask, KnownZero, KnownOne, TD, Depth+1); - APInt Mask2(Mask & ~KnownZero); - ComputeMaskedBits(I->getOperand(0), Mask2, KnownZero2, KnownOne2, TD, - Depth+1); + ComputeMaskedBits(I->getOperand(1), KnownZero, KnownOne, TD, Depth+1); + ComputeMaskedBits(I->getOperand(0), KnownZero2, KnownOne2, TD, Depth+1); assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); @@ -331,10 +346,8 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask, return; } case Instruction::Or: { - ComputeMaskedBits(I->getOperand(1), Mask, KnownZero, KnownOne, TD, Depth+1); - APInt Mask2(Mask & ~KnownOne); - ComputeMaskedBits(I->getOperand(0), Mask2, KnownZero2, KnownOne2, TD, - Depth+1); + ComputeMaskedBits(I->getOperand(1), KnownZero, KnownOne, TD, Depth+1); + ComputeMaskedBits(I->getOperand(0), KnownZero2, KnownOne2, TD, Depth+1); assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); @@ -345,9 +358,8 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask, return; } case Instruction::Xor: { - ComputeMaskedBits(I->getOperand(1), Mask, KnownZero, KnownOne, TD, Depth+1); - ComputeMaskedBits(I->getOperand(0), Mask, KnownZero2, KnownOne2, TD, - Depth+1); + ComputeMaskedBits(I->getOperand(1), KnownZero, KnownOne, TD, Depth+1); + ComputeMaskedBits(I->getOperand(0), KnownZero2, KnownOne2, TD, Depth+1); assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); @@ -361,34 +373,30 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask, case Instruction::Mul: { bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap(); ComputeMaskedBitsMul(I->getOperand(0), I->getOperand(1), NSW, - Mask, KnownZero, KnownOne, KnownZero2, KnownOne2, - TD, Depth); + KnownZero, KnownOne, KnownZero2, KnownOne2, TD, Depth); break; } case Instruction::UDiv: { // For the purposes of computing leading zeros we can conservatively // treat a udiv as a logical right shift by the power of 2 known to // be less than the denominator. 
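[Aside: a worked i8 example of the conservative udiv rule above.]

    // If the numerator is known to be < 32 (three leading zero bits) and the
    // denominator has a known-one bit at bit 4 (so it is at least 16), the
    // udiv acts like a right shift by at least 4:
    //   LeadZ = min(8, 3 + 8 - 3 - 1) = 7,
    // i.e. the quotient is at most 1, matching 31 / 16 = 1.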
- APInt AllOnes = APInt::getAllOnesValue(BitWidth); - ComputeMaskedBits(I->getOperand(0), - AllOnes, KnownZero2, KnownOne2, TD, Depth+1); + ComputeMaskedBits(I->getOperand(0), KnownZero2, KnownOne2, TD, Depth+1); unsigned LeadZ = KnownZero2.countLeadingOnes(); KnownOne2.clearAllBits(); KnownZero2.clearAllBits(); - ComputeMaskedBits(I->getOperand(1), - AllOnes, KnownZero2, KnownOne2, TD, Depth+1); + ComputeMaskedBits(I->getOperand(1), KnownZero2, KnownOne2, TD, Depth+1); unsigned RHSUnknownLeadingOnes = KnownOne2.countLeadingZeros(); if (RHSUnknownLeadingOnes != BitWidth) LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSUnknownLeadingOnes - 1); - KnownZero = APInt::getHighBitsSet(BitWidth, LeadZ) & Mask; + KnownZero = APInt::getHighBitsSet(BitWidth, LeadZ); return; } case Instruction::Select: - ComputeMaskedBits(I->getOperand(2), Mask, KnownZero, KnownOne, TD, Depth+1); - ComputeMaskedBits(I->getOperand(1), Mask, KnownZero2, KnownOne2, TD, + ComputeMaskedBits(I->getOperand(2), KnownZero, KnownOne, TD, Depth+1); + ComputeMaskedBits(I->getOperand(1), KnownZero2, KnownOne2, TD, Depth+1); assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); @@ -421,11 +429,9 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask, else SrcBitWidth = SrcTy->getScalarSizeInBits(); - APInt MaskIn = Mask.zextOrTrunc(SrcBitWidth); KnownZero = KnownZero.zextOrTrunc(SrcBitWidth); KnownOne = KnownOne.zextOrTrunc(SrcBitWidth); - ComputeMaskedBits(I->getOperand(0), MaskIn, KnownZero, KnownOne, TD, - Depth+1); + ComputeMaskedBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1); KnownZero = KnownZero.zextOrTrunc(BitWidth); KnownOne = KnownOne.zextOrTrunc(BitWidth); // Any top bits are known to be zero. @@ -439,8 +445,7 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask, // TODO: For now, not handling conversions like: // (bitcast i64 %x to <2 x i32>) !I->getType()->isVectorTy()) { - ComputeMaskedBits(I->getOperand(0), Mask, KnownZero, KnownOne, TD, - Depth+1); + ComputeMaskedBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1); return; } break; @@ -449,11 +454,9 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask, // Compute the bits in the result that are not present in the input. unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits(); - APInt MaskIn = Mask.trunc(SrcBitWidth); KnownZero = KnownZero.trunc(SrcBitWidth); KnownOne = KnownOne.trunc(SrcBitWidth); - ComputeMaskedBits(I->getOperand(0), MaskIn, KnownZero, KnownOne, TD, - Depth+1); + ComputeMaskedBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1); assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); KnownZero = KnownZero.zext(BitWidth); KnownOne = KnownOne.zext(BitWidth); @@ -470,9 +473,7 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask, // (shl X, C1) & C2 == 0 iff (X & C2 >>u C1) == 0 if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) { uint64_t ShiftAmt = SA->getLimitedValue(BitWidth); - APInt Mask2(Mask.lshr(ShiftAmt)); - ComputeMaskedBits(I->getOperand(0), Mask2, KnownZero, KnownOne, TD, - Depth+1); + ComputeMaskedBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1); assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); KnownZero <<= ShiftAmt; KnownOne <<= ShiftAmt; @@ -487,9 +488,7 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask, uint64_t ShiftAmt = SA->getLimitedValue(BitWidth); // Unsigned shift right. 
- APInt Mask2(Mask.shl(ShiftAmt)); - ComputeMaskedBits(I->getOperand(0), Mask2, KnownZero,KnownOne, TD, - Depth+1); + ComputeMaskedBits(I->getOperand(0), KnownZero,KnownOne, TD, Depth+1); assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); KnownZero = APIntOps::lshr(KnownZero, ShiftAmt); KnownOne = APIntOps::lshr(KnownOne, ShiftAmt); @@ -505,9 +504,7 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask, uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1); // Signed shift right. - APInt Mask2(Mask.shl(ShiftAmt)); - ComputeMaskedBits(I->getOperand(0), Mask2, KnownZero, KnownOne, TD, - Depth+1); + ComputeMaskedBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1); assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); KnownZero = APIntOps::lshr(KnownZero, ShiftAmt); KnownOne = APIntOps::lshr(KnownOne, ShiftAmt); @@ -523,15 +520,15 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask, case Instruction::Sub: { bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap(); ComputeMaskedBitsAddSub(false, I->getOperand(0), I->getOperand(1), NSW, - Mask, KnownZero, KnownOne, KnownZero2, KnownOne2, - TD, Depth); + KnownZero, KnownOne, KnownZero2, KnownOne2, TD, + Depth); break; } case Instruction::Add: { bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap(); ComputeMaskedBitsAddSub(true, I->getOperand(0), I->getOperand(1), NSW, - Mask, KnownZero, KnownOne, KnownZero2, KnownOne2, - TD, Depth); + KnownZero, KnownOne, KnownZero2, KnownOne2, TD, + Depth); break; } case Instruction::SRem: @@ -539,9 +536,7 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask, APInt RA = Rem->getValue().abs(); if (RA.isPowerOf2()) { APInt LowBits = RA - 1; - APInt Mask2 = LowBits | APInt::getSignBit(BitWidth); - ComputeMaskedBits(I->getOperand(0), Mask2, KnownZero2, KnownOne2, TD, - Depth+1); + ComputeMaskedBits(I->getOperand(0), KnownZero2, KnownOne2, TD, Depth+1); // The low bits of the first operand are unchanged by the srem. KnownZero = KnownZero2 & LowBits; @@ -557,19 +552,15 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask, if (KnownOne2[BitWidth-1] && ((KnownOne2 & LowBits) != 0)) KnownOne |= ~LowBits; - KnownZero &= Mask; - KnownOne &= Mask; - assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); } } // The sign bit is the LHS's sign bit, except when the result of the // remainder is zero. - if (Mask.isNegative() && KnownZero.isNonNegative()) { - APInt Mask2 = APInt::getSignBit(BitWidth); + if (KnownZero.isNonNegative()) { APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0); - ComputeMaskedBits(I->getOperand(0), Mask2, LHSKnownZero, LHSKnownOne, TD, + ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, TD, Depth+1); // If it's known zero, our sign bit is also zero. if (LHSKnownZero.isNegative()) @@ -582,27 +573,24 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask, APInt RA = Rem->getValue(); if (RA.isPowerOf2()) { APInt LowBits = (RA - 1); - APInt Mask2 = LowBits & Mask; - KnownZero |= ~LowBits & Mask; - ComputeMaskedBits(I->getOperand(0), Mask2, KnownZero, KnownOne, TD, + ComputeMaskedBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1); assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); + KnownZero |= ~LowBits; + KnownOne &= LowBits; break; } } // Since the result is less than or equal to either operand, any leading // zero bits in either operand must also exist in the result. 
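[Aside: a worked example for the remainder rule above.]

    // i8: if the LHS is known < 64 (two leading zeros) and the RHS is known
    // < 16 (four leading zeros), the remainder is both <= the LHS and < the
    // RHS, so it inherits the larger count: at least four leading zeros,
    // i.e. a value below 16.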
- APInt AllOnes = APInt::getAllOnesValue(BitWidth); - ComputeMaskedBits(I->getOperand(0), AllOnes, KnownZero, KnownOne, - TD, Depth+1); - ComputeMaskedBits(I->getOperand(1), AllOnes, KnownZero2, KnownOne2, - TD, Depth+1); + ComputeMaskedBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1); + ComputeMaskedBits(I->getOperand(1), KnownZero2, KnownOne2, TD, Depth+1); unsigned Leaders = std::max(KnownZero.countLeadingOnes(), KnownZero2.countLeadingOnes()); KnownOne.clearAllBits(); - KnownZero = APInt::getHighBitsSet(BitWidth, Leaders) & Mask; + KnownZero = APInt::getHighBitsSet(BitWidth, Leaders); break; } @@ -613,17 +601,15 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask, Align = TD->getABITypeAlignment(AI->getType()->getElementType()); if (Align > 0) - KnownZero = Mask & APInt::getLowBitsSet(BitWidth, - CountTrailingZeros_32(Align)); + KnownZero = APInt::getLowBitsSet(BitWidth, CountTrailingZeros_32(Align)); break; } case Instruction::GetElementPtr: { // Analyze all of the subscripts of this getelementptr instruction // to determine if we can prove known low zero bits. - APInt LocalMask = APInt::getAllOnesValue(BitWidth); APInt LocalKnownZero(BitWidth, 0), LocalKnownOne(BitWidth, 0); - ComputeMaskedBits(I->getOperand(0), LocalMask, - LocalKnownZero, LocalKnownOne, TD, Depth+1); + ComputeMaskedBits(I->getOperand(0), LocalKnownZero, LocalKnownOne, TD, + Depth+1); unsigned TrailZ = LocalKnownZero.countTrailingOnes(); gep_type_iterator GTI = gep_type_begin(I); @@ -643,17 +629,15 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask, if (!IndexedTy->isSized()) return; unsigned GEPOpiBits = Index->getType()->getScalarSizeInBits(); uint64_t TypeSize = TD ? TD->getTypeAllocSize(IndexedTy) : 1; - LocalMask = APInt::getAllOnesValue(GEPOpiBits); LocalKnownZero = LocalKnownOne = APInt(GEPOpiBits, 0); - ComputeMaskedBits(Index, LocalMask, - LocalKnownZero, LocalKnownOne, TD, Depth+1); + ComputeMaskedBits(Index, LocalKnownZero, LocalKnownOne, TD, Depth+1); TrailZ = std::min(TrailZ, unsigned(CountTrailingZeros_64(TypeSize) + LocalKnownZero.countTrailingOnes())); } } - KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ) & Mask; + KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ); break; } case Instruction::PHI: { @@ -688,17 +672,13 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask, break; // Ok, we have a PHI of the form L op= R. Check for low // zero bits. - APInt Mask2 = APInt::getAllOnesValue(BitWidth); - ComputeMaskedBits(R, Mask2, KnownZero2, KnownOne2, TD, Depth+1); - Mask2 = APInt::getLowBitsSet(BitWidth, - KnownZero2.countTrailingOnes()); + ComputeMaskedBits(R, KnownZero2, KnownOne2, TD, Depth+1); // We need to take the minimum number of known bits APInt KnownZero3(KnownZero), KnownOne3(KnownOne); - ComputeMaskedBits(L, Mask2, KnownZero3, KnownOne3, TD, Depth+1); + ComputeMaskedBits(L, KnownZero3, KnownOne3, TD, Depth+1); - KnownZero = Mask & - APInt::getLowBitsSet(BitWidth, + KnownZero = APInt::getLowBitsSet(BitWidth, std::min(KnownZero2.countTrailingOnes(), KnownZero3.countTrailingOnes())); break; @@ -717,8 +697,8 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask, if (P->hasConstantValue() == P) break; - KnownZero = Mask; - KnownOne = Mask; + KnownZero = APInt::getAllOnesValue(BitWidth); + KnownOne = APInt::getAllOnesValue(BitWidth); for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i) { // Skip direct self references. 
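[Aside: an illustrative example of the incoming-value intersection below; the self-reference skip that follows avoids the degenerate cycle.]

    // e.g. %x = phi i8 [ 16, %a ], [ 32, %b ]: each incoming constant has
    // fully known bits, and intersecting them keeps only what both agree on.
    // Bits 0-3 and 6-7 are zero in both, so %x is known to be a multiple of
    // 16 that is at most 48.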
if (P->getIncomingValue(i) == P) continue; @@ -727,8 +707,8 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask, KnownOne2 = APInt(BitWidth, 0); // Recurse, but cap the recursion to one level, because we don't // want to waste time spinning around in loops. - ComputeMaskedBits(P->getIncomingValue(i), KnownZero | KnownOne, - KnownZero2, KnownOne2, TD, MaxDepth-1); + ComputeMaskedBits(P->getIncomingValue(i), KnownZero2, KnownOne2, TD, + MaxDepth-1); KnownZero &= KnownZero2; KnownOne &= KnownOne2; // If all bits have been ruled out, there's no need to check @@ -749,17 +729,17 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask, // If this call is undefined for 0, the result will be less than 2^n. if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext())) LowBits -= 1; - KnownZero = Mask & APInt::getHighBitsSet(BitWidth, BitWidth - LowBits); + KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits); break; } case Intrinsic::ctpop: { unsigned LowBits = Log2_32(BitWidth)+1; - KnownZero = Mask & APInt::getHighBitsSet(BitWidth, BitWidth - LowBits); + KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits); break; } case Intrinsic::x86_sse42_crc32_64_8: case Intrinsic::x86_sse42_crc32_64_64: - KnownZero = Mask & APInt::getHighBitsSet(64, 32); + KnownZero = APInt::getHighBitsSet(64, 32); break; } } @@ -774,21 +754,19 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask, case Intrinsic::uadd_with_overflow: case Intrinsic::sadd_with_overflow: ComputeMaskedBitsAddSub(true, II->getArgOperand(0), - II->getArgOperand(1), false, Mask, - KnownZero, KnownOne, KnownZero2, KnownOne2, - TD, Depth); + II->getArgOperand(1), false, KnownZero, + KnownOne, KnownZero2, KnownOne2, TD, Depth); break; case Intrinsic::usub_with_overflow: case Intrinsic::ssub_with_overflow: ComputeMaskedBitsAddSub(false, II->getArgOperand(0), - II->getArgOperand(1), false, Mask, - KnownZero, KnownOne, KnownZero2, KnownOne2, - TD, Depth); + II->getArgOperand(1), false, KnownZero, + KnownOne, KnownZero2, KnownOne2, TD, Depth); break; case Intrinsic::umul_with_overflow: case Intrinsic::smul_with_overflow: ComputeMaskedBitsMul(II->getArgOperand(0), II->getArgOperand(1), - false, Mask, KnownZero, KnownOne, + false, KnownZero, KnownOne, KnownZero2, KnownOne2, TD, Depth); break; } @@ -809,8 +787,7 @@ void llvm::ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne, } APInt ZeroBits(BitWidth, 0); APInt OneBits(BitWidth, 0); - ComputeMaskedBits(V, APInt::getSignBit(BitWidth), ZeroBits, OneBits, TD, - Depth); + ComputeMaskedBits(V, ZeroBits, OneBits, TD, Depth); KnownOne = OneBits[BitWidth - 1]; KnownZero = ZeroBits[BitWidth - 1]; } @@ -918,7 +895,7 @@ bool llvm::isKnownNonZero(Value *V, const TargetData *TD, unsigned Depth) { APInt KnownZero(BitWidth, 0); APInt KnownOne(BitWidth, 0); - ComputeMaskedBits(X, APInt(BitWidth, 1), KnownZero, KnownOne, TD, Depth); + ComputeMaskedBits(X, KnownZero, KnownOne, TD, Depth); if (KnownOne[0]) return true; } @@ -960,12 +937,12 @@ bool llvm::isKnownNonZero(Value *V, const TargetData *TD, unsigned Depth) { APInt Mask = APInt::getSignedMaxValue(BitWidth); // The sign bit of X is set. If some other bit is set then X is not equal // to INT_MIN. - ComputeMaskedBits(X, Mask, KnownZero, KnownOne, TD, Depth); + ComputeMaskedBits(X, KnownZero, KnownOne, TD, Depth); if ((KnownOne & Mask) != 0) return true; // The sign bit of Y is set. If some other bit is set then Y is not equal // to INT_MIN. 
- ComputeMaskedBits(Y, Mask, KnownZero, KnownOne, TD, Depth); + ComputeMaskedBits(Y, KnownZero, KnownOne, TD, Depth); if ((KnownOne & Mask) != 0) return true; } @@ -995,8 +972,7 @@ bool llvm::isKnownNonZero(Value *V, const TargetData *TD, unsigned Depth) { if (!BitWidth) return false; APInt KnownZero(BitWidth, 0); APInt KnownOne(BitWidth, 0); - ComputeMaskedBits(V, APInt::getAllOnesValue(BitWidth), KnownZero, KnownOne, - TD, Depth); + ComputeMaskedBits(V, KnownZero, KnownOne, TD, Depth); return KnownOne != 0; } @@ -1012,7 +988,7 @@ bool llvm::isKnownNonZero(Value *V, const TargetData *TD, unsigned Depth) { bool llvm::MaskedValueIsZero(Value *V, const APInt &Mask, const TargetData *TD, unsigned Depth) { APInt KnownZero(Mask.getBitWidth(), 0), KnownOne(Mask.getBitWidth(), 0); - ComputeMaskedBits(V, Mask, KnownZero, KnownOne, TD, Depth); + ComputeMaskedBits(V, KnownZero, KnownOne, TD, Depth); assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); return (KnownZero & Mask) == Mask; } @@ -1103,13 +1079,11 @@ unsigned llvm::ComputeNumSignBits(Value *V, const TargetData *TD, if (ConstantInt *CRHS = dyn_cast<ConstantInt>(U->getOperand(1))) if (CRHS->isAllOnesValue()) { APInt KnownZero(TyBits, 0), KnownOne(TyBits, 0); - APInt Mask = APInt::getAllOnesValue(TyBits); - ComputeMaskedBits(U->getOperand(0), Mask, KnownZero, KnownOne, TD, - Depth+1); + ComputeMaskedBits(U->getOperand(0), KnownZero, KnownOne, TD, Depth+1); // If the input is known to be 0 or 1, the output is 0/-1, which is all // sign bits set. - if ((KnownZero | APInt(TyBits, 1)) == Mask) + if ((KnownZero | APInt(TyBits, 1)).isAllOnesValue()) return TyBits; // If we are subtracting one from a positive number, there is no carry @@ -1130,12 +1104,10 @@ unsigned llvm::ComputeNumSignBits(Value *V, const TargetData *TD, if (ConstantInt *CLHS = dyn_cast<ConstantInt>(U->getOperand(0))) if (CLHS->isNullValue()) { APInt KnownZero(TyBits, 0), KnownOne(TyBits, 0); - APInt Mask = APInt::getAllOnesValue(TyBits); - ComputeMaskedBits(U->getOperand(1), Mask, KnownZero, KnownOne, - TD, Depth+1); + ComputeMaskedBits(U->getOperand(1), KnownZero, KnownOne, TD, Depth+1); // If the input is known to be 0 or 1, the output is 0/-1, which is all // sign bits set. - if ((KnownZero | APInt(TyBits, 1)) == Mask) + if ((KnownZero | APInt(TyBits, 1)).isAllOnesValue()) return TyBits; // If the input is known to be positive (the sign bit is known clear), @@ -1177,8 +1149,8 @@ unsigned llvm::ComputeNumSignBits(Value *V, const TargetData *TD, // Finally, if we can prove that the top bits of the result are 0's or 1's, // use this information. 
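[Aside: a worked example of the known-bits fallback below.]

    // i16: if known bits prove the top nine bits are zero (say the value was
    // zero-extended from i7), those nine identical top bits are all sign
    // bits, so ComputeNumSignBits can report at least 9 here.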
APInt KnownZero(TyBits, 0), KnownOne(TyBits, 0); - APInt Mask = APInt::getAllOnesValue(TyBits); - ComputeMaskedBits(V, Mask, KnownZero, KnownOne, TD, Depth); + APInt Mask; + ComputeMaskedBits(V, KnownZero, KnownOne, TD, Depth); if (KnownZero.isNegative()) { // sign bit is 0 Mask = KnownZero; @@ -1870,8 +1842,7 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V, return false; APInt KnownZero(BitWidth, 0); APInt KnownOne(BitWidth, 0); - ComputeMaskedBits(Op, APInt::getAllOnesValue(BitWidth), - KnownZero, KnownOne, TD); + ComputeMaskedBits(Op, KnownZero, KnownOne, TD); return !!KnownZero; } case Instruction::Load: { @@ -1883,6 +1854,14 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V, case Instruction::Call: { if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { switch (II->getIntrinsicID()) { + // These synthetic intrinsics have no side-effects, and just mark + // information about their operands. + // FIXME: There are other no-op synthetic instructions that potentially + // should be considered at least *safe* to speculate... + case Intrinsic::dbg_declare: + case Intrinsic::dbg_value: + return true; + case Intrinsic::bswap: case Intrinsic::ctlz: case Intrinsic::ctpop: diff --git a/lib/Archive/ArchiveReader.cpp b/lib/Archive/ArchiveReader.cpp index a78c0f5..68873e2 100644 --- a/lib/Archive/ArchiveReader.cpp +++ b/lib/Archive/ArchiveReader.cpp @@ -16,6 +16,7 @@ #include "llvm/Bitcode/ReaderWriter.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Module.h" +#include <cstdio> #include <cstdlib> #include <memory> using namespace llvm; diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp index 14e14c3..00874d4 100644 --- a/lib/CodeGen/Analysis.cpp +++ b/lib/CodeGen/Analysis.cpp @@ -290,7 +290,7 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS, Attributes CalleeRetAttr, } bool llvm::isInTailCallPosition(SelectionDAG &DAG, SDNode *Node, - const TargetLowering &TLI) { + SDValue &Chain, const TargetLowering &TLI) { const Function *F = DAG.getMachineFunction().getFunction(); // Conservatively require the attributes of the call to match those of @@ -304,5 +304,5 @@ bool llvm::isInTailCallPosition(SelectionDAG &DAG, SDNode *Node, return false; // Check if the only use is a function return node. 
- return TLI.isUsedByReturnOnly(Node); + return TLI.isUsedByReturnOnly(Node, Chain); } diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index dd3fb3b..f6cde98 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1781,7 +1781,9 @@ static void EmitGlobalConstantFP(const ConstantFP *CFP, unsigned AddrSpace, if (CFP->getType()->isFloatTy()) { if (AP.isVerbose()) { float Val = CFP->getValueAPF().convertToFloat(); - AP.OutStreamer.GetCommentOS() << "float " << Val << '\n'; + uint64_t IntVal = CFP->getValueAPF().bitcastToAPInt().getZExtValue(); + AP.OutStreamer.GetCommentOS() << "float " << Val << '\n' + << " (" << format("0x%x", IntVal) << ")\n"; } uint64_t Val = CFP->getValueAPF().bitcastToAPInt().getZExtValue(); AP.OutStreamer.EmitIntValue(Val, 4, AddrSpace); @@ -1793,7 +1795,9 @@ static void EmitGlobalConstantFP(const ConstantFP *CFP, unsigned AddrSpace, if (CFP->getType()->isDoubleTy()) { if (AP.isVerbose()) { double Val = CFP->getValueAPF().convertToDouble(); - AP.OutStreamer.GetCommentOS() << "double " << Val << '\n'; + uint64_t IntVal = CFP->getValueAPF().bitcastToAPInt().getZExtValue(); + AP.OutStreamer.GetCommentOS() << "double " << Val << '\n' + << " (" << format("0x%lx", IntVal) << ")\n"; } uint64_t Val = CFP->getValueAPF().bitcastToAPInt().getZExtValue(); diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp index 89e6cd1..e9e9335 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp @@ -329,7 +329,11 @@ void AsmPrinter::EmitInlineAsm(const MachineInstr *MI) const { OpNo += InlineAsm::getNumOperandRegisters(OpFlags) + 1; } - if (OpNo >= MI->getNumOperands()) { + // We may have a location metadata attached to the end of the + // instruction, and should never see metadata at any + // other point while processing. It's an error if so. + if (OpNo >= MI->getNumOperands() || + MI->getOperand(OpNo).isMetadata()) { Error = true; } else { unsigned OpFlags = MI->getOperand(OpNo).getImm(); diff --git a/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp b/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp index 644eaad..454a923 100644 --- a/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp @@ -11,14 +11,16 @@ // //===----------------------------------------------------------------------===// +#include "DwarfAccelTable.h" +#include "DwarfDebug.h" +#include "DIE.h" +#include "llvm/ADT/Twine.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/Debug.h" -#include "DwarfAccelTable.h" -#include "DwarfDebug.h" -#include "DIE.h" using namespace llvm; @@ -34,44 +36,28 @@ const char *DwarfAccelTable::Atom::AtomTypeString(enum AtomType AT) { llvm_unreachable("invalid AtomType!"); } -// The general case would need to have a less hard coded size for the -// length of the HeaderData, however, if we're constructing based on a -// single Atom then we know it will always be: 4 + 4 + 2 + 2. -DwarfAccelTable::DwarfAccelTable(DwarfAccelTable::Atom atom) : - Header(12), - HeaderData(atom) { -} - // The length of the header data is always going to be 4 + 4 + 4*NumAtoms.
-DwarfAccelTable::DwarfAccelTable(std::vector<DwarfAccelTable::Atom> &atomList) : +DwarfAccelTable::DwarfAccelTable(ArrayRef<DwarfAccelTable::Atom> atomList) : Header(8 + (atomList.size() * 4)), - HeaderData(atomList) { -} + HeaderData(atomList), + Entries(Allocator) { } -DwarfAccelTable::~DwarfAccelTable() { - for (size_t i = 0, e = Data.size(); i < e; ++i) - delete Data[i]; - for (StringMap<DataArray>::iterator - EI = Entries.begin(), EE = Entries.end(); EI != EE; ++EI) - for (DataArray::iterator DI = EI->second.begin(), - DE = EI->second.end(); DI != DE; ++DI) - delete (*DI); -} +DwarfAccelTable::~DwarfAccelTable() { } void DwarfAccelTable::AddName(StringRef Name, DIE* die, char Flags) { + assert(Data.empty() && "Already finalized!"); // If the string is in the list already then add this die to the list // otherwise add a new one. DataArray &DIEs = Entries[Name]; - DIEs.push_back(new HashDataContents(die, Flags)); + DIEs.push_back(new (Allocator) HashDataContents(die, Flags)); } void DwarfAccelTable::ComputeBucketCount(void) { // First get the number of unique hashes. - std::vector<uint32_t> uniques; - uniques.resize(Data.size()); + std::vector<uint32_t> uniques(Data.size()); for (size_t i = 0, e = Data.size(); i < e; ++i) uniques[i] = Data[i]->HashValue; - std::stable_sort(uniques.begin(), uniques.end()); + array_pod_sort(uniques.begin(), uniques.end()); std::vector<uint32_t>::iterator p = std::unique(uniques.begin(), uniques.end()); uint32_t num = std::distance(uniques.begin(), p); @@ -84,31 +70,23 @@ void DwarfAccelTable::ComputeBucketCount(void) { Header.hashes_count = num; } -namespace { - // DIESorter - comparison predicate that sorts DIEs by their offset. - struct DIESorter { - bool operator()(const struct DwarfAccelTable::HashDataContents *A, - const struct DwarfAccelTable::HashDataContents *B) const { - return A->Die->getOffset() < B->Die->getOffset(); - } - }; +// compareDIEs - comparison predicate that sorts DIEs by their offset. +static bool compareDIEs(const DwarfAccelTable::HashDataContents *A, + const DwarfAccelTable::HashDataContents *B) { + return A->Die->getOffset() < B->Die->getOffset(); } void DwarfAccelTable::FinalizeTable(AsmPrinter *Asm, const char *Prefix) { // Create the individual hash data outputs. for (StringMap<DataArray>::iterator EI = Entries.begin(), EE = Entries.end(); EI != EE; ++EI) { - struct HashData *Entry = new HashData((*EI).getKeyData()); // Unique the entries. 
- std::stable_sort(EI->second.begin(), EI->second.end(), DIESorter()); + std::stable_sort(EI->second.begin(), EI->second.end(), compareDIEs); EI->second.erase(std::unique(EI->second.begin(), EI->second.end()), EI->second.end()); - for (DataArray::const_iterator DI = EI->second.begin(), - DE = EI->second.end(); - DI != DE; ++DI) - Entry->addData((*DI)); + HashData *Entry = new (Allocator) HashData(EI->getKey(), EI->second); Data.push_back(Entry); } @@ -215,7 +193,7 @@ void DwarfAccelTable::EmitData(AsmPrinter *Asm, DwarfDebug *D) { D->getStringPool()); Asm->OutStreamer.AddComment("Num DIEs"); Asm->EmitInt32((*HI)->Data.size()); - for (std::vector<struct HashDataContents*>::const_iterator + for (ArrayRef<HashDataContents*>::const_iterator DI = (*HI)->Data.begin(), DE = (*HI)->Data.end(); DI != DE; ++DI) { // Emit the DIE offset diff --git a/lib/CodeGen/AsmPrinter/DwarfAccelTable.h b/lib/CodeGen/AsmPrinter/DwarfAccelTable.h index 2278d4c..963b8cd 100644 --- a/lib/CodeGen/AsmPrinter/DwarfAccelTable.h +++ b/lib/CodeGen/AsmPrinter/DwarfAccelTable.h @@ -15,6 +15,7 @@ #define CODEGEN_ASMPRINTER_DWARFACCELTABLE_H__ #include "llvm/ADT/StringMap.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/Dwarf.h" #include "llvm/Support/DataTypes.h" @@ -164,22 +165,12 @@ public: private: struct TableHeaderData { - uint32_t die_offset_base; - std::vector<Atom> Atoms; + SmallVector<Atom, 1> Atoms; + + TableHeaderData(ArrayRef<Atom> AtomList, uint32_t offset = 0) + : die_offset_base(offset), Atoms(AtomList.begin(), AtomList.end()) { } - TableHeaderData(std::vector<DwarfAccelTable::Atom> &AtomList, - uint32_t offset = 0) : - die_offset_base(offset) { - for (size_t i = 0, e = AtomList.size(); i != e; ++i) - Atoms.push_back(AtomList[i]); - } - - TableHeaderData(DwarfAccelTable::Atom Atom, uint32_t offset = 0) - : die_offset_base(offset) { - Atoms.push_back(Atom); - } - #ifndef NDEBUG void print (raw_ostream &O) { O << "die_offset_base: " << die_offset_base << "\n"; @@ -221,11 +212,11 @@ private: StringRef Str; uint32_t HashValue; MCSymbol *Sym; - std::vector<struct HashDataContents*> Data; // offsets - HashData(StringRef S) : Str(S) { + ArrayRef<HashDataContents*> Data; // offsets + HashData(StringRef S, ArrayRef<HashDataContents*> Data) + : Str(S), Data(Data) { HashValue = DwarfAccelTable::HashDJB(S); } - void addData(struct HashDataContents *Datum) { Data.push_back(Datum); } #ifndef NDEBUG void print(raw_ostream &O) { O << "Name: " << Str << "\n"; @@ -255,15 +246,18 @@ private: void EmitHashes(AsmPrinter *); void EmitOffsets(AsmPrinter *, MCSymbol *); void EmitData(AsmPrinter *, DwarfDebug *D); - + + // Allocator for HashData and HashDataContents. 
+ BumpPtrAllocator Allocator; + // Output Variables TableHeader Header; TableHeaderData HeaderData; std::vector<HashData*> Data; // String Data - typedef std::vector<struct HashDataContents*> DataArray; - typedef StringMap<DataArray> StringEntries; + typedef std::vector<HashDataContents*> DataArray; + typedef StringMap<DataArray, BumpPtrAllocator&> StringEntries; StringEntries Entries; // Buckets/Hashes/Offsets @@ -274,8 +268,7 @@ private: // Public Implementation public: - DwarfAccelTable(DwarfAccelTable::Atom); - DwarfAccelTable(std::vector<DwarfAccelTable::Atom> &); + DwarfAccelTable(ArrayRef<DwarfAccelTable::Atom>); ~DwarfAccelTable(); void AddName(StringRef, DIE*, char = 0); void FinalizeTable(AsmPrinter *, const char *); diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index 3b383f6..cc5b642 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -188,6 +188,24 @@ void CompileUnit::addSourceLine(DIE *Die, DIType Ty) { /// addSourceLine - Add location information to specified debug information /// entry. +void CompileUnit::addSourceLine(DIE *Die, DIObjCProperty Ty) { + // Verify type. + if (!Ty.Verify()) + return; + + unsigned Line = Ty.getLineNumber(); + if (Line == 0) + return; + DIFile File = Ty.getFile(); + unsigned FileID = DD->GetOrCreateSourceID(File.getFilename(), + File.getDirectory()); + assert(FileID && "Invalid file id"); + addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID); + addUInt(Die, dwarf::DW_AT_decl_line, 0, Line); +} + +/// addSourceLine - Add location information to specified debug information +/// entry. void CompileUnit::addSourceLine(DIE *Die, DINameSpace NS) { // Verify namespace. if (!NS.Verify()) @@ -628,7 +646,8 @@ DIE *CompileUnit::getOrCreateTypeDIE(const MDNode *TyNode) { } /// addType - Add a new type attribute to the specified entity. -void CompileUnit::addType(DIE *Entity, DIType Ty) { +void CompileUnit::addType(DIE *Entity, DIType Ty, + unsigned Attribute) { if (!Ty.Verify()) return; @@ -636,7 +655,7 @@ void CompileUnit::addType(DIE *Entity, DIType Ty) { DIEEntry *Entry = getDIEEntry(Ty); // If it exists then use the existing value. if (Entry) { - Entity->addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, Entry); + Entity->addValue(Attribute, dwarf::DW_FORM_ref4, Entry); return; } @@ -646,7 +665,7 @@ void CompileUnit::addType(DIE *Entity, DIType Ty) { // Set up proxy. Entry = createDIEEntry(Buffer); insertDIEEntry(Ty, Entry); - Entity->addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, Entry); + Entity->addValue(Attribute, dwarf::DW_FORM_ref4, Entry); // If this is a complete composite type then include it in the // list of global types. 
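The reworked addType above takes the DWARF attribute as a parameter (defaulting to DW_AT_type) so that the new DW_TAG_friend handling in the next hunk can emit a DW_AT_friend reference through the same path. A minimal standalone sketch of that shape, with toy stand-ins for the DIE and attribute machinery (the real LLVM classes are richer):

    #include <map>

    // Attribute codes as in the DWARF spec; only these two are modeled here.
    enum : unsigned { DW_AT_friend = 0x41, DW_AT_type = 0x49 };

    struct DIE {
      std::map<unsigned, const DIE *> Refs; // attribute -> referenced DIE
    };

    // The default argument keeps existing DW_AT_type callers unchanged,
    // while the friend path passes DW_AT_friend explicitly.
    void addType(DIE &Entity, const DIE &Ty, unsigned Attribute = DW_AT_type) {
      Entity.Refs[Attribute] = &Ty;
    }

    int main() {
      DIE MemberDie, FriendDie, SomeType;
      addType(MemberDie, SomeType);               // ordinary type reference
      addType(FriendDie, SomeType, DW_AT_friend); // friend reference
      return 0;
    }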
@@ -826,13 +845,20 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) { addUInt(ElemDie, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1); addUInt(ElemDie, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1); addSourceLine(ElemDie, DV); - } else if (Element.isDerivedType()) - ElemDie = createMemberDIE(DIDerivedType(Element)); - else if (Element.isObjCProperty()) { + } else if (Element.isDerivedType()) { + DIDerivedType DDTy(Element); + if (DDTy.getTag() == dwarf::DW_TAG_friend) { + ElemDie = new DIE(dwarf::DW_TAG_friend); + addType(ElemDie, DDTy.getTypeDerivedFrom(), dwarf::DW_AT_friend); + } else + ElemDie = createMemberDIE(DIDerivedType(Element)); + } else if (Element.isObjCProperty()) { DIObjCProperty Property(Element); ElemDie = new DIE(Property.getTag()); StringRef PropertyName = Property.getObjCPropertyName(); addString(ElemDie, dwarf::DW_AT_APPLE_property_name, PropertyName); + addType(ElemDie, Property.getType()); + addSourceLine(ElemDie, Property); StringRef GetterName = Property.getObjCPropertyGetterName(); if (!GetterName.empty()) addString(ElemDie, dwarf::DW_AT_APPLE_property_getter, GetterName); @@ -1006,9 +1032,10 @@ DIE *CompileUnit::getOrCreateSubprogramDIE(DISubprogram SP) { // Add function template parameters. addTemplateParams(*SPDie, SP.getTemplateParams()); - // Unfortunately this code needs to stay here to work around - // a bug in older gdbs that requires the linkage name to resolve - // multiple template functions. + // Unfortunately this code needs to stay here instead of below the + // AT_specification code in order to work around a bug in older + // gdbs that requires the linkage name to resolve multiple template + // functions. StringRef LinkageName = SP.getLinkageName(); if (!LinkageName.empty()) addString(SPDie, dwarf::DW_AT_MIPS_linkage_name, diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h index 4e63c3f..45e407e 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h @@ -213,6 +213,7 @@ public: void addSourceLine(DIE *Die, DISubprogram SP); void addSourceLine(DIE *Die, DIType Ty); void addSourceLine(DIE *Die, DINameSpace NS); + void addSourceLine(DIE *Die, DIObjCProperty Ty); /// addAddress - Add an address attribute to a die based on the location /// provided. @@ -260,8 +261,10 @@ public: /// addToContextOwner - Add Die into the list of its context owner's children. void addToContextOwner(DIE *Die, DIDescriptor Context); - /// addType - Add a new type attribute to the specified entity. - void addType(DIE *Entity, DIType Ty); + /// addType - Add a new type attribute to the specified entity. This takes + /// an attribute parameter because DW_AT_friend attributes are also + /// type references. + void addType(DIE *Entity, DIType Ty, unsigned Attribute = dwarf::DW_AT_type); /// getOrCreateNameSpace - Create a DIE for DINameSpace.
DIE *getOrCreateNameSpace(DINameSpace NS); diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 388cef4..cb78878 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -19,6 +19,7 @@ #include "llvm/Constants.h" #include "llvm/Module.h" #include "llvm/Instructions.h" +#include "llvm/ADT/Triple.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/MC/MCAsmInfo.h" @@ -133,6 +134,11 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M) DwarfStrSectionSym = TextSectionSym = 0; DwarfDebugRangeSectionSym = DwarfDebugLocSectionSym = 0; FunctionBeginSym = FunctionEndSym = 0; + + // Turn on accelerator tables for Darwin. + if (Triple(M->getTargetTriple()).isOSDarwin()) + DwarfAccelTables = true; + { NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled); beginModule(M); @@ -438,7 +444,8 @@ DIE *DwarfDebug::constructInlinedScopeDIE(CompileUnit *TheCU, I->second.push_back(std::make_pair(StartLabel, ScopeDIE)); DILocation DL(Scope->getInlinedAt()); - TheCU->addUInt(ScopeDIE, dwarf::DW_AT_call_file, 0, TheCU->getID()); + TheCU->addUInt(ScopeDIE, dwarf::DW_AT_call_file, 0, + GetOrCreateSourceID(DL.getFilename(), DL.getDirectory())); TheCU->addUInt(ScopeDIE, dwarf::DW_AT_call_line, 0, DL.getLineNumber()); // Add name to the name table, we do this here because we're guaranteed @@ -554,9 +561,9 @@ CompileUnit *DwarfDebug::constructCompileUnit(const MDNode *N) { NewCU->addUInt(Die, dwarf::DW_AT_language, dwarf::DW_FORM_data2, DIUnit.getLanguage()); NewCU->addString(Die, dwarf::DW_AT_name, FN); - // Use DW_AT_entry_pc instead of DW_AT_low_pc/DW_AT_high_pc pair. This - // simplifies debug range entries. - NewCU->addUInt(Die, dwarf::DW_AT_entry_pc, dwarf::DW_FORM_addr, 0); + // 2.17.1 requires that we use DW_AT_low_pc for a single entry point + // into an entity. + NewCU->addUInt(Die, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, 0); // DW_AT_stmt_list is a offset of line number information for this // compile unit in debug_line section. if (Asm->MAI->doesDwarfRequireRelocationForSectionOffset()) @@ -1086,12 +1093,15 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) { if (!MI->isDebugValue()) { DebugLoc DL = MI->getDebugLoc(); if (DL != PrevInstLoc && (!DL.isUnknown() || UnknownLocations)) { - unsigned Flags = DWARF2_FLAG_IS_STMT; + unsigned Flags = 0; PrevInstLoc = DL; if (DL == PrologEndLoc) { Flags |= DWARF2_FLAG_PROLOGUE_END; PrologEndLoc = DebugLoc(); } + if (PrologEndLoc.isUnknown()) + Flags |= DWARF2_FLAG_IS_STMT; + if (!DL.isUnknown()) { const MDNode *Scope = DL.getScope(Asm->MF->getFunction()->getContext()); recordSourceLine(DL.getLine(), DL.getCol(), Scope, Flags); @@ -1186,12 +1196,19 @@ static MDNode *getScopeNode(DebugLoc DL, const LLVMContext &Ctx) { } /// getFnDebugLoc - Walk up the scope chain of given debug loc and find -/// line number info for the function. +/// line number info for the function. static DebugLoc getFnDebugLoc(DebugLoc DL, const LLVMContext &Ctx) { const MDNode *Scope = getScopeNode(DL, Ctx); DISubprogram SP = getDISubprogram(Scope); - if (SP.Verify()) - return DebugLoc::get(SP.getLineNumber(), 0, SP); + if (SP.Verify()) { + // Check for number of operands since the compatibility is + // cheap here. 
+ if (SP->getNumOperands() > 19) + return DebugLoc::get(SP.getScopeLineNumber(), 0, SP); + else + return DebugLoc::get(SP.getLineNumber(), 0, SP); + } + return DebugLoc(); } @@ -1364,7 +1381,7 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { MF->getFunction()->getContext()); recordSourceLine(FnStartDL.getLine(), FnStartDL.getCol(), FnStartDL.getScope(MF->getFunction()->getContext()), - DWARF2_FLAG_IS_STMT); + 0); } } diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp index f57f4a8..ef1d2ba 100644 --- a/lib/CodeGen/BranchFolding.cpp +++ b/lib/CodeGen/BranchFolding.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" @@ -183,8 +184,14 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF, TII = tii; TRI = tri; MMI = mmi; - - RS = TRI->requiresRegisterScavenging(MF) ? new RegScavenger() : NULL; + RS = NULL; + + // Use a RegScavenger to help update liveness when required. + MachineRegisterInfo &MRI = MF.getRegInfo(); + if (MRI.tracksLiveness() && TRI->requiresRegisterScavenging(MF)) + RS = new RegScavenger(); + else + MRI.invalidateLiveness(); // Fix CFG. The later algorithms expect it to be right. bool MadeChange = false; diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp index d596d8b..d5ea666 100644 --- a/lib/CodeGen/InlineSpiller.cpp +++ b/lib/CodeGen/InlineSpiller.cpp @@ -14,12 +14,12 @@ #define DEBUG_TYPE "regalloc" #include "Spiller.h" -#include "LiveRangeEdit.h" #include "VirtRegMap.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/TinyPtrVector.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveRangeEdit.h" #include "llvm/CodeGen/LiveStackAnalysis.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineInstrBundle.h" @@ -655,7 +655,7 @@ void InlineSpiller::analyzeSiblingValues() { if (OrigVNI->def != VNI->def) DefMI = traceSiblingValue(Reg, VNI, OrigVNI); } - if (DefMI && Edit->checkRematerializable(VNI, DefMI, TII, AA)) { + if (DefMI && Edit->checkRematerializable(VNI, DefMI, AA)) { DEBUG(dbgs() << "Value " << PrintReg(Reg) << ':' << VNI->id << '@' << VNI->def << " may remat from " << *DefMI); } @@ -856,7 +856,7 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, SibValueMap::const_iterator SibI = SibValues.find(ParentVNI); if (SibI != SibValues.end()) RM.OrigMI = SibI->second.DefMI; - if (!Edit->canRematerializeAt(RM, UseIdx, false, LIS)) { + if (!Edit->canRematerializeAt(RM, UseIdx, false)) { markValueUsed(&VirtReg, ParentVNI); DEBUG(dbgs() << "\tcannot remat for " << UseIdx << '\t' << *MI); return false; @@ -883,12 +883,12 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, } // Alocate a new register for the remat. - LiveInterval &NewLI = Edit->createFrom(Original, LIS, VRM); + LiveInterval &NewLI = Edit->createFrom(Original); NewLI.markNotSpillable(); // Finally we can rematerialize OrigMI before MI. SlotIndex DefIdx = Edit->rematerializeAt(*MI->getParent(), MI, NewLI.reg, RM, - LIS, TII, TRI); + TRI); DEBUG(dbgs() << "\tremat: " << DefIdx << '\t' << *LIS.getInstructionFromIndex(DefIdx)); @@ -913,7 +913,7 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, /// and trim the live ranges after. 
void InlineSpiller::reMaterializeAll() { // analyzeSiblingValues has already tested all relevant defining instructions. - if (!Edit->anyRematerializable(LIS, TII, AA)) + if (!Edit->anyRematerializable(AA)) return; UsedValues.clear(); @@ -954,7 +954,7 @@ void InlineSpiller::reMaterializeAll() { if (DeadDefs.empty()) return; DEBUG(dbgs() << "Remat created " << DeadDefs.size() << " dead defs.\n"); - Edit->eliminateDeadDefs(DeadDefs, LIS, VRM, TII, RegsToSpill); + Edit->eliminateDeadDefs(DeadDefs, RegsToSpill); // Get rid of deleted and empty intervals. for (unsigned i = RegsToSpill.size(); i != 0; --i) { @@ -966,7 +966,7 @@ void InlineSpiller::reMaterializeAll() { LiveInterval &LI = LIS.getInterval(Reg); if (!LI.empty()) continue; - Edit->eraseVirtReg(Reg, LIS); + Edit->eraseVirtReg(Reg); RegsToSpill.erase(RegsToSpill.begin() + (i - 1)); } DEBUG(dbgs() << RegsToSpill.size() << " registers to spill after remat.\n"); @@ -1181,7 +1181,7 @@ void InlineSpiller::spillAroundUses(unsigned Reg) { // Allocate interval around instruction. // FIXME: Infer regclass from instruction alone. - LiveInterval &NewLI = Edit->createFrom(Reg, LIS, VRM); + LiveInterval &NewLI = Edit->createFrom(Reg); NewLI.markNotSpillable(); if (RI.Reads) @@ -1244,7 +1244,7 @@ void InlineSpiller::spillAll() { // Hoisted spills may cause dead code. if (!DeadDefs.empty()) { DEBUG(dbgs() << "Eliminating " << DeadDefs.size() << " dead defs\n"); - Edit->eliminateDeadDefs(DeadDefs, LIS, VRM, TII, RegsToSpill); + Edit->eliminateDeadDefs(DeadDefs, RegsToSpill); } // Finally delete the SnippetCopies. @@ -1260,7 +1260,7 @@ void InlineSpiller::spillAll() { // Delete all spilled registers. for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i) - Edit->eraseVirtReg(RegsToSpill[i], LIS); + Edit->eraseVirtReg(RegsToSpill[i]); } void InlineSpiller::spill(LiveRangeEdit &edit) { @@ -1289,5 +1289,5 @@ void InlineSpiller::spill(LiveRangeEdit &edit) { if (!RegsToSpill.empty()) spillAll(); - Edit->calculateRegClassAndHint(MF, LIS, Loops); + Edit->calculateRegClassAndHint(MF, Loops); } diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp index 97e6547..a1f479a 100644 --- a/lib/CodeGen/LLVMTargetMachine.cpp +++ b/lib/CodeGen/LLVMTargetMachine.cpp @@ -172,6 +172,7 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM, case CGFT_AssemblyFile: { MCInstPrinter *InstPrinter = getTarget().createMCInstPrinter(MAI.getAssemblerDialect(), MAI, + *getInstrInfo(), Context->getRegisterInfo(), STI); // Create a code emitter if asked to show the encoding. 
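The InlineSpiller hunks above keep deleting LIS, VRM, and TII arguments at each call site; the corresponding LiveRangeEdit change further down moves those shared analyses into the object at construction time. A toy sketch of that refactoring pattern, using illustrative stand-ins rather than the LLVM types:

    #include <iostream>

    struct LiveIntervals { /* analysis results */ };
    struct TargetInstrInfo { /* target hooks */ };

    // Before the change, every method took the shared analyses as extra
    // parameters; after it, the editor captures them once.
    class LiveRangeEditLike {
      LiveIntervals &LIS;
      const TargetInstrInfo &TII;
    public:
      LiveRangeEditLike(LiveIntervals &LIS, const TargetInstrInfo &TII)
          : LIS(LIS), TII(TII) {}
      void eraseVirtReg(unsigned Reg) {
        (void)LIS; // would consult the captured analysis here
        std::cout << "erasing vreg " << Reg << "\n";
      }
    };

    int main() {
      LiveIntervals LIS;
      TargetInstrInfo TII;
      LiveRangeEditLike Edit(LIS, TII);
      Edit.eraseVirtReg(42); // call sites no longer thread LIS through
    }

The payoff is visible throughout the diff: every Edit-> call shrinks, and the risk of passing mismatched analyses to different calls on the same edit disappears.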
diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp index 3ade660..934cc12 100644 --- a/lib/CodeGen/LiveIntervalAnalysis.cpp +++ b/lib/CodeGen/LiveIntervalAnalysis.cpp @@ -1068,9 +1068,9 @@ public: #ifndef NDEBUG LIValidator validator; - std::for_each(Entering.begin(), Entering.end(), validator); - std::for_each(Internal.begin(), Internal.end(), validator); - std::for_each(Exiting.begin(), Exiting.end(), validator); + validator = std::for_each(Entering.begin(), Entering.end(), validator); + validator = std::for_each(Internal.begin(), Internal.end(), validator); + validator = std::for_each(Exiting.begin(), Exiting.end(), validator); assert(validator.rangesOk() && "moveAllOperandsFrom broke liveness."); #endif @@ -1115,9 +1115,9 @@ public: #ifndef NDEBUG LIValidator validator; - std::for_each(Entering.begin(), Entering.end(), validator); - std::for_each(Internal.begin(), Internal.end(), validator); - std::for_each(Exiting.begin(), Exiting.end(), validator); + validator = std::for_each(Entering.begin(), Entering.end(), validator); + validator = std::for_each(Internal.begin(), Internal.end(), validator); + validator = std::for_each(Exiting.begin(), Exiting.end(), validator); assert(validator.rangesOk() && "moveAllOperandsInto broke liveness."); #endif } diff --git a/lib/CodeGen/LiveRangeEdit.cpp b/lib/CodeGen/LiveRangeEdit.cpp index f9b93d5..695f536 100644 --- a/lib/CodeGen/LiveRangeEdit.cpp +++ b/lib/CodeGen/LiveRangeEdit.cpp @@ -12,12 +12,12 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "regalloc" -#include "LiveRangeEdit.h" #include "VirtRegMap.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/CalcSpillWeights.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveRangeEdit.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Support/Debug.h" @@ -31,13 +31,12 @@ STATISTIC(NumFracRanges, "Number of live ranges fractured by DCE"); void LiveRangeEdit::Delegate::anchor() { } -LiveInterval &LiveRangeEdit::createFrom(unsigned OldReg, - LiveIntervals &LIS, - VirtRegMap &VRM) { - MachineRegisterInfo &MRI = VRM.getRegInfo(); +LiveInterval &LiveRangeEdit::createFrom(unsigned OldReg) { unsigned VReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg)); - VRM.grow(); - VRM.setIsSplitFromReg(VReg, VRM.getOriginal(OldReg)); + if (VRM) { + VRM->grow(); + VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg)); + } LiveInterval &LI = LIS.getOrCreateInterval(VReg); newRegs_.push_back(&LI); return LI; @@ -45,37 +44,32 @@ LiveInterval &LiveRangeEdit::createFrom(unsigned OldReg, bool LiveRangeEdit::checkRematerializable(VNInfo *VNI, const MachineInstr *DefMI, - const TargetInstrInfo &tii, AliasAnalysis *aa) { assert(DefMI && "Missing instruction"); scannedRemattable_ = true; - if (!tii.isTriviallyReMaterializable(DefMI, aa)) + if (!TII.isTriviallyReMaterializable(DefMI, aa)) return false; remattable_.insert(VNI); return true; } -void LiveRangeEdit::scanRemattable(LiveIntervals &lis, - const TargetInstrInfo &tii, - AliasAnalysis *aa) { +void LiveRangeEdit::scanRemattable(AliasAnalysis *aa) { for (LiveInterval::vni_iterator I = parent_.vni_begin(), E = parent_.vni_end(); I != E; ++I) { VNInfo *VNI = *I; if (VNI->isUnused()) continue; - MachineInstr *DefMI = lis.getInstructionFromIndex(VNI->def); + MachineInstr *DefMI = LIS.getInstructionFromIndex(VNI->def); if (!DefMI) continue; - checkRematerializable(VNI, 
DefMI, tii, aa); + checkRematerializable(VNI, DefMI, aa); } scannedRemattable_ = true; } -bool LiveRangeEdit::anyRematerializable(LiveIntervals &lis, - const TargetInstrInfo &tii, - AliasAnalysis *aa) { +bool LiveRangeEdit::anyRematerializable(AliasAnalysis *aa) { if (!scannedRemattable_) - scanRemattable(lis, tii, aa); + scanRemattable(aa); return !remattable_.empty(); } @@ -83,8 +77,7 @@ bool LiveRangeEdit::anyRematerializable(LiveIntervals &lis, /// OrigIdx are also available with the same value at UseIdx. bool LiveRangeEdit::allUsesAvailableAt(const MachineInstr *OrigMI, SlotIndex OrigIdx, - SlotIndex UseIdx, - LiveIntervals &lis) { + SlotIndex UseIdx) { OrigIdx = OrigIdx.getRegSlot(true); UseIdx = UseIdx.getRegSlot(true); for (unsigned i = 0, e = OrigMI->getNumOperands(); i != e; ++i) { @@ -92,10 +85,10 @@ bool LiveRangeEdit::allUsesAvailableAt(const MachineInstr *OrigMI, if (!MO.isReg() || !MO.getReg() || MO.isDef()) continue; // Reserved registers are OK. - if (MO.isUndef() || !lis.hasInterval(MO.getReg())) + if (MO.isUndef() || !LIS.hasInterval(MO.getReg())) continue; - LiveInterval &li = lis.getInterval(MO.getReg()); + LiveInterval &li = LIS.getInterval(MO.getReg()); const VNInfo *OVNI = li.getVNInfoAt(OrigIdx); if (!OVNI) continue; @@ -107,8 +100,7 @@ bool LiveRangeEdit::allUsesAvailableAt(const MachineInstr *OrigMI, bool LiveRangeEdit::canRematerializeAt(Remat &RM, SlotIndex UseIdx, - bool cheapAsAMove, - LiveIntervals &lis) { + bool cheapAsAMove) { assert(scannedRemattable_ && "Call anyRematerializable first"); // Use scanRemattable info. @@ -118,10 +110,10 @@ bool LiveRangeEdit::canRematerializeAt(Remat &RM, // No defining instruction provided. SlotIndex DefIdx; if (RM.OrigMI) - DefIdx = lis.getInstructionIndex(RM.OrigMI); + DefIdx = LIS.getInstructionIndex(RM.OrigMI); else { DefIdx = RM.ParentVNI->def; - RM.OrigMI = lis.getInstructionFromIndex(DefIdx); + RM.OrigMI = LIS.getInstructionFromIndex(DefIdx); assert(RM.OrigMI && "No defining instruction for remattable value"); } @@ -130,7 +122,7 @@ bool LiveRangeEdit::canRematerializeAt(Remat &RM, return false; // Verify that all used registers are available with the same values. - if (!allUsesAvailableAt(RM.OrigMI, DefIdx, UseIdx, lis)) + if (!allUsesAvailableAt(RM.OrigMI, DefIdx, UseIdx)) return false; return true; @@ -140,27 +132,22 @@ SlotIndex LiveRangeEdit::rematerializeAt(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg, const Remat &RM, - LiveIntervals &lis, - const TargetInstrInfo &tii, const TargetRegisterInfo &tri, bool Late) { assert(RM.OrigMI && "Invalid remat"); - tii.reMaterialize(MBB, MI, DestReg, 0, RM.OrigMI, tri); + TII.reMaterialize(MBB, MI, DestReg, 0, RM.OrigMI, tri); rematted_.insert(RM.ParentVNI); - return lis.getSlotIndexes()->insertMachineInstrInMaps(--MI, Late) + return LIS.getSlotIndexes()->insertMachineInstrInMaps(--MI, Late) .getRegSlot(); } -void LiveRangeEdit::eraseVirtReg(unsigned Reg, LiveIntervals &LIS) { +void LiveRangeEdit::eraseVirtReg(unsigned Reg) { if (delegate_ && delegate_->LRE_CanEraseVirtReg(Reg)) LIS.removeInterval(Reg); } bool LiveRangeEdit::foldAsLoad(LiveInterval *LI, - SmallVectorImpl<MachineInstr*> &Dead, - MachineRegisterInfo &MRI, - LiveIntervals &LIS, - const TargetInstrInfo &TII) { + SmallVectorImpl<MachineInstr*> &Dead) { MachineInstr *DefMI = 0, *UseMI = 0; // Check that there is a single def and a single use. 
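allUsesAvailableAt above is the safety check for rematerialization: every register the defining instruction reads must hold the same value at the original definition point and at the proposed new use point. A simplified self-contained model of that check, with integer value numbers standing in for VNInfo and a nested map standing in for LiveInterval lookups:

    #include <map>
    #include <vector>

    using SlotIndex = unsigned;
    using ValueId = int;
    // reg -> (slot index -> value number live there); a crude stand-in
    // for LiveIntervals / LiveInterval::getVNInfoAt.
    using LiveMap = std::map<unsigned, std::map<SlotIndex, ValueId>>;

    bool allUsesAvailableAt(const std::vector<unsigned> &UsedRegs,
                            SlotIndex OrigIdx, SlotIndex UseIdx,
                            const LiveMap &Live) {
      for (unsigned Reg : UsedRegs) {
        auto LI = Live.find(Reg);
        if (LI == Live.end())
          continue; // untracked (e.g. reserved) registers are OK
        auto AtOrig = LI->second.find(OrigIdx);
        if (AtOrig == LI->second.end())
          continue;
        auto AtUse = LI->second.find(UseIdx);
        // Remat is only safe if the same value is live at both points.
        if (AtUse == LI->second.end() || AtUse->second != AtOrig->second)
          return false;
      }
      return true;
    }

    int main() {
      LiveMap Live;
      Live[1][10] = 7; // vreg 1 holds value 7 at the original def
      Live[1][20] = 7; // ...and the same value at the new use point
      return allUsesAvailableAt({1}, 10, 20, Live) ? 0 : 1;
    }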
@@ -206,13 +193,10 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI, } void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead, - LiveIntervals &LIS, VirtRegMap &VRM, - const TargetInstrInfo &TII, ArrayRef<unsigned> RegsBeingSpilled) { SetVector<LiveInterval*, SmallVector<LiveInterval*, 8>, SmallPtrSet<LiveInterval*, 8> > ToShrink; - MachineRegisterInfo &MRI = VRM.getRegInfo(); for (;;) { // Erase all dead defs. @@ -263,7 +247,7 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead, LI.removeValNo(VNI); if (LI.empty()) { ToShrink.remove(&LI); - eraseVirtReg(Reg, LIS); + eraseVirtReg(Reg); } } } @@ -282,7 +266,7 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead, // Shrink just one live interval. Then delete new dead defs. LiveInterval *LI = ToShrink.back(); ToShrink.pop_back(); - if (foldAsLoad(LI, Dead, MRI, LIS, TII)) + if (foldAsLoad(LI, Dead)) continue; if (delegate_) delegate_->LRE_WillShrinkVirtReg(LI->reg); @@ -302,7 +286,6 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead, } if (BeingSpilled) continue; - // LI may have been separated, create new intervals. LI->RenumberValues(LIS); @@ -311,16 +294,16 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead, if (NumComp <= 1) continue; ++NumFracRanges; - bool IsOriginal = VRM.getOriginal(LI->reg) == LI->reg; + bool IsOriginal = VRM && VRM->getOriginal(LI->reg) == LI->reg; DEBUG(dbgs() << NumComp << " components: " << *LI << '\n'); SmallVector<LiveInterval*, 8> Dups(1, LI); for (unsigned i = 1; i != NumComp; ++i) { - Dups.push_back(&createFrom(LI->reg, LIS, VRM)); + Dups.push_back(&createFrom(LI->reg)); // If LI is an original interval that hasn't been split yet, make the new // intervals their own originals instead of referring to LI. The original // interval must contain all the split products, and LI doesn't. if (IsOriginal) - VRM.setIsSplitFromReg(Dups.back()->reg, 0); + VRM->setIsSplitFromReg(Dups.back()->reg, 0); if (delegate_) delegate_->LRE_DidCloneVirtReg(Dups.back()->reg, LI->reg); } @@ -329,10 +312,8 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead, } void LiveRangeEdit::calculateRegClassAndHint(MachineFunction &MF, - LiveIntervals &LIS, const MachineLoopInfo &Loops) { VirtRegAuxInfo VRAI(MF, LIS, Loops); - MachineRegisterInfo &MRI = MF.getRegInfo(); for (iterator I = begin(), E = end(); I != E; ++I) { LiveInterval &LI = **I; if (MRI.recomputeRegClass(LI.reg, MF.getTarget())) diff --git a/lib/CodeGen/LiveRangeEdit.h b/lib/CodeGen/LiveRangeEdit.h deleted file mode 100644 index 1148025..0000000 --- a/lib/CodeGen/LiveRangeEdit.h +++ /dev/null @@ -1,201 +0,0 @@ -//===---- LiveRangeEdit.h - Basic tools for split and spill -----*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// The LiveRangeEdit class represents changes done to a virtual register when it -// is spilled or split. -// -// The parent register is never changed. Instead, a number of new virtual -// registers are created and added to the newRegs vector. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_CODEGEN_LIVERANGEEDIT_H -#define LLVM_CODEGEN_LIVERANGEEDIT_H - -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/CodeGen/LiveInterval.h" - -namespace llvm { - -class AliasAnalysis; -class LiveIntervals; -class MachineLoopInfo; -class MachineRegisterInfo; -class VirtRegMap; - -class LiveRangeEdit { -public: - /// Callback methods for LiveRangeEdit owners. - class Delegate { - virtual void anchor(); - public: - /// Called immediately before erasing a dead machine instruction. - virtual void LRE_WillEraseInstruction(MachineInstr *MI) {} - - /// Called when a virtual register is no longer used. Return false to defer - /// its deletion from LiveIntervals. - virtual bool LRE_CanEraseVirtReg(unsigned) { return true; } - - /// Called before shrinking the live range of a virtual register. - virtual void LRE_WillShrinkVirtReg(unsigned) {} - - /// Called after cloning a virtual register. - /// This is used for new registers representing connected components of Old. - virtual void LRE_DidCloneVirtReg(unsigned New, unsigned Old) {} - - virtual ~Delegate() {} - }; - -private: - LiveInterval &parent_; - SmallVectorImpl<LiveInterval*> &newRegs_; - Delegate *const delegate_; - - /// firstNew_ - Index of the first register added to newRegs_. - const unsigned firstNew_; - - /// scannedRemattable_ - true when remattable values have been identified. - bool scannedRemattable_; - - /// remattable_ - Values defined by remattable instructions as identified by - /// tii.isTriviallyReMaterializable(). - SmallPtrSet<const VNInfo*,4> remattable_; - - /// rematted_ - Values that were actually rematted, and so need to have their - /// live range trimmed or entirely removed. - SmallPtrSet<const VNInfo*,4> rematted_; - - /// scanRemattable - Identify the parent_ values that may rematerialize. - void scanRemattable(LiveIntervals &lis, - const TargetInstrInfo &tii, - AliasAnalysis *aa); - - /// allUsesAvailableAt - Return true if all registers used by OrigMI at - /// OrigIdx are also available with the same value at UseIdx. - bool allUsesAvailableAt(const MachineInstr *OrigMI, SlotIndex OrigIdx, - SlotIndex UseIdx, LiveIntervals &lis); - - /// foldAsLoad - If LI has a single use and a single def that can be folded as - /// a load, eliminate the register by folding the def into the use. - bool foldAsLoad(LiveInterval *LI, SmallVectorImpl<MachineInstr*> &Dead, - MachineRegisterInfo&, LiveIntervals&, const TargetInstrInfo&); - -public: - /// Create a LiveRangeEdit for breaking down parent into smaller pieces. - /// @param parent The register being spilled or split. - /// @param newRegs List to receive any new registers created. This needn't be - /// empty initially, any existing registers are ignored. - LiveRangeEdit(LiveInterval &parent, - SmallVectorImpl<LiveInterval*> &newRegs, - Delegate *delegate = 0) - : parent_(parent), newRegs_(newRegs), - delegate_(delegate), - firstNew_(newRegs.size()), - scannedRemattable_(false) {} - - LiveInterval &getParent() const { return parent_; } - unsigned getReg() const { return parent_.reg; } - - /// Iterator for accessing the new registers added by this edit. 
- typedef SmallVectorImpl<LiveInterval*>::const_iterator iterator; - iterator begin() const { return newRegs_.begin()+firstNew_; } - iterator end() const { return newRegs_.end(); } - unsigned size() const { return newRegs_.size()-firstNew_; } - bool empty() const { return size() == 0; } - LiveInterval *get(unsigned idx) const { return newRegs_[idx+firstNew_]; } - - ArrayRef<LiveInterval*> regs() const { - return makeArrayRef(newRegs_).slice(firstNew_); - } - - /// createFrom - Create a new virtual register based on OldReg. - LiveInterval &createFrom(unsigned OldReg, LiveIntervals&, VirtRegMap&); - - /// create - Create a new register with the same class and original slot as - /// parent. - LiveInterval &create(LiveIntervals &LIS, VirtRegMap &VRM) { - return createFrom(getReg(), LIS, VRM); - } - - /// anyRematerializable - Return true if any parent values may be - /// rematerializable. - /// This function must be called before any rematerialization is attempted. - bool anyRematerializable(LiveIntervals&, const TargetInstrInfo&, - AliasAnalysis*); - - /// checkRematerializable - Manually add VNI to the list of rematerializable - /// values if DefMI may be rematerializable. - bool checkRematerializable(VNInfo *VNI, const MachineInstr *DefMI, - const TargetInstrInfo&, AliasAnalysis*); - - /// Remat - Information needed to rematerialize at a specific location. - struct Remat { - VNInfo *ParentVNI; // parent_'s value at the remat location. - MachineInstr *OrigMI; // Instruction defining ParentVNI. - explicit Remat(VNInfo *ParentVNI) : ParentVNI(ParentVNI), OrigMI(0) {} - }; - - /// canRematerializeAt - Determine if ParentVNI can be rematerialized at - /// UseIdx. It is assumed that parent_.getVNINfoAt(UseIdx) == ParentVNI. - /// When cheapAsAMove is set, only cheap remats are allowed. - bool canRematerializeAt(Remat &RM, - SlotIndex UseIdx, - bool cheapAsAMove, - LiveIntervals &lis); - - /// rematerializeAt - Rematerialize RM.ParentVNI into DestReg by inserting an - /// instruction into MBB before MI. The new instruction is mapped, but - /// liveness is not updated. - /// Return the SlotIndex of the new instruction. - SlotIndex rematerializeAt(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned DestReg, - const Remat &RM, - LiveIntervals&, - const TargetInstrInfo&, - const TargetRegisterInfo&, - bool Late = false); - - /// markRematerialized - explicitly mark a value as rematerialized after doing - /// it manually. - void markRematerialized(const VNInfo *ParentVNI) { - rematted_.insert(ParentVNI); - } - - /// didRematerialize - Return true if ParentVNI was rematerialized anywhere. - bool didRematerialize(const VNInfo *ParentVNI) const { - return rematted_.count(ParentVNI); - } - - /// eraseVirtReg - Notify the delegate that Reg is no longer in use, and try - /// to erase it from LIS. - void eraseVirtReg(unsigned Reg, LiveIntervals &LIS); - - /// eliminateDeadDefs - Try to delete machine instructions that are now dead - /// (allDefsAreDead returns true). This may cause live intervals to be trimmed - /// and further dead efs to be eliminated. - /// RegsBeingSpilled lists registers currently being spilled by the register - /// allocator. These registers should not be split into new intervals - /// as currently those new intervals are not guaranteed to spill. 
- void eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead, - LiveIntervals&, VirtRegMap&, - const TargetInstrInfo&, - ArrayRef<unsigned> RegsBeingSpilled - = ArrayRef<unsigned>()); - - /// calculateRegClassAndHint - Recompute register class and hint for each new - /// register. - void calculateRegClassAndHint(MachineFunction&, LiveIntervals&, - const MachineLoopInfo&); -}; - -} - -#endif diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp index 48e1e4c..5a0d97d 100644 --- a/lib/CodeGen/LiveVariables.cpp +++ b/lib/CodeGen/LiveVariables.cpp @@ -14,7 +14,7 @@ // the instruction, but are never used after the instruction (i.e., they are // killed). // -// This class computes live variables using are sparse implementation based on +// This class computes live variables using a sparse implementation based on // the machine code SSA form. This class computes live variable information for // each virtual and _register allocatable_ physical register in a function. It // uses the dominance properties of SSA form to efficiently compute live diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index ca8a8e8..1abb8f2 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -321,8 +321,8 @@ void MachineBasicBlock::print(raw_ostream &OS, SlotIndexes *Indexes) const { void MachineBasicBlock::removeLiveIn(unsigned Reg) { std::vector<unsigned>::iterator I = std::find(LiveIns.begin(), LiveIns.end(), Reg); - assert(I != LiveIns.end() && "Not a live in!"); - LiveIns.erase(I); + if (I != LiveIns.end()) + LiveIns.erase(I); } bool MachineBasicBlock::isLiveIn(unsigned Reg) const { @@ -392,22 +392,44 @@ void MachineBasicBlock::updateTerminator() { TII->InsertBranch(*this, TBB, 0, Cond, dl); } } else { + // Walk through the successors and find the successor which is not + // a landing pad and is not the conditional branch destination (in TBB) + // as the fallthrough successor. + MachineBasicBlock *FallthroughBB = 0; + for (succ_iterator SI = succ_begin(), SE = succ_end(); SI != SE; ++SI) { + if ((*SI)->isLandingPad() || *SI == TBB) + continue; + assert(!FallthroughBB && "Found more than one fallthrough successor."); + FallthroughBB = *SI; + } + if (!FallthroughBB && canFallThrough()) { + // We fallthrough to the same basic block as the conditional jump + // targets. Remove the conditional jump, leaving unconditional + // fallthrough. + // FIXME: This does not seem like a reasonable pattern to support, but it + // has been seen in the wild coming out of degenerate ARM test cases. + TII->RemoveBranch(*this); + + // Finally update the unconditional successor to be reached via a branch + // if it would not be reached by fallthrough. + if (!isLayoutSuccessor(TBB)) + TII->InsertBranch(*this, TBB, 0, Cond, dl); + return; + } + // The block has a fallthrough conditional branch. - MachineBasicBlock *MBBA = *succ_begin(); - MachineBasicBlock *MBBB = *llvm::next(succ_begin()); - if (MBBA == TBB) std::swap(MBBB, MBBA); if (isLayoutSuccessor(TBB)) { if (TII->ReverseBranchCondition(Cond)) { // We can't reverse the condition, add an unconditional branch. 
Cond.clear(); - TII->InsertBranch(*this, MBBA, 0, Cond, dl); + TII->InsertBranch(*this, FallthroughBB, 0, Cond, dl); return; } TII->RemoveBranch(*this); - TII->InsertBranch(*this, MBBA, 0, Cond, dl); - } else if (!isLayoutSuccessor(MBBA)) { + TII->InsertBranch(*this, FallthroughBB, 0, Cond, dl); + } else if (!isLayoutSuccessor(FallthroughBB)) { TII->RemoveBranch(*this); - TII->InsertBranch(*this, TBB, MBBA, Cond, dl); + TII->InsertBranch(*this, TBB, FallthroughBB, Cond, dl); } } } diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp index 63892af..5ba6851 100644 --- a/lib/CodeGen/MachineBlockPlacement.cpp +++ b/lib/CodeGen/MachineBlockPlacement.cpp @@ -102,13 +102,13 @@ public: } /// \brief Iterator over blocks within the chain. - typedef SmallVectorImpl<MachineBasicBlock *>::const_iterator iterator; + typedef SmallVectorImpl<MachineBasicBlock *>::iterator iterator; /// \brief Beginning of blocks within the chain. - iterator begin() const { return Blocks.begin(); } + iterator begin() { return Blocks.begin(); } /// \brief End of blocks within the chain. - iterator end() const { return Blocks.end(); } + iterator end() { return Blocks.end(); } /// \brief Merge a block chain into this one. /// @@ -141,6 +141,14 @@ public: } } +#ifndef NDEBUG + /// \brief Dump the blocks in this chain. + void dump() LLVM_ATTRIBUTE_USED { + for (iterator I = begin(), E = end(); I != E; ++I) + (*I)->dump(); + } +#endif // NDEBUG + /// \brief Count of predecessors within the loop currently being processed. /// /// This count is updated at each loop we process to represent the number of @@ -203,12 +211,15 @@ class MachineBlockPlacement : public MachineFunctionPass { void buildChain(MachineBasicBlock *BB, BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &BlockWorkList, const BlockFilterSet *BlockFilter = 0); - MachineBasicBlock *findBestLoopTop(MachineFunction &F, - MachineLoop &L, + MachineBasicBlock *findBestLoopTop(MachineLoop &L, const BlockFilterSet &LoopBlockSet); + MachineBasicBlock *findBestLoopExit(MachineFunction &F, + MachineLoop &L, + const BlockFilterSet &LoopBlockSet); void buildLoopChains(MachineFunction &F, MachineLoop &L); + void rotateLoop(BlockChain &LoopChain, MachineBasicBlock *ExitingBB, + const BlockFilterSet &LoopBlockSet); void buildCFGChains(MachineFunction &F); - void AlignLoops(MachineFunction &F); public: static char ID; // Pass identification, replacement for typeid @@ -430,7 +441,6 @@ MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock( for (SmallVectorImpl<MachineBasicBlock *>::iterator WBI = WorkList.begin(), WBE = WorkList.end(); WBI != WBE; ++WBI) { - assert(!BlockFilter || BlockFilter->count(*WBI)); BlockChain &SuccChain = *BlockToChain[*WBI]; if (&SuccChain == &Chain) { DEBUG(dbgs() << " " << getBlockName(*WBI) @@ -533,16 +543,89 @@ void MachineBlockPlacement::buildChain( /// \brief Find the best loop top block for layout. /// +/// Look for a block which is strictly better than the loop header for laying +/// out at the top of the loop. This looks for one and only one pattern: +/// a latch block with no conditional exit. This block will cause a conditional +/// jump around it or will be the bottom of the loop if we lay it out in place, +/// but if it doesn't end up at the bottom of the loop for any reason, +/// rotation alone won't fix it. Because such a block will always result in an +/// unconditional jump (for the backedge), rotating it in front of the loop +/// header is always profitable.
+MachineBasicBlock * +MachineBlockPlacement::findBestLoopTop(MachineLoop &L, + const BlockFilterSet &LoopBlockSet) { + // Check that the header hasn't been fused with a preheader block due to + // crazy branches. If it has, we need to start with the header at the top to + // prevent pulling the preheader into the loop body. + BlockChain &HeaderChain = *BlockToChain[L.getHeader()]; + if (!LoopBlockSet.count(*HeaderChain.begin())) + return L.getHeader(); + + DEBUG(dbgs() << "Finding best loop top for: " + << getBlockName(L.getHeader()) << "\n"); + + BlockFrequency BestPredFreq; + MachineBasicBlock *BestPred = 0; + for (MachineBasicBlock::pred_iterator PI = L.getHeader()->pred_begin(), + PE = L.getHeader()->pred_end(); + PI != PE; ++PI) { + MachineBasicBlock *Pred = *PI; + if (!LoopBlockSet.count(Pred)) + continue; + DEBUG(dbgs() << " header pred: " << getBlockName(Pred) << ", " + << Pred->succ_size() << " successors, " + << MBFI->getBlockFreq(Pred) << " freq\n"); + if (Pred->succ_size() > 1) + continue; + + BlockFrequency PredFreq = MBFI->getBlockFreq(Pred); + if (!BestPred || PredFreq > BestPredFreq || + (!(PredFreq < BestPredFreq) && + Pred->isLayoutSuccessor(L.getHeader()))) { + BestPred = Pred; + BestPredFreq = PredFreq; + } + } + + // If no direct predecessor is fine, just use the loop header. + if (!BestPred) + return L.getHeader(); + + // Walk backwards through any straight line of predecessors. + while (BestPred->pred_size() == 1 && + (*BestPred->pred_begin())->succ_size() == 1 && + *BestPred->pred_begin() != L.getHeader()) + BestPred = *BestPred->pred_begin(); + + DEBUG(dbgs() << " final top: " << getBlockName(BestPred) << "\n"); + return BestPred; +} + + +/// \brief Find the best loop exiting block for layout. +/// /// This routine implements the logic to analyze the loop looking for the best /// block to layout at the top of the loop. Typically this is done to maximize /// fallthrough opportunities. MachineBasicBlock * -MachineBlockPlacement::findBestLoopTop(MachineFunction &F, - MachineLoop &L, - const BlockFilterSet &LoopBlockSet) { +MachineBlockPlacement::findBestLoopExit(MachineFunction &F, + MachineLoop &L, + const BlockFilterSet &LoopBlockSet) { + // We don't want to layout the loop linearly in all cases. If the loop header + // is just a normal basic block in the loop, we want to look for what block + // within the loop is the best one to layout at the top. However, if the loop + // header has been pre-merged into a chain due to predecessors not having + // analyzable branches, *and* the predecessor it is merged with is *not* part + // of the loop, rotating the header into the middle of the loop will create + // a non-contiguous range of blocks which is Very Bad. So start with the + // header and only rotate if safe. + BlockChain &HeaderChain = *BlockToChain[L.getHeader()]; + if (!LoopBlockSet.count(*HeaderChain.begin())) + return 0; + BlockFrequency BestExitEdgeFreq; + unsigned BestExitLoopDepth = 0; MachineBasicBlock *ExitingBB = 0; - MachineBasicBlock *LoopingBB = 0; // If there are exits to outer loops, loop rotation can severely limit // fallthrough opportunities unless it selects such an exit. Keep a set of // blocks where rotating to exit with that block will reach an outer loop. @@ -565,15 +648,10 @@ MachineBlockPlacement::findBestLoopTop(MachineFunction &F, // successor isn't found. MachineBasicBlock *OldExitingBB = ExitingBB; BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq; - // We also compute and store the best looping successor for use in layout.
- MachineBasicBlock *BestLoopSucc = 0; + bool HasLoopingSucc = false; // FIXME: Due to the performance of the probability and weight routines in - // the MBPI analysis, we use the internal weights. This is only valid - // because it is purely a ranking function, we don't care about anything - // but the relative values. - uint32_t BestLoopSuccWeight = 0; - // FIXME: We also manually compute the probabilities to avoid quadratic - // behavior. + // the MBPI analysis, we use the internal weights and manually compute the + // probabilities to avoid quadratic behavior. uint32_t WeightScale = 0; uint32_t SumWeight = MBPI->getSumForBlock(*I, WeightScale); for (MachineBasicBlock::succ_iterator SI = (*I)->succ_begin(), @@ -585,10 +663,8 @@ MachineBlockPlacement::findBestLoopTop(MachineFunction &F, continue; BlockChain &SuccChain = *BlockToChain[*SI]; // Don't split chains, either this chain or the successor's chain. - if (&Chain == &SuccChain || *SI != *SuccChain.begin()) { - DEBUG(dbgs() << " " << (LoopBlockSet.count(*SI) ? "looping: " - : "exiting: ") - << getBlockName(*I) << " -> " + if (&Chain == &SuccChain) { + DEBUG(dbgs() << " exiting: " << getBlockName(*I) << " -> " << getBlockName(*SI) << " (chain conflict)\n"); continue; } @@ -597,60 +673,103 @@ MachineBlockPlacement::findBestLoopTop(MachineFunction &F, if (LoopBlockSet.count(*SI)) { DEBUG(dbgs() << " looping: " << getBlockName(*I) << " -> " << getBlockName(*SI) << " (" << SuccWeight << ")\n"); - if (BestLoopSucc && BestLoopSuccWeight >= SuccWeight) - continue; - - BestLoopSucc = *SI; - BestLoopSuccWeight = SuccWeight; + HasLoopingSucc = true; continue; } + unsigned SuccLoopDepth = 0; + if (MachineLoop *ExitLoop = MLI->getLoopFor(*SI)) { + SuccLoopDepth = ExitLoop->getLoopDepth(); + if (ExitLoop->contains(&L)) + BlocksExitingToOuterLoop.insert(*I); + } + BranchProbability SuccProb(SuccWeight / WeightScale, SumWeight); BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(*I) * SuccProb; DEBUG(dbgs() << " exiting: " << getBlockName(*I) << " -> " - << getBlockName(*SI) << " (" << ExitEdgeFreq << ")\n"); + << getBlockName(*SI) << " [L:" << SuccLoopDepth + << "] (" << ExitEdgeFreq << ")\n"); // Note that we slightly bias this toward an existing layout successor to // retain incoming order in the absence of better information. // FIXME: Should we bias this more strongly? It's pretty weak. - if (!ExitingBB || ExitEdgeFreq > BestExitEdgeFreq || + if (!ExitingBB || BestExitLoopDepth < SuccLoopDepth || + ExitEdgeFreq > BestExitEdgeFreq || ((*I)->isLayoutSuccessor(*SI) && !(ExitEdgeFreq < BestExitEdgeFreq))) { BestExitEdgeFreq = ExitEdgeFreq; ExitingBB = *I; } - - if (MachineLoop *ExitLoop = MLI->getLoopFor(*SI)) - if (ExitLoop->contains(&L)) - BlocksExitingToOuterLoop.insert(*I); } // Restore the old exiting state, no viable looping successor was found. - if (!BestLoopSucc) { + if (!HasLoopingSucc) { ExitingBB = OldExitingBB; BestExitEdgeFreq = OldBestExitEdgeFreq; continue; } - - // If this was best exiting block thus far, also record the looping block. - if (ExitingBB == *I) - LoopingBB = BestLoopSucc; } - // Without a candidate exitting block or with only a single block in the + // Without a candidate exiting block or with only a single block in the // loop, just use the loop header to layout the loop. if (!ExitingBB || L.getNumBlocks() == 1) - return L.getHeader(); + return 0; // Also, if we have exit blocks which lead to outer loops but didn't select // one of them as the exiting block we are rotating toward, disable loop // rotation altogether. 
if (!BlocksExitingToOuterLoop.empty() && !BlocksExitingToOuterLoop.count(ExitingBB)) - return L.getHeader(); + return 0; - assert(LoopingBB && "All successors of a loop block are exit blocks!"); DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB) << "\n"); - DEBUG(dbgs() << " Best top block: " << getBlockName(LoopingBB) << "\n"); - return LoopingBB; + return ExitingBB; +} + +/// \brief Attempt to rotate an exiting block to the bottom of the loop. +/// +/// Once we have built a chain, try to rotate it to line up the hot exit block +/// with fallthrough out of the loop if doing so doesn't introduce unnecessary +/// branches. For example, if the loop has fallthrough into its header and out +/// of its bottom already, don't rotate it. +void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain, + MachineBasicBlock *ExitingBB, + const BlockFilterSet &LoopBlockSet) { + if (!ExitingBB) + return; + + MachineBasicBlock *Top = *LoopChain.begin(); + bool ViableTopFallthrough = false; + for (MachineBasicBlock::pred_iterator PI = Top->pred_begin(), + PE = Top->pred_end(); + PI != PE; ++PI) { + BlockChain *PredChain = BlockToChain[*PI]; + if (!LoopBlockSet.count(*PI) && + (!PredChain || *PI == *llvm::prior(PredChain->end()))) { + ViableTopFallthrough = true; + break; + } + } + + // If the header has viable fallthrough, check whether the current loop + // bottom is a viable exiting block. If so, bail out as rotating will + // introduce an unnecessary branch. + if (ViableTopFallthrough) { + MachineBasicBlock *Bottom = *llvm::prior(LoopChain.end()); + for (MachineBasicBlock::succ_iterator SI = Bottom->succ_begin(), + SE = Bottom->succ_end(); + SI != SE; ++SI) { + BlockChain *SuccChain = BlockToChain[*SI]; + if (!LoopBlockSet.count(*SI) && + (!SuccChain || *SI == *SuccChain->begin())) + return; + } + } + + BlockChain::iterator ExitIt = std::find(LoopChain.begin(), LoopChain.end(), + ExitingBB); + if (ExitIt == LoopChain.end()) + return; + + std::rotate(LoopChain.begin(), llvm::next(ExitIt), LoopChain.end()); } /// \brief Forms basic block chains from the natural loop structures. @@ -669,8 +788,20 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F, SmallVector<MachineBasicBlock *, 16> BlockWorkList; BlockFilterSet LoopBlockSet(L.block_begin(), L.block_end()); - MachineBasicBlock *LayoutTop = findBestLoopTop(F, L, LoopBlockSet); - BlockChain &LoopChain = *BlockToChain[LayoutTop]; + // First check to see if there is an obviously preferable top block for the + // loop. This will default to the header, but may end up as one of the + // predecessors to the header if there is one which will result in strictly + // fewer branches in the loop body. + MachineBasicBlock *LoopTop = findBestLoopTop(L, LoopBlockSet); + + // If we selected just the header for the loop top, look for a potentially + // profitable exit block in the event that rotating the loop can eliminate + // branches by placing an exit edge at the bottom. 
+ MachineBasicBlock *ExitingBB = 0; + if (LoopTop == L.getHeader()) + ExitingBB = findBestLoopExit(F, L, LoopBlockSet); + + BlockChain &LoopChain = *BlockToChain[LoopTop]; // FIXME: This is a really lame way of walking the chains in the loop: we // walk the blocks, and use a set to prevent visiting a particular chain @@ -702,7 +833,8 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F, BlockWorkList.push_back(*Chain.begin()); } - buildChain(LayoutTop, LoopChain, BlockWorkList, &LoopBlockSet); + buildChain(LoopTop, LoopChain, BlockWorkList, &LoopBlockSet); + rotateLoop(LoopChain, ExitingBB, LoopBlockSet); DEBUG({ // Crash at the end so we get all of the debugging output first. @@ -714,7 +846,8 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F, << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"; } for (BlockChain::iterator BCI = LoopChain.begin(), BCE = LoopChain.end(); - BCI != BCE; ++BCI) + BCI != BCE; ++BCI) { + dbgs() << " ... " << getBlockName(*BCI) << "\n"; if (!LoopBlockSet.erase(*BCI)) { // We don't mark the loop as bad here because there are real situations // where this can occur. For example, with an unanalyzable fallthrough @@ -724,6 +857,7 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F, << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n" << " Bad block: " << getBlockName(*BCI) << "\n"; } + } if (!LoopBlockSet.empty()) { BadLoop = true; @@ -863,28 +997,33 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { MachineBasicBlock *TBB = 0, *FBB = 0; // For AnalyzeBranch. if (!TII->AnalyzeBranch(F.back(), TBB, FBB, Cond)) F.back().updateTerminator(); -} -/// \brief Recursive helper to align a loop and any nested loops. -static void AlignLoop(MachineFunction &F, MachineLoop *L, unsigned Align) { - // Recurse through nested loops. - for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I) - AlignLoop(F, *I, Align); - - L->getTopBlock()->setAlignment(Align); -} - -/// \brief Align loop headers to target preferred alignments. -void MachineBlockPlacement::AlignLoops(MachineFunction &F) { + // Walk through the backedges of the function now that we have fully laid out + // the basic blocks and align the destination of each backedge. We don't rely + // on the loop info here so that we can align backedges in unnatural CFGs and + // backedges that were introduced purely because of the loop rotations done + // during this layout pass. + // FIXME: This isn't quite right, we shouldn't align backedges that result + // from blocks being sunken below the exit block for the function. if (F.getFunction()->hasFnAttr(Attribute::OptimizeForSize)) return; - unsigned Align = TLI->getPrefLoopAlignment(); if (!Align) return; // Don't care about loop alignment. - for (MachineLoopInfo::iterator I = MLI->begin(), E = MLI->end(); I != E; ++I) - AlignLoop(F, *I, Align); + SmallPtrSet<MachineBasicBlock *, 16> PreviousBlocks; + for (BlockChain::iterator BI = FunctionChain.begin(), + BE = FunctionChain.end(); + BI != BE; ++BI) { + PreviousBlocks.insert(*BI); + // Set alignment on the destination of all the back edges in the new + // ordering. 
+ for (MachineBasicBlock::succ_iterator SI = (*BI)->succ_begin(), + SE = (*BI)->succ_end(); + SI != SE; ++SI) + if (PreviousBlocks.count(*SI)) + (*SI)->setAlignment(Align); + } } bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &F) { @@ -900,7 +1039,6 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &F) { assert(BlockToChain.empty()); buildCFGChains(F); - AlignLoops(F); BlockToChain.clear(); ChainAllocator.DestroyAll(); diff --git a/lib/CodeGen/MachineCopyPropagation.cpp b/lib/CodeGen/MachineCopyPropagation.cpp index 9aa74f1..9730eaa 100644 --- a/lib/CodeGen/MachineCopyPropagation.cpp +++ b/lib/CodeGen/MachineCopyPropagation.cpp @@ -43,9 +43,12 @@ namespace { virtual bool runOnMachineFunction(MachineFunction &MF); private: + typedef SmallVector<unsigned, 4> DestList; + typedef DenseMap<unsigned, DestList> SourceMap; + void SourceNoLongerAvailable(unsigned Reg, - DenseMap<unsigned, unsigned> &SrcMap, - DenseMap<unsigned, MachineInstr*> &AvailCopyMap); + SourceMap &SrcMap, + DenseMap<unsigned, MachineInstr*> &AvailCopyMap); bool CopyPropagateBlock(MachineBasicBlock &MBB); }; } @@ -57,24 +60,32 @@ INITIALIZE_PASS(MachineCopyPropagation, "machine-cp", void MachineCopyPropagation::SourceNoLongerAvailable(unsigned Reg, - DenseMap<unsigned, unsigned> &SrcMap, + SourceMap &SrcMap, DenseMap<unsigned, MachineInstr*> &AvailCopyMap) { - DenseMap<unsigned, unsigned>::iterator SI = SrcMap.find(Reg); + SourceMap::iterator SI = SrcMap.find(Reg); if (SI != SrcMap.end()) { - unsigned MappedDef = SI->second; - // Source of copy is no longer available for propagation. - if (AvailCopyMap.erase(MappedDef)) { - for (const uint16_t *SR = TRI->getSubRegisters(MappedDef); *SR; ++SR) - AvailCopyMap.erase(*SR); + const DestList& Defs = SI->second; + for (DestList::const_iterator I = Defs.begin(), E = Defs.end(); + I != E; ++I) { + unsigned MappedDef = *I; + // Source of copy is no longer available for propagation. 
+ if (AvailCopyMap.erase(MappedDef)) { + for (const uint16_t *SR = TRI->getSubRegisters(MappedDef); *SR; ++SR) + AvailCopyMap.erase(*SR); + } } } for (const uint16_t *AS = TRI->getAliasSet(Reg); *AS; ++AS) { SI = SrcMap.find(*AS); if (SI != SrcMap.end()) { - unsigned MappedDef = SI->second; - if (AvailCopyMap.erase(MappedDef)) { - for (const uint16_t *SR = TRI->getSubRegisters(MappedDef); *SR; ++SR) - AvailCopyMap.erase(*SR); + const DestList& Defs = SI->second; + for (DestList::const_iterator I = Defs.begin(), E = Defs.end(); + I != E; ++I) { + unsigned MappedDef = *I; + if (AvailCopyMap.erase(MappedDef)) { + for (const uint16_t *SR = TRI->getSubRegisters(MappedDef); *SR; ++SR) + AvailCopyMap.erase(*SR); + } } } } @@ -125,10 +136,10 @@ static bool isNopCopy(MachineInstr *CopyMI, unsigned Def, unsigned Src, } bool MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) { - SmallSetVector<MachineInstr*, 8> MaybeDeadCopies; // Candidates for deletion - DenseMap<unsigned, MachineInstr*> AvailCopyMap; // Def -> available copies map - DenseMap<unsigned, MachineInstr*> CopyMap; // Def -> copies map - DenseMap<unsigned, unsigned> SrcMap; // Src -> Def map + SmallSetVector<MachineInstr*, 8> MaybeDeadCopies; // Candidates for deletion + DenseMap<unsigned, MachineInstr*> AvailCopyMap; // Def -> available copies map + DenseMap<unsigned, MachineInstr*> CopyMap; // Def -> copies map + SourceMap SrcMap; // Src -> Def map bool Changed = false; for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ) { @@ -213,7 +224,10 @@ bool MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) { // Remember source that's copied to Def. Once it's clobbered, then // it's no longer available for copy propagation. - SrcMap[Src] = Def; + if (std::find(SrcMap[Src].begin(), SrcMap[Src].end(), Def) == + SrcMap[Src].end()) { + SrcMap[Src].push_back(Def); + } continue; } diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp index 8ab8b18..d8c2f6a 100644 --- a/lib/CodeGen/MachineFunction.cpp +++ b/lib/CodeGen/MachineFunction.cpp @@ -195,9 +195,10 @@ MachineFunction::DeleteMachineBasicBlock(MachineBasicBlock *MBB) { MachineMemOperand * MachineFunction::getMachineMemOperand(MachinePointerInfo PtrInfo, unsigned f, uint64_t s, unsigned base_alignment, - const MDNode *TBAAInfo) { + const MDNode *TBAAInfo, + const MDNode *Ranges) { return new (Allocator) MachineMemOperand(PtrInfo, f, s, base_alignment, - TBAAInfo); + TBAAInfo, Ranges); } MachineMemOperand * @@ -284,7 +285,13 @@ void MachineFunction::dump() const { } void MachineFunction::print(raw_ostream &OS, SlotIndexes *Indexes) const { - OS << "# Machine code for function " << Fn->getName() << ":\n"; + OS << "# Machine code for function " << Fn->getName() << ": "; + if (RegInfo) { + OS << (RegInfo->isSSA() ? 
"SSA" : "Post SSA"); + if (!RegInfo->tracksLiveness()) + OS << ", not tracking liveness"; + } + OS << '\n'; // Print Frame Information FrameInfo->print(*this, OS); diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp index 43af1ad..e553a04 100644 --- a/lib/CodeGen/MachineInstr.cpp +++ b/lib/CodeGen/MachineInstr.cpp @@ -381,10 +381,11 @@ MachinePointerInfo MachinePointerInfo::getStack(int64_t Offset) { MachineMemOperand::MachineMemOperand(MachinePointerInfo ptrinfo, unsigned f, uint64_t s, unsigned int a, - const MDNode *TBAAInfo) + const MDNode *TBAAInfo, + const MDNode *Ranges) : PtrInfo(ptrinfo), Size(s), Flags((f & ((1 << MOMaxBits) - 1)) | ((Log2_32(a) + 1) << MOMaxBits)), - TBAAInfo(TBAAInfo) { + TBAAInfo(TBAAInfo), Ranges(Ranges) { assert((PtrInfo.V == 0 || isa<PointerType>(PtrInfo.V->getType())) && "invalid pointer value"); assert(getBaseAlignment() == a && "Alignment is not a power of 2!"); diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp index 428a9d9..8c562cc 100644 --- a/lib/CodeGen/MachineLICM.cpp +++ b/lib/CodeGen/MachineLICM.cpp @@ -80,6 +80,14 @@ namespace { MachineLoop *CurLoop; // The current loop we are working on. MachineBasicBlock *CurPreheader; // The preheader for CurLoop. + // Exit blocks for CurLoop. + SmallVector<MachineBasicBlock*, 8> ExitBlocks; + + bool isExitBlock(const MachineBasicBlock *MBB) const { + return std::find(ExitBlocks.begin(), ExitBlocks.end(), MBB) != + ExitBlocks.end(); + } + // Track 'estimated' register pressure. SmallSet<unsigned, 32> RegSeen; SmallVector<unsigned, 8> RegPressure; @@ -182,9 +190,9 @@ namespace { /// bool IsLoopInvariantInst(MachineInstr &I); - /// HasAnyPHIUse - Return true if the specified register is used by any - /// phi node. - bool HasAnyPHIUse(unsigned Reg) const; + /// HasLoopPHIUse - Return true if the specified instruction is used by any + /// phi node in the current loop. + bool HasLoopPHIUse(const MachineInstr *MI) const; /// HasHighOperandLatency - Compute operand latency between a def of 'Reg' /// and an use in the current loop, return true if the target considered @@ -197,7 +205,7 @@ namespace { /// CanCauseHighRegPressure - Visit BBs from header to current BB, /// check if hoisting an instruction of the given cost matrix can cause high /// register pressure. - bool CanCauseHighRegPressure(DenseMap<unsigned, int> &Cost); + bool CanCauseHighRegPressure(DenseMap<unsigned, int> &Cost, bool Cheap); /// UpdateBackTraceRegPressure - Traverse the back trace from header to /// the current block and update their register pressures to reflect the @@ -348,6 +356,7 @@ bool MachineLICM::runOnMachineFunction(MachineFunction &MF) { while (!Worklist.empty()) { CurLoop = Worklist.pop_back_val(); CurPreheader = 0; + ExitBlocks.clear(); // If this is done before regalloc, only visit outer-most preheader-sporting // loops. @@ -356,6 +365,8 @@ bool MachineLICM::runOnMachineFunction(MachineFunction &MF) { continue; } + CurLoop->getExitBlocks(ExitBlocks); + if (!PreRegAlloc) HoistRegionPostRA(); else { @@ -478,6 +489,10 @@ void MachineLICM::ProcessMI(MachineInstr *MI, /// HoistRegionPostRA - Walk the specified region of the CFG and hoist loop /// invariants out to the preheader. void MachineLICM::HoistRegionPostRA() { + MachineBasicBlock *Preheader = getCurPreheader(); + if (!Preheader) + return; + unsigned NumRegs = TRI->getNumRegs(); BitVector PhysRegDefs(NumRegs); // Regs defined once in the loop. BitVector PhysRegClobbers(NumRegs); // Regs defined more than once. 
@@ -514,25 +529,46 @@ void MachineLICM::HoistRegionPostRA() { } } + // Gather the registers read / clobbered by the terminator. + BitVector TermRegs(NumRegs); + MachineBasicBlock::iterator TI = Preheader->getFirstTerminator(); + if (TI != Preheader->end()) { + for (unsigned i = 0, e = TI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = TI->getOperand(i); + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (!Reg) + continue; + for (const uint16_t *AS = TRI->getOverlaps(Reg); *AS; ++AS) + TermRegs.set(*AS); + } + } + // Now evaluate whether the potential candidates qualify. // 1. Check if the candidate defined register is defined by another // instruction in the loop. // 2. If the candidate is a load from stack slot (always true for now), // check if the slot is stored anywhere in the loop. + // 3. Make sure the candidate's def does not clobber + // registers read by the terminator, and is not itself + // clobbered by the terminator. for (unsigned i = 0, e = Candidates.size(); i != e; ++i) { if (Candidates[i].FI != INT_MIN && StoredFIs.count(Candidates[i].FI)) continue; - if (!PhysRegClobbers.test(Candidates[i].Def)) { + unsigned Def = Candidates[i].Def; + if (!PhysRegClobbers.test(Def) && !TermRegs.test(Def)) { bool Safe = true; MachineInstr *MI = Candidates[i].MI; for (unsigned j = 0, ee = MI->getNumOperands(); j != ee; ++j) { const MachineOperand &MO = MI->getOperand(j); if (!MO.isReg() || MO.isDef() || !MO.getReg()) continue; - if (PhysRegDefs.test(MO.getReg()) || - PhysRegClobbers.test(MO.getReg())) { + unsigned Reg = MO.getReg(); + if (PhysRegDefs.test(Reg) || + PhysRegClobbers.test(Reg)) { // If it's using a non-loop-invariant register, then it's obviously // not safe to hoist. Safe = false; @@ -571,7 +607,6 @@ void MachineLICM::AddToLiveIns(unsigned Reg) { /// dirty work. void MachineLICM::HoistPostRA(MachineInstr *MI, unsigned Def) { MachineBasicBlock *Preheader = getCurPreheader(); - if (!Preheader) return; // Now move the instructions to the predecessor, inserting it before any // terminator instructions. @@ -931,22 +966,40 @@ bool MachineLICM::IsLoopInvariantInst(MachineInstr &I) { } -/// HasAnyPHIUse - Return true if the specified register is used by any -/// phi node. -bool MachineLICM::HasAnyPHIUse(unsigned Reg) const { - for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(Reg), - UE = MRI->use_end(); UI != UE; ++UI) { - MachineInstr *UseMI = &*UI; - if (UseMI->isPHI()) - return true; - // Look pass copies as well. - if (UseMI->isCopy()) { - unsigned Def = UseMI->getOperand(0).getReg(); - if (TargetRegisterInfo::isVirtualRegister(Def) && - HasAnyPHIUse(Def)) - return true; +/// HasLoopPHIUse - Return true if the specified instruction is used by a +/// phi node and hoisting it could cause a copy to be inserted. +bool MachineLICM::HasLoopPHIUse(const MachineInstr *MI) const { + SmallVector<const MachineInstr*, 8> Work(1, MI); + do { + MI = Work.pop_back_val(); + for (ConstMIOperands MO(MI); MO.isValid(); ++MO) { + if (!MO->isReg() || !MO->isDef()) + continue; + unsigned Reg = MO->getReg(); + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(Reg), + UE = MRI->use_end(); UI != UE; ++UI) { + MachineInstr *UseMI = &*UI; + // A PHI may cause a copy to be inserted. + if (UseMI->isPHI()) { + // A PHI inside the loop causes a copy because the live range of Reg is + // extended across the PHI.
+ if (CurLoop->contains(UseMI)) + return true; + // A PHI in an exit block can cause a copy to be inserted if the PHI + // has multiple predecessors in the loop with different values. + // For now, approximate by rejecting all exit blocks. + if (isExitBlock(UseMI->getParent())) + return true; + continue; + } + // Look past copies as well. + if (UseMI->isCopy() && CurLoop->contains(UseMI)) + Work.push_back(UseMI); + } } - } + } while (!Work.empty()); return false; } @@ -1014,7 +1067,8 @@ bool MachineLICM::IsCheapInstruction(MachineInstr &MI) const { /// CanCauseHighRegPressure - Visit BBs from header to current BB, check /// if hoisting an instruction of the given cost matrix can cause high /// register pressure. -bool MachineLICM::CanCauseHighRegPressure(DenseMap<unsigned, int> &Cost) { +bool MachineLICM::CanCauseHighRegPressure(DenseMap<unsigned, int> &Cost, + bool CheapInstr) { for (DenseMap<unsigned, int>::iterator CI = Cost.begin(), CE = Cost.end(); CI != CE; ++CI) { if (CI->second <= 0) @@ -1023,6 +1077,12 @@ bool MachineLICM::CanCauseHighRegPressure(DenseMap<unsigned, int> &Cost) { unsigned RCId = CI->first; unsigned Limit = RegLimit[RCId]; int Cost = CI->second; + + // Don't hoist cheap instructions if they would increase register pressure, + // even if we're under the limit. + if (CheapInstr) + return true; + for (unsigned i = BackTrace.size(); i != 0; --i) { SmallVector<unsigned, 8> &RP = BackTrace[i-1]; if (RP[RCId] + Cost >= Limit) @@ -1085,87 +1145,95 @@ bool MachineLICM::IsProfitableToHoist(MachineInstr &MI) { if (MI.isImplicitDef()) return true; - // If the instruction is cheap, only hoist if it is re-materilizable. LICM - // will increase register pressure. It's probably not worth it if the - // instruction is cheap. - // Also hoist loads from constant memory, e.g. load from stubs, GOT. Hoisting - // these tend to help performance in low register pressure situation. The - // trade off is it may cause spill in high pressure situation. It will end up - // adding a store in the loop preheader. But the reload is no more expensive. - // The side benefit is these loads are frequently CSE'ed. - if (IsCheapInstruction(MI)) { - if (!TII->isTriviallyReMaterializable(&MI, AA)) - return false; - } else { - // Estimate register pressure to determine whether to LICM the instruction. - // In low register pressure situation, we can be more aggressive about - // hoisting. Also, favors hoisting long latency instructions even in - // moderately high pressure situation. - // FIXME: If there are long latency loop-invariant instructions inside the - // loop at this point, why didn't the optimizer's LICM hoist them? - DenseMap<unsigned, int> Cost; - for (unsigned i = 0, e = MI.getDesc().getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI.getOperand(i); - if (!MO.isReg() || MO.isImplicit()) - continue; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) - continue; + // Besides removing computation from the loop, hoisting an instruction has + // these effects: + // + // - The value defined by the instruction becomes live across the entire + // loop. This increases register pressure in the loop. + // + // - If the value is used by a PHI in the loop, a copy will be required for + // lowering the PHI after extending the live range. + // + // - When hoisting the last use of a value in the loop, that value no longer + // needs to be live in the loop. This lowers register pressure in the loop. 
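The code that follows turns those three effects into a per-register-class cost map: a def adds its class cost, a killed use subtracts it. A compressed sketch of that bookkeeping, with std::map standing in for DenseMap and a single made-up register class:

    #include <cstdio>
    #include <map>

    int main() {
      std::map<unsigned, int> Cost;  // register class ID -> pressure delta
      const unsigned GPRClass = 0;   // invented class ID for illustration

      Cost[GPRClass] += 1;  // the def becomes live across the whole loop
      Cost[GPRClass] -= 1;  // a killed use stops being live in the loop

      // A net delta of zero means hoisting is pressure-neutral, so a
      // CanCauseHighRegPressure-style check would let the hoist proceed.
      std::printf("GPR pressure delta: %d\n", Cost[GPRClass]);
      return 0;
    }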
+ + bool CheapInstr = IsCheapInstruction(MI); + bool CreatesCopy = HasLoopPHIUse(&MI); + + // Don't hoist a cheap instruction if it would create a copy in the loop. + if (CheapInstr && CreatesCopy) { + DEBUG(dbgs() << "Won't hoist cheap instr with loop PHI use: " << MI); + return false; + } - unsigned RCId, RCCost; - getRegisterClassIDAndCost(&MI, Reg, i, RCId, RCCost); - if (MO.isDef()) { - if (HasHighOperandLatency(MI, i, Reg)) { - ++NumHighLatency; - return true; - } + // Rematerializable instructions should always be hoisted since the register + // allocator can just pull them down again when needed. + if (TII->isTriviallyReMaterializable(&MI, AA)) + return true; - DenseMap<unsigned, int>::iterator CI = Cost.find(RCId); - if (CI != Cost.end()) - CI->second += RCCost; - else - Cost.insert(std::make_pair(RCId, RCCost)); - } else if (isOperandKill(MO, MRI)) { - // Is a virtual register use is a kill, hoisting it out of the loop - // may actually reduce register pressure or be register pressure - // neutral. - DenseMap<unsigned, int>::iterator CI = Cost.find(RCId); - if (CI != Cost.end()) - CI->second -= RCCost; - else - Cost.insert(std::make_pair(RCId, -RCCost)); + // Estimate register pressure to determine whether to LICM the instruction. + // In low register pressure situation, we can be more aggressive about + // hoisting. Also, favors hoisting long latency instructions even in + // moderately high pressure situation. + // Cheap instructions will only be hoisted if they don't increase register + // pressure at all. + // FIXME: If there are long latency loop-invariant instructions inside the + // loop at this point, why didn't the optimizer's LICM hoist them? + DenseMap<unsigned, int> Cost; + for (unsigned i = 0, e = MI.getDesc().getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI.getOperand(i); + if (!MO.isReg() || MO.isImplicit()) + continue; + unsigned Reg = MO.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + + unsigned RCId, RCCost; + getRegisterClassIDAndCost(&MI, Reg, i, RCId, RCCost); + if (MO.isDef()) { + if (HasHighOperandLatency(MI, i, Reg)) { + DEBUG(dbgs() << "Hoist High Latency: " << MI); + ++NumHighLatency; + return true; } + Cost[RCId] += RCCost; + } else if (isOperandKill(MO, MRI)) { + // If a virtual register use is a kill, hoisting it out of the loop + // may actually reduce register pressure or be register pressure + // neutral. + Cost[RCId] -= RCCost; } + } - // Visit BBs from header to current BB, if hoisting this doesn't cause - // high register pressure, then it's safe to proceed. - if (!CanCauseHighRegPressure(Cost)) { - ++NumLowRP; - return true; - } + // Visit BBs from header to current BB, if hoisting this doesn't cause + // high register pressure, then it's safe to proceed. + if (!CanCauseHighRegPressure(Cost, CheapInstr)) { + DEBUG(dbgs() << "Hoist non-reg-pressure: " << MI); + ++NumLowRP; + return true; + } - // Do not "speculate" in high register pressure situation. If an - // instruction is not guaranteed to be executed in the loop, it's best to be - // conservative. - if (AvoidSpeculation && - (!IsGuaranteedToExecute(MI.getParent()) && !MayCSE(&MI))) - return false; + // Don't risk increasing register pressure if it would create copies. + if (CreatesCopy) { + DEBUG(dbgs() << "Won't hoist instr with loop PHI use: " << MI); + return false; + } - // High register pressure situation, only hoist if the instruction is going - // to be remat'ed.
- if (!TII->isTriviallyReMaterializable(&MI, AA) && - !MI.isInvariantLoad(AA)) - return false; + // Do not "speculate" in high register pressure situation. If an + // instruction is not guaranteed to be executed in the loop, it's best to be + // conservative. + if (AvoidSpeculation && + (!IsGuaranteedToExecute(MI.getParent()) && !MayCSE(&MI))) { + DEBUG(dbgs() << "Won't speculate: " << MI); + return false; } - // If result(s) of this instruction is used by PHIs outside of the loop, then - // don't hoist it if the instruction because it will introduce an extra copy. - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI.getOperand(i); - if (!MO.isReg() || !MO.isDef()) - continue; - if (HasAnyPHIUse(MO.getReg())) - return false; + // High register pressure situation, only hoist if the instruction is going + // to be remat'ed. + if (!TII->isTriviallyReMaterializable(&MI, AA) && + !MI.isInvariantLoad(AA)) { + DEBUG(dbgs() << "Can't remat / high reg-pressure: " << MI); + return false; } return true; diff --git a/lib/CodeGen/MachinePassRegistry.cpp b/lib/CodeGen/MachinePassRegistry.cpp index 58e067b..cb204fd 100644 --- a/lib/CodeGen/MachinePassRegistry.cpp +++ b/lib/CodeGen/MachinePassRegistry.cpp @@ -18,6 +18,19 @@ using namespace llvm; void MachinePassRegistryListener::anchor() { } +/// setDefault - Set the default constructor by name. +void MachinePassRegistry::setDefault(StringRef Name) { + MachinePassCtor Ctor = 0; + for(MachinePassRegistryNode *R = getList(); R; R = R->getNext()) { + if (R->getName() == Name) { + Ctor = R->getCtor(); + break; + } + } + assert(Ctor && "Unregistered pass name"); + setDefault(Ctor); +} + /// Add - Adds a function pass to the registration list. /// void MachinePassRegistry::Add(MachinePassRegistryNode *Node) { diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp index f140dec..7ea1517 100644 --- a/lib/CodeGen/MachineRegisterInfo.cpp +++ b/lib/CodeGen/MachineRegisterInfo.cpp @@ -18,7 +18,7 @@ using namespace llvm; MachineRegisterInfo::MachineRegisterInfo(const TargetRegisterInfo &TRI) - : TRI(&TRI), IsSSA(true) { + : TRI(&TRI), IsSSA(true), TracksLiveness(true) { VRegInfo.reserve(256); RegAllocHints.reserve(256); UsedPhysRegs.resize(TRI.getNumRegs()); diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index 364a244..1d3241b 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -227,6 +227,7 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) { assert(RemainingCount == 0 && "Instruction count mismatch!"); Scheduler->finishBlock(); } + Scheduler->finalizeSchedule(); DEBUG(LIS->print(dbgs())); return true; } diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp index 830a876..74ba94d 100644 --- a/lib/CodeGen/MachineVerifier.cpp +++ b/lib/CodeGen/MachineVerifier.cpp @@ -202,6 +202,7 @@ namespace { void report(const char *msg, const MachineInstr *MI); void report(const char *msg, const MachineOperand *MO, unsigned MONum); + void checkLiveness(const MachineOperand *MO, unsigned MONum); void markReachable(const MachineBasicBlock *MBB); void calcRegsPassed(); void checkPHIOps(const MachineBasicBlock *MBB); @@ -608,7 +609,9 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) { } // Ensure non-terminators don't follow terminators. - if (MI->isTerminator()) { + // Ignore predicated terminators formed by if conversion. 
+ // FIXME: If conversion shouldn't need to violate this rule. + if (MI->isTerminator() && !TII->isPredicated(MI)) { if (!FirstTerminator) FirstTerminator = MI; } else if (FirstTerminator) { @@ -656,112 +659,9 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { const unsigned Reg = MO->getReg(); if (!Reg) return; + if (MRI->tracksLiveness() && !MI->isDebugValue()) + checkLiveness(MO, MONum); - // Check Live Variables. - if (MI->isDebugValue()) { - // Liveness checks are not valid for debug values. - } else if (MO->isUse() && !MO->isUndef()) { - regsLiveInButUnused.erase(Reg); - - bool isKill = false; - unsigned defIdx; - if (MI->isRegTiedToDefOperand(MONum, &defIdx)) { - // A two-addr use counts as a kill if use and def are the same. - unsigned DefReg = MI->getOperand(defIdx).getReg(); - if (Reg == DefReg) - isKill = true; - else if (TargetRegisterInfo::isPhysicalRegister(Reg)) { - report("Two-address instruction operands must be identical", - MO, MONum); - } - } else - isKill = MO->isKill(); - - if (isKill) - addRegWithSubRegs(regsKilled, Reg); - - // Check that LiveVars knows this kill. - if (LiveVars && TargetRegisterInfo::isVirtualRegister(Reg) && - MO->isKill()) { - LiveVariables::VarInfo &VI = LiveVars->getVarInfo(Reg); - if (std::find(VI.Kills.begin(), - VI.Kills.end(), MI) == VI.Kills.end()) - report("Kill missing from LiveVariables", MO, MONum); - } - - // Check LiveInts liveness and kill. - if (TargetRegisterInfo::isVirtualRegister(Reg) && - LiveInts && !LiveInts->isNotInMIMap(MI)) { - SlotIndex UseIdx = LiveInts->getInstructionIndex(MI).getRegSlot(true); - if (LiveInts->hasInterval(Reg)) { - const LiveInterval &LI = LiveInts->getInterval(Reg); - if (!LI.liveAt(UseIdx)) { - report("No live range at use", MO, MONum); - *OS << UseIdx << " is not live in " << LI << '\n'; - } - // Check for extra kill flags. - // Note that we allow missing kill flags for now. - if (MO->isKill() && !LI.killedAt(UseIdx.getRegSlot())) { - report("Live range continues after kill flag", MO, MONum); - *OS << "Live range: " << LI << '\n'; - } - } else { - report("Virtual register has no Live interval", MO, MONum); - } - } - - // Use of a dead register. - if (!regsLive.count(Reg)) { - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { - // Reserved registers may be used even when 'dead'. - if (!isReserved(Reg)) - report("Using an undefined physical register", MO, MONum); - } else { - BBInfo &MInfo = MBBInfoMap[MI->getParent()]; - // We don't know which virtual registers are live in, so only complain - // if vreg was killed in this MBB. Otherwise keep track of vregs that - // must be live in. PHI instructions are handled separately. - if (MInfo.regsKilled.count(Reg)) - report("Using a killed virtual register", MO, MONum); - else if (!MI->isPHI()) - MInfo.vregsLiveIn.insert(std::make_pair(Reg, MI)); - } - } - } else if (MO->isDef()) { - // Register defined. - // TODO: verify that earlyclobber ops are not used. - if (MO->isDead()) - addRegWithSubRegs(regsDead, Reg); - else - addRegWithSubRegs(regsDefined, Reg); - - // Verify SSA form. - if (MRI->isSSA() && TargetRegisterInfo::isVirtualRegister(Reg) && - llvm::next(MRI->def_begin(Reg)) != MRI->def_end()) - report("Multiple virtual register defs in SSA form", MO, MONum); - - // Check LiveInts for a live range, but only for virtual registers. 
- if (LiveInts && TargetRegisterInfo::isVirtualRegister(Reg) && - !LiveInts->isNotInMIMap(MI)) { - SlotIndex DefIdx = LiveInts->getInstructionIndex(MI).getRegSlot(); - if (LiveInts->hasInterval(Reg)) { - const LiveInterval &LI = LiveInts->getInterval(Reg); - if (const VNInfo *VNI = LI.getVNInfoAt(DefIdx)) { - assert(VNI && "NULL valno is not allowed"); - if (VNI->def != DefIdx && !MO->isEarlyClobber()) { - report("Inconsistent valno->def", MO, MONum); - *OS << "Valno " << VNI->id << " is not defined at " - << DefIdx << " in " << LI << '\n'; - } - } else { - report("No live range at def", MO, MONum); - *OS << DefIdx << " is not live in " << LI << '\n'; - } - } else { - report("Virtual register has no Live interval", MO, MONum); - } - } - } // Check register classes. if (MONum < MCID.getNumOperands() && !MO->isImplicit()) { @@ -853,6 +753,115 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { } } +void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { + const MachineInstr *MI = MO->getParent(); + const unsigned Reg = MO->getReg(); + + // Both use and def operands can read a register. + if (MO->readsReg()) { + regsLiveInButUnused.erase(Reg); + + bool isKill = false; + unsigned defIdx; + if (MI->isRegTiedToDefOperand(MONum, &defIdx)) { + // A two-addr use counts as a kill if use and def are the same. + unsigned DefReg = MI->getOperand(defIdx).getReg(); + if (Reg == DefReg) + isKill = true; + else if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + report("Two-address instruction operands must be identical", MO, MONum); + } + } else + isKill = MO->isKill(); + + if (isKill) + addRegWithSubRegs(regsKilled, Reg); + + // Check that LiveVars knows this kill. + if (LiveVars && TargetRegisterInfo::isVirtualRegister(Reg) && + MO->isKill()) { + LiveVariables::VarInfo &VI = LiveVars->getVarInfo(Reg); + if (std::find(VI.Kills.begin(), VI.Kills.end(), MI) == VI.Kills.end()) + report("Kill missing from LiveVariables", MO, MONum); + } + + // Check LiveInts liveness and kill. + if (TargetRegisterInfo::isVirtualRegister(Reg) && + LiveInts && !LiveInts->isNotInMIMap(MI)) { + SlotIndex UseIdx = LiveInts->getInstructionIndex(MI).getRegSlot(true); + if (LiveInts->hasInterval(Reg)) { + const LiveInterval &LI = LiveInts->getInterval(Reg); + if (!LI.liveAt(UseIdx)) { + report("No live range at use", MO, MONum); + *OS << UseIdx << " is not live in " << LI << '\n'; + } + // Check for extra kill flags. + // Note that we allow missing kill flags for now. + if (MO->isKill() && !LI.killedAt(UseIdx.getRegSlot())) { + report("Live range continues after kill flag", MO, MONum); + *OS << "Live range: " << LI << '\n'; + } + } else { + report("Virtual register has no Live interval", MO, MONum); + } + } + + // Use of a dead register. + if (!regsLive.count(Reg)) { + if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + // Reserved registers may be used even when 'dead'. + if (!isReserved(Reg)) + report("Using an undefined physical register", MO, MONum); + } else { + BBInfo &MInfo = MBBInfoMap[MI->getParent()]; + // We don't know which virtual registers are live in, so only complain + // if vreg was killed in this MBB. Otherwise keep track of vregs that + // must be live in. PHI instructions are handled separately. + if (MInfo.regsKilled.count(Reg)) + report("Using a killed virtual register", MO, MONum); + else if (!MI->isPHI()) + MInfo.vregsLiveIn.insert(std::make_pair(Reg, MI)); + } + } + } + + if (MO->isDef()) { + // Register defined. 
+ // TODO: verify that earlyclobber ops are not used. + if (MO->isDead()) + addRegWithSubRegs(regsDead, Reg); + else + addRegWithSubRegs(regsDefined, Reg); + + // Verify SSA form. + if (MRI->isSSA() && TargetRegisterInfo::isVirtualRegister(Reg) && + llvm::next(MRI->def_begin(Reg)) != MRI->def_end()) + report("Multiple virtual register defs in SSA form", MO, MONum); + + // Check LiveInts for a live range, but only for virtual registers. + if (LiveInts && TargetRegisterInfo::isVirtualRegister(Reg) && + !LiveInts->isNotInMIMap(MI)) { + SlotIndex DefIdx = LiveInts->getInstructionIndex(MI).getRegSlot(); + if (LiveInts->hasInterval(Reg)) { + const LiveInterval &LI = LiveInts->getInterval(Reg); + if (const VNInfo *VNI = LI.getVNInfoAt(DefIdx)) { + assert(VNI && "NULL valno is not allowed"); + if (VNI->def != DefIdx && !MO->isEarlyClobber()) { + report("Inconsistent valno->def", MO, MONum); + *OS << "Valno " << VNI->id << " is not defined at " + << DefIdx << " in " << LI << '\n'; + } + } else { + report("No live range at def", MO, MONum); + *OS << DefIdx << " is not live in " << LI << '\n'; + } + } else { + report("Virtual register has no Live interval", MO, MONum); + } + } + } +} + void MachineVerifier::visitMachineInstrAfter(const MachineInstr *MI) { BBInfo &MInfo = MBBInfoMap[MI->getParent()]; set_union(MInfo.regsKilled, regsKilled); diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp index 6246c21..13d1bbc 100644 --- a/lib/CodeGen/Passes.cpp +++ b/lib/CodeGen/Passes.cpp @@ -37,8 +37,9 @@ static cl::opt<bool> DisableTailDuplicate("disable-tail-duplicate", cl::Hidden, cl::desc("Disable tail duplication")); static cl::opt<bool> DisableEarlyTailDup("disable-early-taildup", cl::Hidden, cl::desc("Disable pre-register allocation tail duplication")); -static cl::opt<bool> EnableBlockPlacement("enable-block-placement", - cl::Hidden, cl::desc("Enable probability-driven block placement")); +static cl::opt<bool> DisableBlockPlacement("disable-block-placement", + cl::Hidden, cl::desc("Disable the probability-driven block placement, and " + "re-enable the old code placement pass")); static cl::opt<bool> EnableBlockPlacementStats("enable-block-placement-stats", cl::Hidden, cl::desc("Collect probability-driven block placement stats")); static cl::opt<bool> DisableCodePlace("disable-code-place", cl::Hidden, @@ -272,11 +273,6 @@ AnalysisID TargetPassConfig::addPass(char &ID) { return FinalID; } -void TargetPassConfig::printNoVerify(const char *Banner) const { - if (TM->shouldPrintMachineCode()) - PM.add(createMachineFunctionPrinterPass(dbgs(), Banner)); -} - void TargetPassConfig::printAndVerify(const char *Banner) const { if (TM->shouldPrintMachineCode()) PM.add(createMachineFunctionPrinterPass(dbgs(), Banner)); @@ -394,16 +390,16 @@ void TargetPassConfig::addMachinePasses() { // Expand pseudo instructions before second scheduling pass. addPass(ExpandPostRAPseudosID); - printNoVerify("After ExpandPostRAPseudos"); + printAndVerify("After ExpandPostRAPseudos"); // Run pre-sched2 passes. if (addPreSched2()) - printNoVerify("After PreSched2 passes"); + printAndVerify("After PreSched2 passes"); // Second pass scheduler. 
if (getOptLevel() != CodeGenOpt::None) { addPass(PostRASchedulerID); - printNoVerify("After PostRAScheduler"); + printAndVerify("After PostRAScheduler"); } // GC @@ -416,7 +412,7 @@ void TargetPassConfig::addMachinePasses() { addBlockPlacement(); if (addPreEmitPass()) - printNoVerify("After PreEmit passes"); + printAndVerify("After PreEmit passes"); } /// Add passes that optimize machine instructions in SSA form. @@ -601,24 +597,24 @@ void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { void TargetPassConfig::addMachineLateOptimization() { // Branch folding must be run after regalloc and prolog/epilog insertion. if (addPass(BranchFolderPassID) != &NoPassID) - printNoVerify("After BranchFolding"); + printAndVerify("After BranchFolding"); // Tail duplication. if (addPass(TailDuplicateID) != &NoPassID) - printNoVerify("After TailDuplicate"); + printAndVerify("After TailDuplicate"); // Copy propagation. if (addPass(MachineCopyPropagationID) != &NoPassID) - printNoVerify("After copy propagation pass"); + printAndVerify("After copy propagation pass"); } /// Add standard basic block placement passes. void TargetPassConfig::addBlockPlacement() { AnalysisID ID = &NoPassID; - if (EnableBlockPlacement) { - // MachineBlockPlacement is an experimental pass which is disabled by - // default currently. Eventually it should subsume CodePlacementOpt, so - // when enabled, the other is disabled. + if (!DisableBlockPlacement) { + // MachineBlockPlacement is a new pass which subsumes the functionality of + // CodePlacementOpt. The old code placement pass can be restored by + // disabling block placement, but eventually it will be removed. + ID = addPass(MachineBlockPlacementID); } else { ID = addPass(CodePlacementOptID); @@ -628,6 +624,6 @@ void TargetPassConfig::addBlockPlacement() { if (EnableBlockPlacementStats) addPass(MachineBlockPlacementStatsID); - printNoVerify("After machine block placement."); + printAndVerify("After machine block placement."); } } diff --git a/lib/CodeGen/RegAllocBase.cpp b/lib/CodeGen/RegAllocBase.cpp index 85119c9..b00eceb 100644 --- a/lib/CodeGen/RegAllocBase.cpp +++ b/lib/CodeGen/RegAllocBase.cpp @@ -14,11 +14,11 @@ #define DEBUG_TYPE "regalloc" #include "RegAllocBase.h" -#include "LiveRangeEdit.h" #include "Spiller.h" #include "VirtRegMap.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveRangeEdit.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Target/TargetMachine.h" diff --git a/lib/CodeGen/RegAllocBasic.cpp b/lib/CodeGen/RegAllocBasic.cpp index f39a21c..77ee314 100644 --- a/lib/CodeGen/RegAllocBasic.cpp +++ b/lib/CodeGen/RegAllocBasic.cpp @@ -15,7 +15,6 @@ #define DEBUG_TYPE "regalloc" #include "RegAllocBase.h" #include "LiveDebugVariables.h" -#include "LiveRangeEdit.h" #include "RenderMachineFunction.h" #include "Spiller.h" #include "VirtRegMap.h" @@ -24,6 +23,7 @@ #include "llvm/PassAnalysisSupport.h" #include "llvm/CodeGen/CalcSpillWeights.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveRangeEdit.h" #include "llvm/CodeGen/LiveStackAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" @@ -187,7 +187,7 @@ void RABasic::spillReg(LiveInterval& VirtReg, unsigned PhysReg, unassign(SpilledVReg, PhysReg); // Spill the extracted interval.
- LiveRangeEdit LRE(SpilledVReg, SplitVRegs); + LiveRangeEdit LRE(SpilledVReg, SplitVRegs, *MF, *LIS, VRM); spiller().spill(LRE); } // After extracting segments, the query's results are invalid. But keep the @@ -287,7 +287,7 @@ unsigned RABasic::selectOrSplit(LiveInterval &VirtReg, DEBUG(dbgs() << "spilling: " << VirtReg << '\n'); if (!VirtReg.isSpillable()) return ~0u; - LiveRangeEdit LRE(VirtReg, SplitVRegs); + LiveRangeEdit LRE(VirtReg, SplitVRegs, *MF, *LIS, VRM); spiller().spill(LRE); // The live virtual register requesting allocation was spilled, so tell diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp index feec3d4..3f2a617 100644 --- a/lib/CodeGen/RegAllocGreedy.cpp +++ b/lib/CodeGen/RegAllocGreedy.cpp @@ -16,7 +16,6 @@ #include "AllocationOrder.h" #include "InterferenceCache.h" #include "LiveDebugVariables.h" -#include "LiveRangeEdit.h" #include "RegAllocBase.h" #include "Spiller.h" #include "SpillPlacement.h" @@ -29,6 +28,7 @@ #include "llvm/CodeGen/CalcSpillWeights.h" #include "llvm/CodeGen/EdgeBundles.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveRangeEdit.h" #include "llvm/CodeGen/LiveStackAnalysis.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -428,13 +428,13 @@ void RAGreedy::enqueue(LiveInterval *LI) { Prio |= (1u << 30); } - Queue.push(std::make_pair(Prio, Reg)); + Queue.push(std::make_pair(Prio, ~Reg)); } LiveInterval *RAGreedy::dequeue() { if (Queue.empty()) return 0; - LiveInterval *LI = &LIS->getInterval(Queue.top().second); + LiveInterval *LI = &LIS->getInterval(~Queue.top().second); Queue.pop(); return LI; } @@ -1183,7 +1183,7 @@ unsigned RAGreedy::tryRegionSplit(LiveInterval &VirtReg, AllocationOrder &Order, return 0; // Prepare split editor. - LiveRangeEdit LREdit(VirtReg, NewVRegs, this); + LiveRangeEdit LREdit(VirtReg, NewVRegs, *MF, *LIS, VRM, this); SE->reset(LREdit, SplitSpillMode); // Assign all edge bundles to the preferred candidate, or NoCand. @@ -1231,7 +1231,7 @@ unsigned RAGreedy::tryBlockSplit(LiveInterval &VirtReg, AllocationOrder &Order, assert(&SA->getParent() == &VirtReg && "Live range wasn't analyzed"); unsigned Reg = VirtReg.reg; bool SingleInstrs = RegClassInfo.isProperSubClass(MRI->getRegClass(Reg)); - LiveRangeEdit LREdit(VirtReg, NewVRegs, this); + LiveRangeEdit LREdit(VirtReg, NewVRegs, *MF, *LIS, VRM, this); SE->reset(LREdit, SplitSpillMode); ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks(); for (unsigned i = 0; i != UseBlocks.size(); ++i) { @@ -1512,7 +1512,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order, << '-' << Uses[BestAfter] << ", " << BestDiff << ", " << (BestAfter - BestBefore + 1) << " instrs\n"); - LiveRangeEdit LREdit(VirtReg, NewVRegs, this); + LiveRangeEdit LREdit(VirtReg, NewVRegs, *MF, *LIS, VRM, this); SE->reset(LREdit); SE->openIntv(); @@ -1644,7 +1644,7 @@ unsigned RAGreedy::selectOrSplit(LiveInterval &VirtReg, // Finally spill VirtReg itself. 
NamedRegionTimer T("Spiller", TimerGroupName, TimePassesIsEnabled); - LiveRangeEdit LRE(VirtReg, NewVRegs, this); + LiveRangeEdit LRE(VirtReg, NewVRegs, *MF, *LIS, VRM, this); spiller().spill(LRE); setStage(NewVRegs.begin(), NewVRegs.end(), RS_Done); diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp index 9fcf886..a284614 100644 --- a/lib/CodeGen/RegAllocPBQP.cpp +++ b/lib/CodeGen/RegAllocPBQP.cpp @@ -31,14 +31,15 @@ #define DEBUG_TYPE "regalloc" -#include "LiveRangeEdit.h" #include "RenderMachineFunction.h" #include "Spiller.h" #include "VirtRegMap.h" #include "RegisterCoalescer.h" +#include "llvm/Module.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/CalcSpillWeights.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveRangeEdit.h" #include "llvm/CodeGen/LiveStackAnalysis.h" #include "llvm/CodeGen/RegAllocPBQP.h" #include "llvm/CodeGen/MachineDominators.h" @@ -56,6 +57,7 @@ #include <limits> #include <memory> #include <set> +#include <sstream> #include <vector> using namespace llvm; @@ -69,6 +71,13 @@ pbqpCoalescing("pbqp-coalescing", cl::desc("Attempt coalescing during PBQP register allocation."), cl::init(false), cl::Hidden); +#ifndef NDEBUG +static cl::opt<bool> +pbqpDumpGraphs("pbqp-dump-graphs", + cl::desc("Dump graphs for each function/round in the compilation unit."), + cl::init(false), cl::Hidden); +#endif + namespace { /// @@ -187,7 +196,7 @@ std::auto_ptr<PBQPRAProblem> PBQPBuilder::build(MachineFunction *mf, const RegSet &vregs) { typedef std::vector<const LiveInterval*> LIVector; - + ArrayRef<SlotIndex> regMaskSlots = lis->getRegMaskSlots(); MachineRegisterInfo *mri = &mf->getRegInfo(); const TargetRegisterInfo *tri = mf->getTarget().getRegisterInfo(); @@ -224,7 +233,9 @@ std::auto_ptr<PBQPRAProblem> PBQPBuilder::build(MachineFunction *mf, } } - // Remove any physical registers which overlap. + RegSet overlappingPRegs; + + // Record physical registers whose ranges overlap. for (RegSet::const_iterator pregItr = pregs.begin(), pregEnd = pregs.end(); pregItr != pregEnd; ++pregItr) { @@ -235,9 +246,41 @@ std::auto_ptr<PBQPRAProblem> PBQPBuilder::build(MachineFunction *mf, continue; } - if (!vregLI->overlaps(*pregLI)) { - continue; + if (vregLI->overlaps(*pregLI)) + overlappingPRegs.insert(preg); + } + + // Record any overlaps with regmask operands. + BitVector regMaskOverlaps(tri->getNumRegs()); + for (ArrayRef<SlotIndex>::iterator rmItr = regMaskSlots.begin(), + rmEnd = regMaskSlots.end(); + rmItr != rmEnd; ++rmItr) { + SlotIndex rmIdx = *rmItr; + if (vregLI->liveAt(rmIdx)) { + MachineInstr *rmMI = lis->getInstructionFromIndex(rmIdx); + const uint32_t* regMask = 0; + for (MachineInstr::mop_iterator mopItr = rmMI->operands_begin(), + mopEnd = rmMI->operands_end(); + mopItr != mopEnd; ++mopItr) { + if (mopItr->isRegMask()) { + regMask = mopItr->getRegMask(); + break; + } + } + assert(regMask != 0 && "Couldn't find register mask."); + regMaskOverlaps.setBitsNotInMask(regMask); } + } + + for (unsigned preg = 0; preg < tri->getNumRegs(); ++preg) { + if (regMaskOverlaps.test(preg)) + overlappingPRegs.insert(preg); + } + + for (RegSet::const_iterator pregItr = overlappingPRegs.begin(), + pregEnd = overlappingPRegs.end(); + pregItr != pregEnd; ++pregItr) { + unsigned preg = *pregItr; // Remove the register from the allowed set. 
VRAllowed::iterator eraseItr = @@ -507,7 +550,7 @@ bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAProblem &problem, } else if (problem.isSpillOption(vreg, alloc)) { vregsToAlloc.erase(vreg); SmallVector<LiveInterval*, 8> newSpills; - LiveRangeEdit LRE(lis->getInterval(vreg), newSpills); + LiveRangeEdit LRE(lis->getInterval(vreg), newSpills, *mf, *lis, vrm); spiller->spill(LRE); DEBUG(dbgs() << "VREG " << vreg << " -> SPILLED (Cost: " @@ -633,6 +676,12 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) { // Find the vreg intervals in need of allocation. findVRegIntervalsToAlloc(); + const Function* func = mf->getFunction(); + std::string fqn = + func->getParent()->getModuleIdentifier() + "." + + func->getName().str(); + (void)fqn; + // If there are non-empty intervals allocate them using pbqp. if (!vregsToAlloc.empty()) { @@ -644,6 +693,20 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) { std::auto_ptr<PBQPRAProblem> problem = builder->build(mf, lis, loopInfo, vregsToAlloc); + +#ifndef NDEBUG + if (pbqpDumpGraphs) { + std::ostringstream rs; + rs << round; + std::string graphFileName(fqn + "." + rs.str() + ".pbqpgraph"); + std::string tmp; + raw_fd_ostream os(graphFileName.c_str(), tmp); + DEBUG(dbgs() << "Dumping graph for round " << round << " to \"" + << graphFileName << "\"\n"); + problem->getGraph().dump(os); + } +#endif + PBQP::Solution solution = PBQP::HeuristicSolver<PBQP::Heuristics::Briggs>::solve( problem->getGraph()); diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp index 2818f49..03bd82e 100644 --- a/lib/CodeGen/RegisterScavenging.cpp +++ b/lib/CodeGen/RegisterScavenging.cpp @@ -83,6 +83,11 @@ void RegScavenger::enterBasicBlock(MachineBasicBlock *mbb) { assert((NumPhysRegs == 0 || NumPhysRegs == TRI->getNumRegs()) && "Target changed?"); + // It is not possible to use the register scavenger after late optimization + // passes that don't preserve accurate liveness information. + assert(MRI->tracksLiveness() && + "Cannot use register scavenger with inaccurate liveness"); + // Self-initialize. if (!MBB) { NumPhysRegs = TRI->getNumRegs(); diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp index 6be1ab7..d46eb89 100644 --- a/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -39,8 +39,8 @@ ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf, LiveIntervals *lis) : ScheduleDAG(mf), MLI(mli), MDT(mdt), MFI(mf.getFrameInfo()), InstrItins(mf.getTarget().getInstrItineraryData()), LIS(lis), - IsPostRA(IsPostRAFlag), UnitLatencies(false), LoopRegs(MLI, MDT), - FirstDbgValue(0) { + IsPostRA(IsPostRAFlag), UnitLatencies(false), CanHandleTerminators(false), + LoopRegs(MLI, MDT), FirstDbgValue(0) { assert((IsPostRA || LIS) && "PreRA scheduling requires LiveIntervals"); DbgValues.clear(); assert(!(IsPostRA && MRI.getNumVirtRegs()) && @@ -554,7 +554,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA) { continue; } - assert(!MI->isTerminator() && !MI->isLabel() && + assert((!MI->isTerminator() || CanHandleTerminators) && !MI->isLabel() && "Cannot schedule terminators or labels!"); SUnit *SU = MISUnitMap[MI]; diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 7c4db97..0914c66 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -1080,6 +1080,7 @@ void DAGCombiner::Run(CombineLevel AtLevel) { // If the root changed (e.g. it was a dead load, update the root). 
DAG.setRoot(Dummy.getValue()); + DAG.RemoveDeadNodes(); } SDValue DAGCombiner::visit(SDNode *N) { @@ -1452,16 +1453,14 @@ SDValue DAGCombiner::visitADD(SDNode *N) { if (VT.isInteger() && !VT.isVector()) { APInt LHSZero, LHSOne; APInt RHSZero, RHSOne; - APInt Mask = APInt::getAllOnesValue(VT.getScalarType().getSizeInBits()); - DAG.ComputeMaskedBits(N0, Mask, LHSZero, LHSOne); + DAG.ComputeMaskedBits(N0, LHSZero, LHSOne); if (LHSZero.getBoolValue()) { - DAG.ComputeMaskedBits(N1, Mask, RHSZero, RHSOne); + DAG.ComputeMaskedBits(N1, RHSZero, RHSOne); // If all possibly-set bits on the LHS are clear on the RHS, return an OR. // If all possibly-set bits on the RHS are clear on the LHS, return an OR. - if ((RHSZero & (~LHSZero & Mask)) == (~LHSZero & Mask) || - (LHSZero & (~RHSZero & Mask)) == (~RHSZero & Mask)) + if ((RHSZero & ~LHSZero) == ~LHSZero || (LHSZero & ~RHSZero) == ~RHSZero) return DAG.getNode(ISD::OR, N->getDebugLoc(), VT, N0, N1); } } @@ -1547,16 +1546,14 @@ SDValue DAGCombiner::visitADDC(SDNode *N) { // fold (addc a, b) -> (or a, b), CARRY_FALSE iff a and b share no bits. APInt LHSZero, LHSOne; APInt RHSZero, RHSOne; - APInt Mask = APInt::getAllOnesValue(VT.getScalarType().getSizeInBits()); - DAG.ComputeMaskedBits(N0, Mask, LHSZero, LHSOne); + DAG.ComputeMaskedBits(N0, LHSZero, LHSOne); if (LHSZero.getBoolValue()) { - DAG.ComputeMaskedBits(N1, Mask, RHSZero, RHSOne); + DAG.ComputeMaskedBits(N1, RHSZero, RHSOne); // If all possibly-set bits on the LHS are clear on the RHS, return an OR. // If all possibly-set bits on the RHS are clear on the LHS, return an OR. - if ((RHSZero & (~LHSZero & Mask)) == (~LHSZero & Mask) || - (LHSZero & (~RHSZero & Mask)) == (~RHSZero & Mask)) + if ((RHSZero & ~LHSZero) == ~LHSZero || (LHSZero & ~RHSZero) == ~RHSZero) return CombineTo(N, DAG.getNode(ISD::OR, N->getDebugLoc(), VT, N0, N1), DAG.getNode(ISD::CARRY_FALSE, N->getDebugLoc(), MVT::Glue)); @@ -2336,6 +2333,67 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) { ORNode, N0.getOperand(1)); } + // Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B)) + // Only perform this optimization after type legalization and before + // LegalizeVectorOps. LegalizeVectorOps promotes vector operations by + // adding bitcasts. For example (xor v4i32) is promoted to (v2i64), and + // we don't want to undo this promotion. + // We also handle SCALAR_TO_VECTOR because xor/or/and operations are cheaper + // on scalars. + if ((N0.getOpcode() == ISD::BITCAST || N0.getOpcode() == ISD::SCALAR_TO_VECTOR) + && Level == AfterLegalizeVectorOps) { + SDValue In0 = N0.getOperand(0); + SDValue In1 = N1.getOperand(0); + EVT In0Ty = In0.getValueType(); + EVT In1Ty = In1.getValueType(); + // If both incoming values are integers, and the original types are the same. + if (In0Ty.isInteger() && In1Ty.isInteger() && In0Ty == In1Ty) { + SDValue Op = DAG.getNode(N->getOpcode(), N->getDebugLoc(), In0Ty, In0, In1); + SDValue BC = DAG.getNode(N0.getOpcode(), N->getDebugLoc(), VT, Op); + AddToWorkList(Op.getNode()); + return BC; + } + } + + // Xor/and/or are indifferent to the swizzle operation (shuffle of one value). + // Simplify xor/and/or (shuff(A), shuff(B)) -> shuff(op (A,B)) + // If both shuffles use the same mask, and both shuffle within a single + // vector, then it is worthwhile to move the swizzle after the operation. + // The type-legalizer generates this pattern when loading illegal + // vector types from memory. In many cases this allows additional shuffle + // optimizations.
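The swizzle observation in the comment above is the elementwise identity op(shuffle(A, M), shuffle(B, M)) == shuffle(op(A, B), M). A small standalone check of that identity for xor, with arbitrary vector contents and mask:

    #include <cstdio>

    int main() {
      const unsigned A[4] = {0x1, 0x2, 0x3, 0x4};
      const unsigned B[4] = {0x10, 0x20, 0x30, 0x40};
      const int Mask[4] = {2, 0, 3, 1};  // same mask applied to both inputs

      unsigned C[4];  // xor(A, B) with no shuffles
      for (int i = 0; i != 4; ++i)
        C[i] = A[i] ^ B[i];

      bool Same = true;
      for (int i = 0; i != 4; ++i) {
        unsigned PreSwizzle = A[Mask[i]] ^ B[Mask[i]];  // xor(shuf(A), shuf(B))
        unsigned PostSwizzle = C[Mask[i]];              // shuf(xor(A, B))
        Same = Same && (PreSwizzle == PostSwizzle);
      }
      std::printf("swizzle commutes with xor: %s\n", Same ? "yes" : "no");
      return 0;
    }

Moving the shuffle after the operation replaces two shuffles with one, which is why the combine that follows only fires when both inputs use the same mask and each shuffles within a single vector.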
+ if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG && + N0.getOperand(1).getOpcode() == ISD::UNDEF && + N1.getOperand(1).getOpcode() == ISD::UNDEF) { + ShuffleVectorSDNode *SVN0 = cast<ShuffleVectorSDNode>(N0); + ShuffleVectorSDNode *SVN1 = cast<ShuffleVectorSDNode>(N1); + + assert(N0.getOperand(0).getValueType() == N1.getOperand(1).getValueType() && + "Inputs to shuffles are not the same type"); + + unsigned NumElts = VT.getVectorNumElements(); + + // Check that both shuffles use the same mask. The masks are known to be of + // the same length because the result vector type is the same. + bool SameMask = true; + for (unsigned i = 0; i != NumElts; ++i) { + int Idx0 = SVN0->getMaskElt(i); + int Idx1 = SVN1->getMaskElt(i); + if (Idx0 != Idx1) { + SameMask = false; + break; + } + } + + if (SameMask) { + SDValue Op = DAG.getNode(N->getOpcode(), N->getDebugLoc(), VT, + N0.getOperand(0), N1.getOperand(0)); + AddToWorkList(Op.getNode()); + return DAG.getVectorShuffle(VT, N->getDebugLoc(), Op, + DAG.getUNDEF(VT), &SVN0->getMask()[0]); + } + } + return SDValue(); } @@ -3773,8 +3831,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { if (N1C && N0.getOpcode() == ISD::CTLZ && N1C->getAPIntValue() == Log2_32(VT.getSizeInBits())) { APInt KnownZero, KnownOne; - APInt Mask = APInt::getAllOnesValue(VT.getScalarType().getSizeInBits()); - DAG.ComputeMaskedBits(N0.getOperand(0), Mask, KnownZero, KnownOne); + DAG.ComputeMaskedBits(N0.getOperand(0), KnownZero, KnownOne); // If any of the input bits are KnownOne, then the input couldn't be all // zeros, thus the result of the srl will always be zero. @@ -3782,7 +3839,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { // If all of the bits input the to ctlz node are known to be zero, then // the result of the ctlz is "32" and the result of the shift is one. - APInt UnknownBits = ~KnownZero & Mask; + APInt UnknownBits = ~KnownZero; if (UnknownBits == 0) return DAG.getConstant(1, VT); // Otherwise, check to see if there is exactly one bit input to the ctlz. @@ -4298,12 +4355,17 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { // Only do this before legalize for now. if (VT.isVector() && !LegalOperations) { EVT N0VT = N0.getOperand(0).getValueType(); - // We know that the # elements of the results is the same as the - // # elements of the compare (and the # elements of the compare result - // for that matter). Check to see that they are the same size. If so, - // we know that the element size of the sext'd result matches the - // element size of the compare operands. - if (VT.getSizeInBits() == N0VT.getSizeInBits()) + // On some architectures (such as SSE/NEON/etc) the SETCC result type is + // of the same size as the compared operands. Only optimize sext(setcc()) + // if this is the case. + EVT SVT = TLI.getSetCCResultType(N0VT); + + // We know that the # elements of the results is the same as the + // # elements of the compare (and the # elements of the compare result + // for that matter). Check to see that they are the same size. If so, + // we know that the element size of the sext'd result matches the + // element size of the compare operands. 
+ if (VT.getSizeInBits() == SVT.getSizeInBits()) return DAG.getSetCC(N->getDebugLoc(), VT, N0.getOperand(0), N0.getOperand(1), cast<CondCodeSDNode>(N0.getOperand(2))->get()); @@ -4317,11 +4379,13 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { EVT MatchingVectorType = EVT::getVectorVT(*DAG.getContext(), MatchingElementType, N0VT.getVectorNumElements()); - SDValue VsetCC = - DAG.getSetCC(N->getDebugLoc(), MatchingVectorType, N0.getOperand(0), - N0.getOperand(1), - cast<CondCodeSDNode>(N0.getOperand(2))->get()); - return DAG.getSExtOrTrunc(VsetCC, N->getDebugLoc(), VT); + + if (SVT == MatchingVectorType) { + SDValue VsetCC = DAG.getSetCC(N->getDebugLoc(), MatchingVectorType, + N0.getOperand(0), N0.getOperand(1), + cast<CondCodeSDNode>(N0.getOperand(2))->get()); + return DAG.getSExtOrTrunc(VsetCC, N->getDebugLoc(), VT); + } } } @@ -4352,6 +4416,44 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { return SDValue(); } +// isTruncateOf - If N is a truncate of some other value, return true, record +// the value being truncated in Op and which of Op's bits are zero in KnownZero. +// This function computes KnownZero to avoid a duplicated call to +// ComputeMaskedBits in the caller. +static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op, + APInt &KnownZero) { + APInt KnownOne; + if (N->getOpcode() == ISD::TRUNCATE) { + Op = N->getOperand(0); + DAG.ComputeMaskedBits(Op, KnownZero, KnownOne); + return true; + } + + if (N->getOpcode() != ISD::SETCC || N->getValueType(0) != MVT::i1 || + cast<CondCodeSDNode>(N->getOperand(2))->get() != ISD::SETNE) + return false; + + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + assert(Op0.getValueType() == Op1.getValueType()); + + ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0); + ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1); + if (COp0 && COp0->isNullValue()) + Op = Op1; + else if (COp1 && COp1->isNullValue()) + Op = Op0; + else + return false; + + DAG.ComputeMaskedBits(Op, KnownZero, KnownOne); + + if (!(KnownZero | APInt(Op.getValueSizeInBits(), 1)).isAllOnesValue()) + return false; + + return true; +} + SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); @@ -4369,16 +4471,17 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { // (zext (truncate x)) -> (truncate x) // This is valid when the truncated bits of x are already zero. // FIXME: We should extend this to work for vectors too. - if (N0.getOpcode() == ISD::TRUNCATE && !VT.isVector()) { - SDValue Op = N0.getOperand(0); - APInt TruncatedBits - = APInt::getBitsSet(Op.getValueSizeInBits(), - N0.getValueSizeInBits(), - std::min(Op.getValueSizeInBits(), - VT.getSizeInBits())); - APInt KnownZero, KnownOne; - DAG.ComputeMaskedBits(Op, TruncatedBits, KnownZero, KnownOne); - if (TruncatedBits == KnownZero) { + SDValue Op; + APInt KnownZero; + if (!VT.isVector() && isTruncateOf(DAG, N0, Op, KnownZero)) { + APInt TruncatedBits = + (Op.getValueSizeInBits() == N0.getValueSizeInBits()) ? 
+ APInt(Op.getValueSizeInBits(), 0) : + APInt::getBitsSet(Op.getValueSizeInBits(), + N0.getValueSizeInBits(), + std::min(Op.getValueSizeInBits(), + VT.getSizeInBits())); + if (TruncatedBits == (KnownZero & TruncatedBits)) { if (VT.bitsGT(Op.getValueType())) return DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(), VT, Op); if (VT.bitsLT(Op.getValueType())) @@ -5280,7 +5383,8 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { // fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit) // fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit)) // This often reduces constant pool loads. - if ((N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FABS) && + if (((N0.getOpcode() == ISD::FNEG && !TLI.isFNegFree(VT)) || + (N0.getOpcode() == ISD::FABS && !TLI.isFAbsFree(VT))) && N0.getNode()->hasOneUse() && VT.isInteger() && !VT.isVector()) { SDValue NewConv = DAG.getNode(ISD::BITCAST, N0.getDebugLoc(), VT, N0.getOperand(0)); @@ -5667,6 +5771,24 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { if (N0CFP && N1CFP && VT != MVT::ppcf128) return DAG.getNode(ISD::FDIV, N->getDebugLoc(), VT, N0, N1); + // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable. + if (N1CFP && VT != MVT::ppcf128 && DAG.getTarget().Options.UnsafeFPMath) { + // Compute the reciprocal 1.0 / c2. + APFloat N1APF = N1CFP->getValueAPF(); + APFloat Recip(N1APF.getSemantics(), 1); // 1.0 + APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven); + // Only do the transform if the reciprocal is a legal fp immediate that + // isn't too nasty (eg NaN, denormal, ...). + if ((st == APFloat::opOK || st == APFloat::opInexact) && // Not too nasty + (!LegalOperations || + // FIXME: custom lowering of ConstantFP might fail (see e.g. ARM + // backend)... we should handle this gracefully after Legalize. + // TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT) || + TLI.isOperationLegal(llvm::ISD::ConstantFP, VT) || + TLI.isFPImmLegal(Recip, VT))) + return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT, N0, + DAG.getConstantFP(Recip, VT)); + } // (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y) if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, @@ -5931,7 +6053,7 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) { // Transform fneg(bitconvert(x)) -> bitconvert(x^sign) to avoid loading // constant pool values. - if (N0.getOpcode() == ISD::BITCAST && + if (!TLI.isFNegFree(VT) && N0.getOpcode() == ISD::BITCAST && !VT.isVector() && N0.getNode()->hasOneUse() && N0.getOperand(0).getValueType().isInteger()) { @@ -5967,7 +6089,8 @@ SDValue DAGCombiner::visitFABS(SDNode *N) { // Transform fabs(bitconvert(x)) -> bitconvert(x&~sign) to avoid loading // constant pool values. 
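The fdiv-to-fmul rewrite above is gated on UnsafeFPMath because multiplying by a rounded reciprocal is not bit-exact in general; it is exact only when 1/c2 is exactly representable, as for powers of two. A minimal standalone sketch of that distinction (plain C++ with arbitrary constants, not the SelectionDAG API):

    #include <cstdio>
    int main() {
      double x = 3.0;
      // 1/8 is exactly representable, so x/8 and x*(1/8) agree bit for bit.
      std::printf("%d\n", x / 8.0 == x * (1.0 / 8.0));    // prints 1
      // 1/10 is rounded, so the rewrite can perturb the last bit.
      std::printf("%d\n", x / 10.0 == x * (1.0 / 10.0));  // prints 0 on IEEE-754 doubles
      return 0;
    }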
- if (N0.getOpcode() == ISD::BITCAST && N0.getNode()->hasOneUse() && + if (!TLI.isFAbsFree(VT) && + N0.getOpcode() == ISD::BITCAST && N0.getNode()->hasOneUse() && N0.getOperand(0).getValueType().isInteger() && !N0.getOperand(0).getValueType().isVector()) { SDValue Int = N0.getOperand(0); @@ -7628,8 +7751,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - assert(N0.getValueType().getVectorNumElements() == NumElts && - "Vector shuffle must be normalized in DAG"); + assert(N0.getValueType() == VT && "Vector shuffle must be normalized in DAG"); // Canonicalize shuffle undef, undef -> undef if (N0.getOpcode() == ISD::UNDEF && N1.getOpcode() == ISD::UNDEF) @@ -7654,12 +7776,13 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { SmallVector<int, 8> NewMask; for (unsigned i = 0; i != NumElts; ++i) { int Idx = SVN->getMaskElt(i); - if (Idx < 0) - NewMask.push_back(Idx); - else if (Idx < (int)NumElts) - NewMask.push_back(Idx + NumElts); - else - NewMask.push_back(Idx - NumElts); + if (Idx >= 0) { + if (Idx < (int)NumElts) + Idx += NumElts; + else + Idx -= NumElts; + } + NewMask.push_back(Idx); } return DAG.getVectorShuffle(VT, N->getDebugLoc(), N1, DAG.getUNDEF(VT), &NewMask[0]); @@ -7721,6 +7844,40 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { return N0; } } + + // If this shuffle node is simply a swizzle of another shuffle node, + // and it reverses the swizzle of the previous shuffle then we can + // optimize shuffle(shuffle(x, undef), undef) -> x. + if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG && + N1.getOpcode() == ISD::UNDEF) { + + ShuffleVectorSDNode *OtherSV = cast<ShuffleVectorSDNode>(N0); + + // Shuffle nodes can only reverse shuffles with a single non-undef value. + if (N0.getOperand(1).getOpcode() != ISD::UNDEF) + return SDValue(); + + // The incoming shuffle must be of the same type as the result of the + // current shuffle. + assert(OtherSV->getOperand(0).getValueType() == VT && + "Shuffle types don't match"); + + for (unsigned i = 0; i != NumElts; ++i) { + int Idx = SVN->getMaskElt(i); + assert(Idx < (int)NumElts && "Index references undef operand"); + // Next, this index comes from the first value, which is the incoming + // shuffle. Adopt the incoming index. + if (Idx >= 0) + Idx = OtherSV->getMaskElt(Idx); + + // The combined shuffle must map each index to itself. 
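The swizzle-reversal combine above composes the outer mask with the incoming one and requires the result to be the identity. The same composition, restated outside the DAG with hypothetical 4-element masks (-1 standing in for an undef lane, as in ShuffleVectorSDNode):

    #include <cstdio>
    int main() {
      int inner[4] = {2, 3, 0, 1};  // mask of the incoming shuffle
      int outer[4] = {2, 3, 0, 1};  // outer mask that undoes the swizzle
      for (int i = 0; i < 4; ++i) {
        // Adopt the incoming index, exactly as the combine's loop does.
        int idx = outer[i] < 0 ? -1 : inner[outer[i]];
        std::printf("element %d maps to %d\n", i, idx);  // i maps to i: identity
      }
      return 0;
    }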
+ if (Idx >= 0 && (unsigned)Idx != i) + return SDValue(); + } + + return OtherSV->getOperand(0); + } + return SDValue(); } @@ -7796,7 +7953,8 @@ SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) { SDValue Elt = RHS.getOperand(i); if (!isa<ConstantSDNode>(Elt)) return SDValue(); - else if (cast<ConstantSDNode>(Elt)->isAllOnesValue()) + + if (cast<ConstantSDNode>(Elt)->isAllOnesValue()) Indices.push_back(i); else if (cast<ConstantSDNode>(Elt)->isNullValue()) Indices.push_back(NumElts); @@ -7991,8 +8149,8 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS, if ((LLD->hasAnyUseOfValue(1) && (LLD->isPredecessorOf(CondLHS) || LLD->isPredecessorOf(CondRHS))) || - (LLD->hasAnyUseOfValue(1) && - (LLD->isPredecessorOf(CondLHS) || LLD->isPredecessorOf(CondRHS)))) + (RLD->hasAnyUseOfValue(1) && + (RLD->isPredecessorOf(CondLHS) || RLD->isPredecessorOf(CondRHS)))) return false; Addr = DAG.getNode(ISD::SELECT_CC, TheSelect->getDebugLoc(), diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp index 9f4a44a..0c1ac69 100644 --- a/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -395,6 +395,13 @@ bool FastISel::SelectBinaryOp(const User *I, unsigned ISDOpcode) { ISDOpcode = ISD::SRA; } + // Transform "urem x, pow2" -> "and x, pow2-1". + if (ISDOpcode == ISD::UREM && isa<BinaryOperator>(I) && + isPowerOf2_64(Imm)) { + --Imm; + ISDOpcode = ISD::AND; + } + unsigned ResultReg = FastEmit_ri_(VT.getSimpleVT(), ISDOpcode, Op0, Op0IsKill, Imm, VT.getSimpleVT()); if (ResultReg == 0) return false; @@ -592,7 +599,18 @@ bool FastISel::SelectCall(const User *I) { if (!Reg) Reg = lookUpRegForValue(Address); - if (!Reg && isa<Instruction>(Address) && + // If we have a VLA that has a "use" in a metadata node that's then used + // here but it has no other uses, then we have a problem. E.g., + // + // int foo (const int *x) { + // char a[*x]; + // return 0; + // } + // + // If we assign 'a' a vreg and fast isel later on has to use the selection + // DAG isel, it will want to copy the value to the vreg. However, there are + // no uses, which goes counter to what selection DAG isel expects. + if (!Reg && !Address->use_empty() && isa<Instruction>(Address) && (!isa<AllocaInst>(Address) || !FuncInfo.StaticAllocaMap.count(cast<AllocaInst>(Address)))) Reg = FuncInfo.InitializeRegForValue(Address); @@ -803,8 +821,11 @@ FastISel::SelectInstruction(const Instruction *I) { /// the CFG. void FastISel::FastEmitBranch(MachineBasicBlock *MSucc, DebugLoc DL) { - if (FuncInfo.MBB->isLayoutSuccessor(MSucc)) { - // The unconditional fall-through case, which needs no instructions. + + if (FuncInfo.MBB->getBasicBlock()->size() > 1 && FuncInfo.MBB->isLayoutSuccessor(MSucc)) { + // For more accurate line information if this is the only instruction + // in the block then emit it, otherwise we have the unconditional + // fall-through case, which needs no instructions. } else { // The unconditional branch case. 
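The FastISel change above turns "urem x, pow2" into "and x, pow2-1"; the identity holds only for unsigned remainder, which is why only ISD::UREM is matched. A quick standalone check with arbitrary values:

    #include <cstdio>
    int main() {
      unsigned x = 1234567u, pow2 = 64u;                 // any power of two
      std::printf("%u %u\n", x % pow2, x & (pow2 - 1));  // identical results
      int s = -7;
      std::printf("%d %d\n", s % 8, s & 7);              // -7 vs 1: signed rem is excluded
      return 0;
    }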
TII.InsertBranch(*FuncInfo.MBB, MSucc, NULL, diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 1b84b13..a96a997 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -718,10 +718,15 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { case ISD::INTRINSIC_W_CHAIN: case ISD::INTRINSIC_WO_CHAIN: case ISD::INTRINSIC_VOID: - case ISD::VAARG: case ISD::STACKSAVE: Action = TLI.getOperationAction(Node->getOpcode(), MVT::Other); break; + case ISD::VAARG: + Action = TLI.getOperationAction(Node->getOpcode(), + Node->getValueType(0)); + if (Action != TargetLowering::Promote) + Action = TLI.getOperationAction(Node->getOpcode(), MVT::Other); + break; case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: case ISD::EXTRACT_VECTOR_ELT: @@ -1762,11 +1767,6 @@ SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) { // and leave the Hi part unset. SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned) { - // The input chain to this libcall is the entry node of the function. - // Legalizing the call will automatically add the previous call to the - // dependence. - SDValue InChain = DAG.getEntryNode(); - TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i) { @@ -1782,9 +1782,19 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext()); + // By default, the input chain to this libcall is the entry node of the + // function. If the libcall is going to be emitted as a tail call then + // TLI.isUsedByReturnOnly will change it to the right chain if the return + // node which is being folded has a non-entry input chain. + SDValue InChain = DAG.getEntryNode(); + // isTailCall may be true since the callee does not reference caller stack // frame. Check if it's in the right position. - bool isTailCall = isInTailCallPosition(DAG, Node, TLI); + SDValue TCChain = InChain; + bool isTailCall = isInTailCallPosition(DAG, Node, TCChain, TLI); + if (isTailCall) + InChain = TCChain; + std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false, 0, TLI.getLibcallCallingConv(LC), isTailCall, @@ -1820,7 +1830,7 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT, Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); std::pair<SDValue,SDValue> CallInfo = TLI.LowerCallTo(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false, - false, 0, TLI.getLibcallCallingConv(LC), false, + false, 0, TLI.getLibcallCallingConv(LC), /*isTailCall=*/false, /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, Callee, Args, DAG, dl); @@ -3528,6 +3538,33 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { Node->getOpcode() == ISD::SINT_TO_FP, dl); Results.push_back(Tmp1); break; + case ISD::VAARG: { + SDValue Chain = Node->getOperand(0); // Get the chain. + SDValue Ptr = Node->getOperand(1); // Get the pointer. 
+ + unsigned TruncOp; + if (OVT.isVector()) { + TruncOp = ISD::BITCAST; + } else { + assert(OVT.isInteger() + && "VAARG promotion is supported only for vectors or integer types"); + TruncOp = ISD::TRUNCATE; + } + + // Perform the larger operation, then convert back + Tmp1 = DAG.getVAArg(NVT, dl, Chain, Ptr, Node->getOperand(2), + Node->getConstantOperandVal(3)); + Chain = Tmp1.getValue(1); + + Tmp2 = DAG.getNode(TruncOp, dl, OVT, Tmp1); + + // Modified the chain result - switch anything that used the old chain to + // use the new one. + DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), Tmp2); + DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Chain); + ReplacedNode(Node); + break; + } case ISD::AND: case ISD::OR: case ISD::XOR: { @@ -3601,6 +3638,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { break; } case ISD::FDIV: + case ISD::FREM: case ISD::FPOW: { Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0)); Tmp2 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(1)); diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 41506d1..95ddb1e 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -1362,7 +1362,7 @@ ExpandShiftWithKnownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) { APInt HighBitMask = APInt::getHighBitsSet(ShBits, ShBits - Log2_32(NVTBits)); APInt KnownZero, KnownOne; - DAG.ComputeMaskedBits(N->getOperand(1), HighBitMask, KnownZero, KnownOne); + DAG.ComputeMaskedBits(N->getOperand(1), KnownZero, KnownOne); // If we don't know anything about the high bits, exit. if (((KnownZero|KnownOne) & HighBitMask) == 0) diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 69c2100..e866445 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -521,6 +521,7 @@ private: SDValue ScalarizeVecRes_LOAD(LoadSDNode *N); SDValue ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N); SDValue ScalarizeVecRes_SIGN_EXTEND_INREG(SDNode *N); + SDValue ScalarizeVecRes_VSELECT(SDNode *N); SDValue ScalarizeVecRes_SELECT(SDNode *N); SDValue ScalarizeVecRes_SELECT_CC(SDNode *N); SDValue ScalarizeVecRes_SETCC(SDNode *N); diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 3ae8345..9fe4480 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -417,7 +417,8 @@ SDValue VectorLegalizer::ExpandVSELECT(SDValue Op) { Op1 = DAG.getNode(ISD::AND, DL, VT, Op1, Mask); Op2 = DAG.getNode(ISD::AND, DL, VT, Op2, NotMask); - return DAG.getNode(ISD::OR, DL, VT, Op1, Op2); + SDValue Val = DAG.getNode(ISD::OR, DL, VT, Op1, Op2); + return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Val); } SDValue VectorLegalizer::ExpandUINT_TO_FLOAT(SDValue Op) { diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index a8aee12..5f23f01 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -58,6 +58,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::LOAD: R = ScalarizeVecRes_LOAD(cast<LoadSDNode>(N));break; case ISD::SCALAR_TO_VECTOR: R = ScalarizeVecRes_SCALAR_TO_VECTOR(N); break; case ISD::SIGN_EXTEND_INREG: R = ScalarizeVecRes_InregOp(N); break; + case ISD::VSELECT: R = ScalarizeVecRes_VSELECT(N); break; case 
ISD::SELECT: R = ScalarizeVecRes_SELECT(N); break; case ISD::SELECT_CC: R = ScalarizeVecRes_SELECT_CC(N); break; case ISD::SETCC: R = ScalarizeVecRes_SETCC(N); break; @@ -226,6 +227,37 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N) { return InOp; } +SDValue DAGTypeLegalizer::ScalarizeVecRes_VSELECT(SDNode *N) { + SDValue Cond = GetScalarizedVector(N->getOperand(0)); + SDValue LHS = GetScalarizedVector(N->getOperand(1)); + TargetLowering::BooleanContent ScalarBool = TLI.getBooleanContents(false); + TargetLowering::BooleanContent VecBool = TLI.getBooleanContents(true); + if (ScalarBool != VecBool) { + EVT CondVT = Cond.getValueType(); + switch (ScalarBool) { + case TargetLowering::UndefinedBooleanContent: + break; + case TargetLowering::ZeroOrOneBooleanContent: + assert(VecBool == TargetLowering::UndefinedBooleanContent || + VecBool == TargetLowering::ZeroOrNegativeOneBooleanContent); + // Vector read from all ones, scalar expects a single 1 so mask. + Cond = DAG.getNode(ISD::AND, N->getDebugLoc(), CondVT, + Cond, DAG.getConstant(1, CondVT)); + break; + case TargetLowering::ZeroOrNegativeOneBooleanContent: + assert(VecBool == TargetLowering::UndefinedBooleanContent || + VecBool == TargetLowering::ZeroOrOneBooleanContent); + // Vector reads from a one, scalar from all ones so sign extend. + Cond = DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(), CondVT, + Cond, DAG.getValueType(MVT::i1)); + break; + } + } + return DAG.getNode(ISD::SELECT, N->getDebugLoc(), + LHS.getValueType(), Cond, LHS, + GetScalarizedVector(N->getOperand(2))); +} + SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT(SDNode *N) { SDValue LHS = GetScalarizedVector(N->getOperand(1)); return DAG.getNode(ISD::SELECT, N->getDebugLoc(), diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index f44adfc..2cb5d37 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -1587,6 +1587,7 @@ protected: std::vector<SUnit*> Queue; unsigned CurQueueId; bool TracksRegPressure; + bool SrcOrder; // SUnits - The SUnits for the current graph. std::vector<SUnit> *SUnits; @@ -1612,11 +1613,12 @@ public: RegReductionPQBase(MachineFunction &mf, bool hasReadyFilter, bool tracksrp, + bool srcorder, const TargetInstrInfo *tii, const TargetRegisterInfo *tri, const TargetLowering *tli) : SchedulingPriorityQueue(hasReadyFilter), - CurQueueId(0), TracksRegPressure(tracksrp), + CurQueueId(0), TracksRegPressure(tracksrp), SrcOrder(srcorder), MF(mf), TII(tii), TRI(tri), TLI(tli), scheduleDAG(NULL) { if (TracksRegPressure) { unsigned NumRC = TRI->getNumRegClasses(); @@ -1731,10 +1733,12 @@ class RegReductionPriorityQueue : public RegReductionPQBase { public: RegReductionPriorityQueue(MachineFunction &mf, bool tracksrp, + bool srcorder, const TargetInstrInfo *tii, const TargetRegisterInfo *tri, const TargetLowering *tli) - : RegReductionPQBase(mf, SF::HasReadyFilter, tracksrp, tii, tri, tli), + : RegReductionPQBase(mf, SF::HasReadyFilter, tracksrp, srcorder, + tii, tri, tli), Picker(this) {} bool isBottomUp() const { return SF::IsBottomUp; } @@ -2625,7 +2629,7 @@ void RegReductionPQBase::initNodes(std::vector<SUnit> &sunits) { if (!Disable2AddrHack) AddPseudoTwoAddrDeps(); // Reroute edges to nodes with multiple uses. - if (!TracksRegPressure) + if (!TracksRegPressure && !SrcOrder) PrescheduleNodesWithMultipleUses(); // Calculate node priorities. 
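The VSELECT scalarization above must translate between the target's vector and scalar encodings of true. A toy restatement of the two conversions it emits, using plain integers as stand-ins for the AND-with-one and sign-extend-in-register nodes:

    #include <cstdio>
    int main() {
      int vecLane = -1;             // vector "true" under ZeroOrNegativeOne content
      int scalar = vecLane & 1;     // the emitted AND with 1: scalar 0-or-1 form
      std::printf("%d\n", scalar);  // 1
      int widened = -(scalar & 1);  // sign-extend the low bit: back to all ones
      std::printf("0x%08x\n", (unsigned)widened);  // 0xffffffff
      return 0;
    }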
CalculateSethiUllmanNumbers(); @@ -2948,7 +2952,7 @@ llvm::createBURRListDAGScheduler(SelectionDAGISel *IS, const TargetRegisterInfo *TRI = TM.getRegisterInfo(); BURegReductionPriorityQueue *PQ = - new BURegReductionPriorityQueue(*IS->MF, false, TII, TRI, 0); + new BURegReductionPriorityQueue(*IS->MF, false, false, TII, TRI, 0); ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, false, PQ, OptLevel); PQ->setScheduleDAG(SD); return SD; @@ -2962,7 +2966,7 @@ llvm::createSourceListDAGScheduler(SelectionDAGISel *IS, const TargetRegisterInfo *TRI = TM.getRegisterInfo(); SrcRegReductionPriorityQueue *PQ = - new SrcRegReductionPriorityQueue(*IS->MF, false, TII, TRI, 0); + new SrcRegReductionPriorityQueue(*IS->MF, false, true, TII, TRI, 0); ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, false, PQ, OptLevel); PQ->setScheduleDAG(SD); return SD; @@ -2977,7 +2981,7 @@ llvm::createHybridListDAGScheduler(SelectionDAGISel *IS, const TargetLowering *TLI = &IS->getTargetLowering(); HybridBURRPriorityQueue *PQ = - new HybridBURRPriorityQueue(*IS->MF, true, TII, TRI, TLI); + new HybridBURRPriorityQueue(*IS->MF, true, false, TII, TRI, TLI); ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, true, PQ, OptLevel); PQ->setScheduleDAG(SD); @@ -2993,7 +2997,7 @@ llvm::createILPListDAGScheduler(SelectionDAGISel *IS, const TargetLowering *TLI = &IS->getTargetLowering(); ILPBURRPriorityQueue *PQ = - new ILPBURRPriorityQueue(*IS->MF, true, TII, TRI, TLI); + new ILPBURRPriorityQueue(*IS->MF, true, false, TII, TRI, TLI); ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, true, PQ, OptLevel); PQ->setScheduleDAG(SD); return SD; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index e3a7305..92671d1 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -62,6 +62,7 @@ static SDVTList makeVTList(const EVT *VTs, unsigned NumVTs) { static const fltSemantics *EVTToAPFloatSemantics(EVT VT) { switch (VT.getSimpleVT().SimpleTy) { default: llvm_unreachable("Unknown FP format"); + case MVT::f16: return &APFloat::IEEEhalf; case MVT::f32: return &APFloat::IEEEsingle; case MVT::f64: return &APFloat::IEEEdouble; case MVT::f80: return &APFloat::x87DoubleExtended; @@ -1042,7 +1043,7 @@ SDValue SelectionDAG::getConstantFP(double Val, EVT VT, bool isTarget) { return getConstantFP(APFloat((float)Val), VT, isTarget); else if (EltVT==MVT::f64) return getConstantFP(APFloat(Val), VT, isTarget); - else if (EltVT==MVT::f80 || EltVT==MVT::f128) { + else if (EltVT==MVT::f80 || EltVT==MVT::f128 || EltVT==MVT::f16) { bool ignored; APFloat apf = APFloat(Val); apf.convert(*EVTToAPFloatSemantics(EltVT), APFloat::rmNearestTiesToEven, @@ -1627,7 +1628,7 @@ bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const { bool SelectionDAG::MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth) const { APInt KnownZero, KnownOne; - ComputeMaskedBits(Op, Mask, KnownZero, KnownOne, Depth); + ComputeMaskedBits(Op, KnownZero, KnownOne, Depth); assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); return (KnownZero & Mask) == Mask; } @@ -1636,15 +1637,12 @@ bool SelectionDAG::MaskedValueIsZero(SDValue Op, const APInt &Mask, /// known to be either zero or one and return them in the KnownZero/KnownOne /// bitsets. This code only analyzes bits in Mask, in order to short-circuit /// processing. 
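Ahead of the rewritten ComputeMaskedBits below, a toy restatement of its bookkeeping, assuming 8-bit values for brevity: a constant has every bit known, and a bit must never be known to be both zero and one (the recurring assertion):

    #include <cstdio>
    #include <cstdint>
    int main() {
      // A constant has every bit known: KnownOne = C, KnownZero = ~C.
      uint8_t C = 0xB2;
      uint8_t KnownOne = C, KnownZero = (uint8_t)~C;
      std::printf("%d\n", (KnownOne & KnownZero) == 0);  // the asserted invariant
      // AND: a result bit is known zero if either operand has it known zero.
      uint8_t KnownZeroRHS = 0x0F;
      std::printf("0x%02x\n", (unsigned)(uint8_t)(KnownZero | KnownZeroRHS));
      return 0;
    }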
-void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask, - APInt &KnownZero, APInt &KnownOne, - unsigned Depth) const { - unsigned BitWidth = Mask.getBitWidth(); - assert(BitWidth == Op.getValueType().getScalarType().getSizeInBits() && - "Mask size mismatches value type size!"); +void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero, + APInt &KnownOne, unsigned Depth) const { + unsigned BitWidth = Op.getValueType().getScalarType().getSizeInBits(); KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything. - if (Depth == 6 || Mask == 0) + if (Depth == 6) return; // Limit search depth. APInt KnownZero2, KnownOne2; @@ -1652,14 +1650,13 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask, switch (Op.getOpcode()) { case ISD::Constant: // We know all of the bits for a constant! - KnownOne = cast<ConstantSDNode>(Op)->getAPIntValue() & Mask; - KnownZero = ~KnownOne & Mask; + KnownOne = cast<ConstantSDNode>(Op)->getAPIntValue(); + KnownZero = ~KnownOne; return; case ISD::AND: // If either the LHS or the RHS are Zero, the result is zero. - ComputeMaskedBits(Op.getOperand(1), Mask, KnownZero, KnownOne, Depth+1); - ComputeMaskedBits(Op.getOperand(0), Mask & ~KnownZero, - KnownZero2, KnownOne2, Depth+1); + ComputeMaskedBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1); + ComputeMaskedBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); @@ -1669,9 +1666,8 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask, KnownZero |= KnownZero2; return; case ISD::OR: - ComputeMaskedBits(Op.getOperand(1), Mask, KnownZero, KnownOne, Depth+1); - ComputeMaskedBits(Op.getOperand(0), Mask & ~KnownOne, - KnownZero2, KnownOne2, Depth+1); + ComputeMaskedBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1); + ComputeMaskedBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); @@ -1681,8 +1677,8 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask, KnownOne |= KnownOne2; return; case ISD::XOR: { - ComputeMaskedBits(Op.getOperand(1), Mask, KnownZero, KnownOne, Depth+1); - ComputeMaskedBits(Op.getOperand(0), Mask, KnownZero2, KnownOne2, Depth+1); + ComputeMaskedBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1); + ComputeMaskedBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); @@ -1694,9 +1690,8 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask, return; } case ISD::MUL: { - APInt Mask2 = APInt::getAllOnesValue(BitWidth); - ComputeMaskedBits(Op.getOperand(1), Mask2, KnownZero, KnownOne, Depth+1); - ComputeMaskedBits(Op.getOperand(0), Mask2, KnownZero2, KnownOne2, Depth+1); + ComputeMaskedBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1); + ComputeMaskedBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); @@ -1715,33 +1710,29 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask, LeadZ = std::min(LeadZ, BitWidth); KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ) | APInt::getHighBitsSet(BitWidth, LeadZ); - 
KnownZero &= Mask; return; } case ISD::UDIV: { // For the purposes of computing leading zeros we can conservatively // treat a udiv as a logical right shift by the power of 2 known to // be less than the denominator. - APInt AllOnes = APInt::getAllOnesValue(BitWidth); - ComputeMaskedBits(Op.getOperand(0), - AllOnes, KnownZero2, KnownOne2, Depth+1); + ComputeMaskedBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); unsigned LeadZ = KnownZero2.countLeadingOnes(); KnownOne2.clearAllBits(); KnownZero2.clearAllBits(); - ComputeMaskedBits(Op.getOperand(1), - AllOnes, KnownZero2, KnownOne2, Depth+1); + ComputeMaskedBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1); unsigned RHSUnknownLeadingOnes = KnownOne2.countLeadingZeros(); if (RHSUnknownLeadingOnes != BitWidth) LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSUnknownLeadingOnes - 1); - KnownZero = APInt::getHighBitsSet(BitWidth, LeadZ) & Mask; + KnownZero = APInt::getHighBitsSet(BitWidth, LeadZ); return; } case ISD::SELECT: - ComputeMaskedBits(Op.getOperand(2), Mask, KnownZero, KnownOne, Depth+1); - ComputeMaskedBits(Op.getOperand(1), Mask, KnownZero2, KnownOne2, Depth+1); + ComputeMaskedBits(Op.getOperand(2), KnownZero, KnownOne, Depth+1); + ComputeMaskedBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1); assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); @@ -1750,8 +1741,8 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask, KnownZero &= KnownZero2; return; case ISD::SELECT_CC: - ComputeMaskedBits(Op.getOperand(3), Mask, KnownZero, KnownOne, Depth+1); - ComputeMaskedBits(Op.getOperand(2), Mask, KnownZero2, KnownOne2, Depth+1); + ComputeMaskedBits(Op.getOperand(3), KnownZero, KnownOne, Depth+1); + ComputeMaskedBits(Op.getOperand(2), KnownZero2, KnownOne2, Depth+1); assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); @@ -1783,8 +1774,7 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask, if (ShAmt >= BitWidth) return; - ComputeMaskedBits(Op.getOperand(0), Mask.lshr(ShAmt), - KnownZero, KnownOne, Depth+1); + ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); KnownZero <<= ShAmt; KnownOne <<= ShAmt; @@ -1801,13 +1791,12 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask, if (ShAmt >= BitWidth) return; - ComputeMaskedBits(Op.getOperand(0), (Mask << ShAmt), - KnownZero, KnownOne, Depth+1); + ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); KnownZero = KnownZero.lshr(ShAmt); KnownOne = KnownOne.lshr(ShAmt); - APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt) & Mask; + APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt); KnownZero |= HighBits; // High bits known zero. } return; @@ -1819,15 +1808,11 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask, if (ShAmt >= BitWidth) return; - APInt InDemandedMask = (Mask << ShAmt); // If any of the demanded bits are produced by the sign extension, we also // demand the input sign bit. 
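The SRL case above shifts the known bits right and then marks the vacated high bits as known zero, which is sound because a logical shift always fills with zeros. A standalone check of that fact on an arbitrary 32-bit value:

    #include <cstdio>
    #include <cstdint>
    int main() {
      uint32_t x = 0xDEADBEEFu;
      uint32_t y = x >> 8;  // logical shift right fills with zeros
      std::printf("%d\n", (y & 0xFF000000u) == 0u);  // 1: high 8 bits known zero
      return 0;
    }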
- APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt) & Mask; - if (HighBits.getBoolValue()) - InDemandedMask |= APInt::getSignBit(BitWidth); + APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt); - ComputeMaskedBits(Op.getOperand(0), InDemandedMask, KnownZero, KnownOne, - Depth+1); + ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); KnownZero = KnownZero.lshr(ShAmt); KnownOne = KnownOne.lshr(ShAmt); @@ -1849,10 +1834,10 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask, // Sign extension. Compute the demanded bits in the result that are not // present in the input. - APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - EBits) & Mask; + APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - EBits); APInt InSignBit = APInt::getSignBit(EBits); - APInt InputDemandedBits = Mask & APInt::getLowBitsSet(BitWidth, EBits); + APInt InputDemandedBits = APInt::getLowBitsSet(BitWidth, EBits); // If the sign extended bits are demanded, we know that the sign // bit is demanded. @@ -1860,8 +1845,9 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask, if (NewBits.getBoolValue()) InputDemandedBits |= InSignBit; - ComputeMaskedBits(Op.getOperand(0), InputDemandedBits, - KnownZero, KnownOne, Depth+1); + ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); + KnownOne &= InputDemandedBits; + KnownZero &= InputDemandedBits; assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); // If the sign bit of the input is known set or clear, then we know the @@ -1889,22 +1875,23 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask, return; } case ISD::LOAD: { + LoadSDNode *LD = cast<LoadSDNode>(Op); if (ISD::isZEXTLoad(Op.getNode())) { - LoadSDNode *LD = cast<LoadSDNode>(Op); EVT VT = LD->getMemoryVT(); unsigned MemBits = VT.getScalarType().getSizeInBits(); - KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits) & Mask; + KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); + } else if (const MDNode *Ranges = LD->getRanges()) { + computeMaskedBitsLoad(*Ranges, KnownZero); } return; } case ISD::ZERO_EXTEND: { EVT InVT = Op.getOperand(0).getValueType(); unsigned InBits = InVT.getScalarType().getSizeInBits(); - APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - InBits) & Mask; - APInt InMask = Mask.trunc(InBits); + APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - InBits); KnownZero = KnownZero.trunc(InBits); KnownOne = KnownOne.trunc(InBits); - ComputeMaskedBits(Op.getOperand(0), InMask, KnownZero, KnownOne, Depth+1); + ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); KnownZero = KnownZero.zext(BitWidth); KnownOne = KnownOne.zext(BitWidth); KnownZero |= NewBits; @@ -1914,17 +1901,11 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask, EVT InVT = Op.getOperand(0).getValueType(); unsigned InBits = InVT.getScalarType().getSizeInBits(); APInt InSignBit = APInt::getSignBit(InBits); - APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - InBits) & Mask; - APInt InMask = Mask.trunc(InBits); - - // If any of the sign extended bits are demanded, we know that the sign - // bit is demanded. Temporarily set this bit in the mask for our callee. 
- if (NewBits.getBoolValue()) - InMask |= InSignBit; + APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - InBits); KnownZero = KnownZero.trunc(InBits); KnownOne = KnownOne.trunc(InBits); - ComputeMaskedBits(Op.getOperand(0), InMask, KnownZero, KnownOne, Depth+1); + ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); // Note if the sign bit is known to be zero or one. bool SignBitKnownZero = KnownZero.isNegative(); @@ -1932,13 +1913,6 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask, assert(!(SignBitKnownZero && SignBitKnownOne) && "Sign bit can't be known to be both zero and one!"); - // If the sign bit wasn't actually demanded by our caller, we don't - // want it set in the KnownZero and KnownOne result values. Reset the - // mask and reapply it to the result values. - InMask = Mask.trunc(InBits); - KnownZero &= InMask; - KnownOne &= InMask; - KnownZero = KnownZero.zext(BitWidth); KnownOne = KnownOne.zext(BitWidth); @@ -1952,10 +1926,9 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask, case ISD::ANY_EXTEND: { EVT InVT = Op.getOperand(0).getValueType(); unsigned InBits = InVT.getScalarType().getSizeInBits(); - APInt InMask = Mask.trunc(InBits); KnownZero = KnownZero.trunc(InBits); KnownOne = KnownOne.trunc(InBits); - ComputeMaskedBits(Op.getOperand(0), InMask, KnownZero, KnownOne, Depth+1); + ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); KnownZero = KnownZero.zext(BitWidth); KnownOne = KnownOne.zext(BitWidth); return; @@ -1963,10 +1936,9 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask, case ISD::TRUNCATE: { EVT InVT = Op.getOperand(0).getValueType(); unsigned InBits = InVT.getScalarType().getSizeInBits(); - APInt InMask = Mask.zext(InBits); KnownZero = KnownZero.zext(InBits); KnownOne = KnownOne.zext(InBits); - ComputeMaskedBits(Op.getOperand(0), InMask, KnownZero, KnownOne, Depth+1); + ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); KnownZero = KnownZero.trunc(BitWidth); KnownOne = KnownOne.trunc(BitWidth); @@ -1975,9 +1947,8 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask, case ISD::AssertZext: { EVT VT = cast<VTSDNode>(Op.getOperand(1))->getVT(); APInt InMask = APInt::getLowBitsSet(BitWidth, VT.getSizeInBits()); - ComputeMaskedBits(Op.getOperand(0), Mask & InMask, KnownZero, - KnownOne, Depth+1); - KnownZero |= (~InMask) & Mask; + ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); + KnownZero |= (~InMask); return; } case ISD::FGETSIGN: @@ -1994,8 +1965,7 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask, unsigned NLZ = (CLHS->getAPIntValue()+1).countLeadingZeros(); // NLZ can't be BitWidth with no sign bit APInt MaskV = APInt::getHighBitsSet(BitWidth, NLZ+1); - ComputeMaskedBits(Op.getOperand(1), MaskV, KnownZero2, KnownOne2, - Depth+1); + ComputeMaskedBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1); // If all of the MaskV bits are known to be zero, then we know the // output top bits are zero, because we now know that the output is @@ -2003,7 +1973,7 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask, if ((KnownZero2 & MaskV) == MaskV) { unsigned NLZ2 = CLHS->getAPIntValue().countLeadingZeros(); // Top bits known zero. 
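The sign-extension cases above all rest on one fact: once the narrow sign bit is known, every widened top bit is known to match it. A quick demonstration with an arbitrary 8-to-32-bit widening:

    #include <cstdio>
    #include <cstdint>
    int main() {
      int8_t narrow = -0x40;  // sign bit set
      int32_t wide = narrow;  // SIGN_EXTEND
      std::printf("0x%08x\n", (unsigned)(uint32_t)wide);  // 0xffffffc0: top bits known one
      narrow = 0x40;          // sign bit clear
      wide = narrow;
      std::printf("0x%08x\n", (unsigned)(uint32_t)wide);  // 0x00000040: top bits known zero
      return 0;
    }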
- KnownZero = APInt::getHighBitsSet(BitWidth, NLZ2) & Mask; + KnownZero = APInt::getHighBitsSet(BitWidth, NLZ2); } } } @@ -2014,13 +1984,11 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask, // Output known-0 bits are known if clear or set in both the low clear bits // common to both LHS & RHS. For example, 8+(X<<3) is known to have the // low 3 bits clear. - APInt Mask2 = APInt::getLowBitsSet(BitWidth, - BitWidth - Mask.countLeadingZeros()); - ComputeMaskedBits(Op.getOperand(0), Mask2, KnownZero2, KnownOne2, Depth+1); + ComputeMaskedBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); unsigned KnownZeroOut = KnownZero2.countTrailingOnes(); - ComputeMaskedBits(Op.getOperand(1), Mask2, KnownZero2, KnownOne2, Depth+1); + ComputeMaskedBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1); assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); KnownZeroOut = std::min(KnownZeroOut, KnownZero2.countTrailingOnes()); @@ -2044,7 +2012,7 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask, if (RA.isPowerOf2()) { APInt LowBits = RA - 1; APInt Mask2 = LowBits | APInt::getSignBit(BitWidth); - ComputeMaskedBits(Op.getOperand(0), Mask2,KnownZero2,KnownOne2,Depth+1); + ComputeMaskedBits(Op.getOperand(0), KnownZero2,KnownOne2,Depth+1); // The low bits of the first operand are unchanged by the srem. KnownZero = KnownZero2 & LowBits; @@ -2059,10 +2027,6 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask, // the upper bits are all one. if (KnownOne2[BitWidth-1] && ((KnownOne2 & LowBits) != 0)) KnownOne |= ~LowBits; - - KnownZero &= Mask; - KnownOne &= Mask; - assert((KnownZero & KnownOne) == 0&&"Bits known to be one AND zero?"); } } @@ -2072,9 +2036,8 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask, const APInt &RA = Rem->getAPIntValue(); if (RA.isPowerOf2()) { APInt LowBits = (RA - 1); - APInt Mask2 = LowBits & Mask; - KnownZero |= ~LowBits & Mask; - ComputeMaskedBits(Op.getOperand(0), Mask2, KnownZero, KnownOne,Depth+1); + KnownZero |= ~LowBits; + ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne,Depth+1); assert((KnownZero & KnownOne) == 0&&"Bits known to be one AND zero?"); break; } @@ -2082,16 +2045,13 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask, // Since the result is less than or equal to either operand, any leading // zero bits in either operand must also exist in the result. - APInt AllOnes = APInt::getAllOnesValue(BitWidth); - ComputeMaskedBits(Op.getOperand(0), AllOnes, KnownZero, KnownOne, - Depth+1); - ComputeMaskedBits(Op.getOperand(1), AllOnes, KnownZero2, KnownOne2, - Depth+1); + ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); + ComputeMaskedBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1); uint32_t Leaders = std::max(KnownZero.countLeadingOnes(), KnownZero2.countLeadingOnes()); KnownOne.clearAllBits(); - KnownZero = APInt::getHighBitsSet(BitWidth, Leaders) & Mask; + KnownZero = APInt::getHighBitsSet(BitWidth, Leaders); return; } case ISD::FrameIndex: @@ -2111,8 +2071,7 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask, case ISD::INTRINSIC_W_CHAIN: case ISD::INTRINSIC_VOID: // Allow the target to implement this method for its nodes. 
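The SREM and UREM cases above encode two remainder facts: an unsigned remainder by a power of two clears all high bits, and a signed remainder by a power of two leaves the operand's low bits unchanged. A standalone check (C++ '%' truncates toward zero, matching ISD::SREM):

    #include <cstdio>
    #include <cstdint>
    int main() {
      uint32_t x = 0x12345678u;
      std::printf("%d\n", (x % 16u) == (x & 15u));       // UREM: high bits known zero
      int32_t s = -52;
      std::printf("%d\n", ((s % 16) & 15) == (s & 15));  // SREM: low bits unchanged
      return 0;
    }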
- TLI.computeMaskedBitsForTargetNode(Op, Mask, KnownZero, KnownOne, *this, - Depth); + TLI.computeMaskedBitsForTargetNode(Op, KnownZero, KnownOne, *this, Depth); return; } } @@ -2236,12 +2195,11 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{ if (ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) if (CRHS->isAllOnesValue()) { APInt KnownZero, KnownOne; - APInt Mask = APInt::getAllOnesValue(VTBits); - ComputeMaskedBits(Op.getOperand(0), Mask, KnownZero, KnownOne, Depth+1); + ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); // If the input is known to be 0 or 1, the output is 0/-1, which is all // sign bits set. - if ((KnownZero | APInt(VTBits, 1)) == Mask) + if ((KnownZero | APInt(VTBits, 1)).isAllOnesValue()) return VTBits; // If we are subtracting one from a positive number, there is no carry @@ -2262,11 +2220,10 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{ if (ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(Op.getOperand(0))) if (CLHS->isNullValue()) { APInt KnownZero, KnownOne; - APInt Mask = APInt::getAllOnesValue(VTBits); - ComputeMaskedBits(Op.getOperand(1), Mask, KnownZero, KnownOne, Depth+1); + ComputeMaskedBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1); // If the input is known to be 0 or 1, the output is 0/-1, which is all // sign bits set. - if ((KnownZero | APInt(VTBits, 1)) == Mask) + if ((KnownZero | APInt(VTBits, 1)).isAllOnesValue()) return VTBits; // If the input is known to be positive (the sign bit is known clear), @@ -2315,9 +2272,9 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{ // Finally, if we can prove that the top bits of the result are 0's or 1's, // use this information. APInt KnownZero, KnownOne; - APInt Mask = APInt::getAllOnesValue(VTBits); - ComputeMaskedBits(Op, Mask, KnownZero, KnownOne, Depth); + ComputeMaskedBits(Op, KnownZero, KnownOne, Depth); + APInt Mask; if (KnownZero.isNegative()) { // sign bit is 0 Mask = KnownZero; } else if (KnownOne.isNegative()) { // sign bit is 1; @@ -2471,7 +2428,6 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, case ISD::FABS: V.clearSign(); return getConstantFP(V, VT); - case ISD::FP_ROUND: case ISD::FP_EXTEND: { bool ignored; // This can return overflow, underflow, or inexact; we don't care. @@ -3037,6 +2993,16 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT, default: break; } } + + if (Opcode == ISD::FP_ROUND) { + APFloat V = N1CFP->getValueAPF(); // make copy + bool ignored; + // This can return overflow, underflow, or inexact; we don't care. + // FIXME need to be more flexible about rounding mode. + (void)V.convert(*EVTToAPFloatSemantics(VT), + APFloat::rmNearestTiesToEven, &ignored); + return getConstantFP(V, VT); + } } // Canonicalize an UNDEF to the RHS, even over a constant. 
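The ISD::FP_ROUND constant folding just added converts the APFloat and deliberately ignores the inexact status. The same rounding is visible in a plain C++ narrowing conversion; the value here is chosen one double ULP above 1.0:

    #include <cstdio>
    int main() {
      double d = 1.0000000000000002;   // one double ULP above 1.0
      float f = (float)d;              // the narrowing is inexact; the fold ignores that
      std::printf("%d\n", f == 1.0f);  // prints 1: rounded to the nearest float
      return 0;
    }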
@@ -4170,7 +4136,8 @@ SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, SDValue Ptr, SDValue Offset, MachinePointerInfo PtrInfo, EVT MemVT, bool isVolatile, bool isNonTemporal, bool isInvariant, - unsigned Alignment, const MDNode *TBAAInfo) { + unsigned Alignment, const MDNode *TBAAInfo, + const MDNode *Ranges) { assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); if (Alignment == 0) // Ensure that codegen never sees alignment 0 @@ -4192,7 +4159,7 @@ SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, MachineFunction &MF = getMachineFunction(); MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, Flags, MemVT.getStoreSize(), Alignment, - TBAAInfo); + TBAAInfo, Ranges); return getLoad(AM, ExtType, VT, dl, Chain, Ptr, Offset, MemVT, MMO); } @@ -4248,11 +4215,12 @@ SDValue SelectionDAG::getLoad(EVT VT, DebugLoc dl, MachinePointerInfo PtrInfo, bool isVolatile, bool isNonTemporal, bool isInvariant, unsigned Alignment, - const MDNode *TBAAInfo) { + const MDNode *TBAAInfo, + const MDNode *Ranges) { SDValue Undef = getUNDEF(Ptr.getValueType()); return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr, Undef, - PtrInfo, VT, isVolatile, isNonTemporal, isInvariant, Alignment, - TBAAInfo); + PtrInfo, VT, isVolatile, isNonTemporal, isInvariant, Alignment, + TBAAInfo, Ranges); } SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, DebugLoc dl, EVT VT, @@ -6036,10 +6004,9 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const { int64_t GVOffset = 0; if (TLI.isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) { unsigned PtrWidth = TLI.getPointerTy().getSizeInBits(); - APInt AllOnes = APInt::getAllOnesValue(PtrWidth); APInt KnownZero(PtrWidth, 0), KnownOne(PtrWidth, 0); - llvm::ComputeMaskedBits(const_cast<GlobalValue*>(GV), AllOnes, - KnownZero, KnownOne, TLI.getTargetData()); + llvm::ComputeMaskedBits(const_cast<GlobalValue*>(GV), KnownZero, KnownOne, + TLI.getTargetData()); unsigned AlignBits = KnownZero.countTrailingOnes(); unsigned Align = AlignBits ? 1 << std::min(31U, AlignBits) : 0; if (Align) diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 2ac9655..94cb958 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -2804,11 +2804,11 @@ void SelectionDAGBuilder::visitExtractElement(const User &I) { } // Utility for visitShuffleVector - Return true if every element in Mask, -// begining // from position Pos and ending in Pos+Size, falls within the +// beginning from position Pos and ending in Pos+Size, falls within the // specified sequential range [L, L+Pos), or is undef.
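The utility described in the comment above (and defined next) scans a window of the mask and lets undef entries match anything. A standalone restatement with std::vector in place of SmallVectorImpl and a hypothetical 8-element mask:

    #include <cstdio>
    #include <vector>
    // Same contract as the utility above: undef (-1) entries always match.
    static bool isSequentialInRange(const std::vector<int> &Mask,
                                    unsigned Pos, unsigned Size, int Low) {
      for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
        if (Mask[i] >= 0 && Mask[i] != Low)
          return false;
      return true;
    }
    int main() {
      std::vector<int> Mask = {4, 5, -1, 7, 0, 1, 2, 3};
      std::printf("%d\n", isSequentialInRange(Mask, 0, 4, 4));  // 1: 4,5,undef,7
      std::printf("%d\n", isSequentialInRange(Mask, 4, 4, 0));  // 1: 0,1,2,3
      return 0;
    }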
static bool isSequentialInRange(const SmallVectorImpl<int> &Mask, - int Pos, int Size, int Low) { - for (int i = Pos, e = Pos+Size; i != e; ++i, ++Low) + unsigned Pos, unsigned Size, int Low) { + for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low) if (Mask[i] >= 0 && Mask[i] != Low) return false; return true; @@ -2878,10 +2878,9 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) { SmallVector<int, 8> MappedOps; for (unsigned i = 0; i != MaskNumElts; ++i) { int Idx = Mask[i]; - if (Idx < (int)SrcNumElts) - MappedOps.push_back(Idx); - else - MappedOps.push_back(Idx + MaskNumElts - SrcNumElts); + if (Idx >= (int)SrcNumElts) + Idx -= SrcNumElts - MaskNumElts; + MappedOps.push_back(Idx); } setValue(&I, DAG.getVectorShuffle(VT, getCurDebugLoc(), Src1, Src2, @@ -2893,13 +2892,13 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) { // Analyze the access pattern of the vector to see if we can extract // two subvectors and do the shuffle. The analysis is done by calculating // the range of elements the mask access on both vectors. - int MinRange[2] = { static_cast<int>(SrcNumElts+1), - static_cast<int>(SrcNumElts+1)}; + int MinRange[2] = { static_cast<int>(SrcNumElts), + static_cast<int>(SrcNumElts)}; int MaxRange[2] = {-1, -1}; for (unsigned i = 0; i != MaskNumElts; ++i) { int Idx = Mask[i]; - int Input = 0; + unsigned Input = 0; if (Idx < 0) continue; @@ -2915,35 +2914,31 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) { // Check if the access is smaller than the vector size and can we find // a reasonable extract index. - int RangeUse[2] = { 2, 2 }; // 0 = Unused, 1 = Extract, 2 = Can not - // Extract. + int RangeUse[2] = { -1, -1 }; // 0 = Unused, 1 = Extract, -1 = Can not + // Extract. int StartIdx[2]; // StartIdx to extract from - for (int Input=0; Input < 2; ++Input) { - if (MinRange[Input] == (int)(SrcNumElts+1) && MaxRange[Input] == -1) { + for (unsigned Input = 0; Input < 2; ++Input) { + if (MinRange[Input] >= (int)SrcNumElts && MaxRange[Input] < 0) { RangeUse[Input] = 0; // Unused StartIdx[Input] = 0; - } else if (MaxRange[Input] - MinRange[Input] < (int)MaskNumElts) { - // Fits within range but we should see if we can find a good - // start index that is a multiple of the mask length. - if (MaxRange[Input] < (int)MaskNumElts) { - RangeUse[Input] = 1; // Extract from beginning of the vector - StartIdx[Input] = 0; - } else { - StartIdx[Input] = (MinRange[Input]/MaskNumElts)*MaskNumElts; - if (MaxRange[Input] - StartIdx[Input] < (int)MaskNumElts && - StartIdx[Input] + MaskNumElts <= SrcNumElts) - RangeUse[Input] = 1; // Extract from a multiple of the mask length. - } + continue; } + + // Find a good start index that is a multiple of the mask length. Then + // see if the rest of the elements are in range. + StartIdx[Input] = (MinRange[Input]/MaskNumElts)*MaskNumElts; + if (MaxRange[Input] - StartIdx[Input] < (int)MaskNumElts && + StartIdx[Input] + MaskNumElts <= SrcNumElts) + RangeUse[Input] = 1; // Extract from a multiple of the mask length. } if (RangeUse[0] == 0 && RangeUse[1] == 0) { setValue(&I, DAG.getUNDEF(VT)); // Vectors are not used. return; } - else if (RangeUse[0] < 2 && RangeUse[1] < 2) { + if (RangeUse[0] >= 0 && RangeUse[1] >= 0) { // Extract appropriate subvector and generate a vector shuffle - for (int Input=0; Input < 2; ++Input) { + for (unsigned Input = 0; Input < 2; ++Input) { SDValue &Src = Input == 0 ? 
Src1 : Src2; if (RangeUse[Input] == 0) Src = DAG.getUNDEF(VT); @@ -2956,12 +2951,13 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) { SmallVector<int, 8> MappedOps; for (unsigned i = 0; i != MaskNumElts; ++i) { int Idx = Mask[i]; - if (Idx < 0) - MappedOps.push_back(Idx); - else if (Idx < (int)SrcNumElts) - MappedOps.push_back(Idx - StartIdx[0]); - else - MappedOps.push_back(Idx - SrcNumElts - StartIdx[1] + MaskNumElts); + if (Idx >= 0) { + if (Idx < (int)SrcNumElts) + Idx -= StartIdx[0]; + else + Idx -= SrcNumElts + StartIdx[1] - MaskNumElts; + } + MappedOps.push_back(Idx); } setValue(&I, DAG.getVectorShuffle(VT, getCurDebugLoc(), Src1, Src2, @@ -2977,22 +2973,20 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) { EVT PtrVT = TLI.getPointerTy(); SmallVector<SDValue,8> Ops; for (unsigned i = 0; i != MaskNumElts; ++i) { - if (Mask[i] < 0) { - Ops.push_back(DAG.getUNDEF(EltVT)); - } else { - int Idx = Mask[i]; - SDValue Res; + int Idx = Mask[i]; + SDValue Res; - if (Idx < (int)SrcNumElts) - Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, getCurDebugLoc(), - EltVT, Src1, DAG.getConstant(Idx, PtrVT)); - else - Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, getCurDebugLoc(), - EltVT, Src2, - DAG.getConstant(Idx - SrcNumElts, PtrVT)); + if (Idx < 0) { + Res = DAG.getUNDEF(EltVT); + } else { + SDValue &Src = Idx < (int)SrcNumElts ? Src1 : Src2; + if (Idx >= (int)SrcNumElts) Idx -= SrcNumElts; - Ops.push_back(Res); + Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, getCurDebugLoc(), + EltVT, Src, DAG.getConstant(Idx, PtrVT)); } + + Ops.push_back(Res); } setValue(&I, DAG.getNode(ISD::BUILD_VECTOR, getCurDebugLoc(), @@ -3215,6 +3209,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { bool isInvariant = I.getMetadata("invariant.load") != 0; unsigned Alignment = I.getAlignment(); const MDNode *TBAAInfo = I.getMetadata(LLVMContext::MD_tbaa); + const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range); SmallVector<EVT, 4> ValueVTs; SmallVector<uint64_t, 4> Offsets; @@ -3262,7 +3257,8 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { DAG.getConstant(Offsets[i], PtrVT)); SDValue L = DAG.getLoad(ValueVTs[i], getCurDebugLoc(), Root, A, MachinePointerInfo(SV, Offsets[i]), isVolatile, - isNonTemporal, isInvariant, Alignment, TBAAInfo); + isNonTemporal, isInvariant, Alignment, TBAAInfo, + Ranges); Values[i] = L; Chains[ChainI] = L.getValue(1); @@ -3586,6 +3582,12 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, } setValue(&I, Result); + } else { + // Assign order to result here. If the intrinsic does not produce a result, + // it won't be mapped to a SDNode and visit() will not assign it an order + // number. + ++SDNodeOrder; + AssignOrderingToNode(Result.getNode()); } } @@ -3627,17 +3629,6 @@ getF32Constant(SelectionDAG &DAG, unsigned Flt) { return DAG.getConstantFP(APFloat(APInt(32, Flt)), MVT::f32); } -// implVisitAluOverflow - Lower arithmetic overflow instrinsics. -const char * -SelectionDAGBuilder::implVisitAluOverflow(const CallInst &I, ISD::NodeType Op) { - SDValue Op1 = getValue(I.getArgOperand(0)); - SDValue Op2 = getValue(I.getArgOperand(1)); - - SDVTList VTs = DAG.getVTList(Op1.getValueType(), MVT::i1); - setValue(&I, DAG.getNode(Op, getCurDebugLoc(), VTs, Op1, Op2)); - return 0; -} - /// visitExp - Lower an exp intrinsic. Handles the special sequences for /// limited-precision mode. 
void @@ -4397,9 +4388,8 @@ static unsigned getTruncatedArgReg(const SDValue &N) { const SDValue &CFR = Ext.getOperand(0); if (CFR.getOpcode() == ISD::CopyFromReg) return cast<RegisterSDNode>(CFR.getOperand(1))->getReg(); - else - if (CFR.getOpcode() == ISD::TRUNCATE) - return getTruncatedArgReg(CFR); + if (CFR.getOpcode() == ISD::TRUNCATE) + return getTruncatedArgReg(CFR); } return 0; } @@ -4428,7 +4418,7 @@ SelectionDAGBuilder::EmitFuncArgumentDbgValue(const Value *V, MDNode *Variable, // Some arguments' frame index is recorded during argument lowering. Offset = FuncInfo.getArgumentFrameIndex(Arg); if (Offset) - Reg = TRI->getFrameRegister(MF); + Reg = TRI->getFrameRegister(MF); if (!Reg && N.getNode()) { if (N.getOpcode() == ISD::CopyFromReg) @@ -4690,8 +4680,11 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { V = BCI->getOperand(0); const AllocaInst *AI = dyn_cast<AllocaInst>(V); // Don't handle byval struct arguments or VLAs, for example. - if (!AI) + if (!AI) { + DEBUG(dbgs() << "Dropping debug location info for:\n " << DI << "\n"); + DEBUG(dbgs() << " Last seen at:\n " << *V << "\n"); return 0; + } DenseMap<const AllocaInst*, int>::iterator SI = FuncInfo.StaticAllocaMap.find(AI); if (SI == FuncInfo.StaticAllocaMap.end()) @@ -4837,7 +4830,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } case Intrinsic::x86_avx_vinsertf128_pd_256: case Intrinsic::x86_avx_vinsertf128_ps_256: - case Intrinsic::x86_avx_vinsertf128_si_256: { + case Intrinsic::x86_avx_vinsertf128_si_256: + case Intrinsic::x86_avx2_vinserti128: { DebugLoc dl = getCurDebugLoc(); EVT DestVT = TLI.getValueType(I.getType()); EVT ElVT = TLI.getValueType(I.getArgOperand(1)->getType()); @@ -4861,6 +4855,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::convertuu: { ISD::CvtCode Code = ISD::CVT_INVALID; switch (Intrinsic) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. case Intrinsic::convertff: Code = ISD::CVT_FF; break; case Intrinsic::convertfsi: Code = ISD::CVT_FS; break; case Intrinsic::convertfui: Code = ISD::CVT_FU; break; @@ -5093,18 +5088,28 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { return 0; } case Intrinsic::uadd_with_overflow: - return implVisitAluOverflow(I, ISD::UADDO); case Intrinsic::sadd_with_overflow: - return implVisitAluOverflow(I, ISD::SADDO); case Intrinsic::usub_with_overflow: - return implVisitAluOverflow(I, ISD::USUBO); case Intrinsic::ssub_with_overflow: - return implVisitAluOverflow(I, ISD::SSUBO); case Intrinsic::umul_with_overflow: - return implVisitAluOverflow(I, ISD::UMULO); - case Intrinsic::smul_with_overflow: - return implVisitAluOverflow(I, ISD::SMULO); + case Intrinsic::smul_with_overflow: { + ISD::NodeType Op; + switch (Intrinsic) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
+ case Intrinsic::uadd_with_overflow: Op = ISD::UADDO; break; + case Intrinsic::sadd_with_overflow: Op = ISD::SADDO; break; + case Intrinsic::usub_with_overflow: Op = ISD::USUBO; break; + case Intrinsic::ssub_with_overflow: Op = ISD::SSUBO; break; + case Intrinsic::umul_with_overflow: Op = ISD::UMULO; break; + case Intrinsic::smul_with_overflow: Op = ISD::SMULO; break; + } + SDValue Op1 = getValue(I.getArgOperand(0)); + SDValue Op2 = getValue(I.getArgOperand(1)); + SDVTList VTs = DAG.getVTList(Op1.getValueType(), MVT::i1); + setValue(&I, DAG.getNode(Op, getCurDebugLoc(), VTs, Op1, Op2)); + return 0; + } case Intrinsic::prefetch: { SDValue Ops[5]; unsigned rw = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue(); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 8cf88e1..8393b41 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -556,8 +556,6 @@ private: void visitUserOp2(const Instruction &I) { llvm_unreachable("UserOp2 should not exist at instruction selection time!"); } - - const char *implVisitAluOverflow(const CallInst &I, ISD::NodeType Op); void HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 8aabc02..605509b 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -508,7 +508,6 @@ void SelectionDAGISel::ComputeLiveOutVRegInfo() { Worklist.push_back(CurDAG->getRoot().getNode()); - APInt Mask; APInt KnownZero; APInt KnownOne; @@ -539,8 +538,7 @@ void SelectionDAGISel::ComputeLiveOutVRegInfo() { continue; unsigned NumSignBits = CurDAG->ComputeNumSignBits(Src); - Mask = APInt::getAllOnesValue(SrcVT.getSizeInBits()); - CurDAG->ComputeMaskedBits(Src, Mask, KnownZero, KnownOne); + CurDAG->ComputeMaskedBits(Src, KnownZero, KnownOne); FuncInfo->AddLiveOutRegInfo(DestReg, NumSignBits, KnownZero, KnownOne); } while (!Worklist.empty()); } @@ -1444,7 +1442,7 @@ bool SelectionDAGISel::CheckOrMask(SDValue LHS, ConstantSDNode *RHS, APInt NeededMask = DesiredMask & ~ActualMask; APInt KnownZero, KnownOne; - CurDAG->ComputeMaskedBits(LHS, NeededMask, KnownZero, KnownOne); + CurDAG->ComputeMaskedBits(LHS, KnownZero, KnownOne); // If all the missing bits in the or are already known to be set, match! if ((NeededMask & KnownOne) == NeededMask) diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 792de75..e341e15 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -39,28 +39,6 @@ static cl::opt<bool> AllowPromoteIntElem("promote-elements", cl::Hidden, cl::init(true), cl::desc("Allow promotion of integer vector element types")); -namespace llvm { -TLSModel::Model getTLSModel(const GlobalValue *GV, Reloc::Model reloc) { - bool isLocal = GV->hasLocalLinkage(); - bool isDeclaration = GV->isDeclaration(); - // FIXME: what should we do for protected and internal visibility? - // For variables, is internal different from hidden? - bool isHidden = GV->hasHiddenVisibility(); - - if (reloc == Reloc::PIC_) { - if (isLocal || isHidden) - return TLSModel::LocalDynamic; - else - return TLSModel::GeneralDynamic; - } else { - if (!isDeclaration || isHidden) - return TLSModel::LocalExec; - else - return TLSModel::InitialExec; - } -} -} - /// InitLibcallNames - Set default libcall names. 
/// static void InitLibcallNames(const char **Names) { @@ -1101,8 +1079,12 @@ unsigned TargetLowering::getJumpTableEncoding() const { SDValue TargetLowering::getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) const { // If our PIC model is GP relative, use the global offset table as the base. - if (getJumpTableEncoding() == MachineJumpTableInfo::EK_GPRel32BlockAddress) + unsigned JTEncoding = getJumpTableEncoding(); + + if ((JTEncoding == MachineJumpTableInfo::EK_GPRel64BlockAddress) || + (JTEncoding == MachineJumpTableInfo::EK_GPRel32BlockAddress)) return DAG.getGLOBAL_OFFSET_TABLE(getPointerTy()); + return Table; } @@ -1244,7 +1226,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, if (Depth != 0) { // If not at the root, Just compute the KnownZero/KnownOne bits to // simplify things downstream. - TLO.DAG.ComputeMaskedBits(Op, DemandedMask, KnownZero, KnownOne, Depth); + TLO.DAG.ComputeMaskedBits(Op, KnownZero, KnownOne, Depth); return false; } // If this is the root being simplified, allow it to have multiple uses, @@ -1263,8 +1245,8 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, switch (Op.getOpcode()) { case ISD::Constant: // We know all of the bits for a constant! - KnownOne = cast<ConstantSDNode>(Op)->getAPIntValue() & NewMask; - KnownZero = ~KnownOne & NewMask; + KnownOne = cast<ConstantSDNode>(Op)->getAPIntValue(); + KnownZero = ~KnownOne; return false; // Don't fall through, will infinitely loop. case ISD::AND: // If the RHS is a constant, check to see if the LHS would be zero without @@ -1274,8 +1256,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { APInt LHSZero, LHSOne; // Do not increment Depth here; that can cause an infinite loop. - TLO.DAG.ComputeMaskedBits(Op.getOperand(0), NewMask, - LHSZero, LHSOne, Depth); + TLO.DAG.ComputeMaskedBits(Op.getOperand(0), LHSZero, LHSOne, Depth); // If the LHS already has zeros where RHSC does, this and is dead. if ((LHSZero & NewMask) == (~RHSC->getAPIntValue() & NewMask)) return TLO.CombineTo(Op, Op.getOperand(0)); @@ -1386,8 +1367,9 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, // bits on that side are also known to be set on the other side, turn this // into an AND, as we know the bits will be cleared. // e.g. (X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2 - if ((NewMask & (KnownZero|KnownOne)) == NewMask) { // all known - if ((KnownOne & KnownOne2) == KnownOne) { + // NB: it is okay if more bits are known than are requested + if ((NewMask & (KnownZero|KnownOne)) == NewMask) { // all known on one side + if (KnownOne == KnownOne2) { // set bits are the same on both sides EVT VT = Op.getValueType(); SDValue ANDC = TLO.DAG.getConstant(~KnownOne & NewMask, VT); return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, dl, VT, @@ -1725,11 +1707,11 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, // If the sign bit is known one, the top bits match. if (KnownOne.intersects(InSignBit)) { - KnownOne |= NewBits; - KnownZero &= ~NewBits; + KnownOne |= NewBits; + assert((KnownZero & NewBits) == 0); } else { // Otherwise, top bits aren't known. - KnownOne &= ~NewBits; - KnownZero &= ~NewBits; + assert((KnownOne & NewBits) == 0); + assert((KnownZero & NewBits) == 0); } break; } @@ -1863,7 +1845,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, // FALL THROUGH default: // Just use ComputeMaskedBits to compute output bits. 
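// (A hedged sketch of the API migration this patch applies throughout:
// ComputeMaskedBits is assumed to have dropped its Mask parameter and to
// compute KnownZero/KnownOne for every bit of the operand, e.g.
//
//   APInt KnownZero, KnownOne;
//   TLO.DAG.ComputeMaskedBits(Op, KnownZero, KnownOne, Depth);
//
// Callers that only care about some bits, such as CheckOrMask above, now
// mask the results themselves: (NeededMask & KnownOne) == NeededMask.)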
- TLO.DAG.ComputeMaskedBits(Op, NewMask, KnownZero, KnownOne, Depth); + TLO.DAG.ComputeMaskedBits(Op, KnownZero, KnownOne, Depth); break; } @@ -1879,7 +1861,6 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, /// in Mask are known to be either zero or one and return them in the /// KnownZero/KnownOne bitsets. void TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, - const APInt &Mask, APInt &KnownZero, APInt &KnownOne, const SelectionDAG &DAG, @@ -1890,7 +1871,7 @@ void TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, Op.getOpcode() == ISD::INTRINSIC_VOID) && "Should use MaskedValueIsZero if you don't know whether Op" " is a target node!"); - KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); + KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); } /// ComputeNumSignBitsForTargetNode - This method can be implemented by @@ -1934,9 +1915,8 @@ static bool ValueHasExactlyOneBitSet(SDValue Val, const SelectionDAG &DAG) { // Fall back to ComputeMaskedBits to catch other known cases. EVT OpVT = Val.getValueType(); unsigned BitWidth = OpVT.getScalarType().getSizeInBits(); - APInt Mask = APInt::getAllOnesValue(BitWidth); APInt KnownZero, KnownOne; - DAG.ComputeMaskedBits(Val, Mask, KnownZero, KnownOne); + DAG.ComputeMaskedBits(Val, KnownZero, KnownOne); return (KnownZero.countPopulation() == BitWidth - 1) && (KnownOne.countPopulation() == 1); } @@ -2432,8 +2412,15 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, if (N0 == N1) { // We can always fold X == X for integer setcc's. - if (N0.getValueType().isInteger()) - return DAG.getConstant(ISD::isTrueWhenEqual(Cond), VT); + if (N0.getValueType().isInteger()) { + switch (getBooleanContents(N0.getValueType().isVector())) { + case UndefinedBooleanContent: + case ZeroOrOneBooleanContent: + return DAG.getConstant(ISD::isTrueWhenEqual(Cond), VT); + case ZeroOrNegativeOneBooleanContent: + return DAG.getConstant(ISD::isTrueWhenEqual(Cond) ? -1 : 0, VT); + } + } unsigned UOF = ISD::getUnorderedFlavor(Cond); if (UOF == 2) // FP operators that are undefined on NaNs. return DAG.getConstant(ISD::isTrueWhenEqual(Cond), VT); @@ -2467,6 +2454,10 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, } } + // If RHS is a legal immediate value for a compare instruction, we need + // to be careful about increasing register pressure needlessly. + bool LegalRHSImm = false; + if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(N1)) { if (ConstantSDNode *LHSR = dyn_cast<ConstantSDNode>(N0.getOperand(1))) { // Turn (X+C1) == C2 --> X == C2-C1 @@ -2501,25 +2492,33 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, Cond); } } + + // Could RHSC fold directly into a compare? + if (RHSC->getValueType(0).getSizeInBits() <= 64) + LegalRHSImm = isLegalICmpImmediate(RHSC->getSExtValue()); } // Simplify (X+Z) == X --> Z == 0 - if (N0.getOperand(0) == N1) - return DAG.getSetCC(dl, VT, N0.getOperand(1), - DAG.getConstant(0, N0.getValueType()), Cond); - if (N0.getOperand(1) == N1) { - if (DAG.isCommutativeBinOp(N0.getOpcode())) - return DAG.getSetCC(dl, VT, N0.getOperand(0), - DAG.getConstant(0, N0.getValueType()), Cond); - else if (N0.getNode()->hasOneUse()) { - assert(N0.getOpcode() == ISD::SUB && "Unexpected operation!"); - // (Z-X) == X --> Z == X<<1 - SDValue SH = DAG.getNode(ISD::SHL, dl, N1.getValueType(), - N1, + // Don't do this if X is an immediate that can fold into a cmp + // instruction and X+Z has other uses. 
It could be an induction variable + // chain, and the transform would increase register pressure. + if (!LegalRHSImm || N0.getNode()->hasOneUse()) { + if (N0.getOperand(0) == N1) + return DAG.getSetCC(dl, VT, N0.getOperand(1), + DAG.getConstant(0, N0.getValueType()), Cond); + if (N0.getOperand(1) == N1) { + if (DAG.isCommutativeBinOp(N0.getOpcode())) + return DAG.getSetCC(dl, VT, N0.getOperand(0), + DAG.getConstant(0, N0.getValueType()), Cond); + else if (N0.getNode()->hasOneUse()) { + assert(N0.getOpcode() == ISD::SUB && "Unexpected operation!"); + // (Z-X) == X --> Z == X<<1 + SDValue SH = DAG.getNode(ISD::SHL, dl, N1.getValueType(), N1, DAG.getConstant(1, getShiftAmountTy(N1.getValueType()))); - if (!DCI.isCalledByLegalizer()) - DCI.AddToWorklist(SH.getNode()); - return DAG.getSetCC(dl, VT, N0.getOperand(0), SH, Cond); + if (!DCI.isCalledByLegalizer()) + DCI.AddToWorklist(SH.getNode()); + return DAG.getSetCC(dl, VT, N0.getOperand(0), SH, Cond); + } } } } diff --git a/lib/CodeGen/SlotIndexes.cpp b/lib/CodeGen/SlotIndexes.cpp index c5bd3a3..26cf259 100644 --- a/lib/CodeGen/SlotIndexes.cpp +++ b/lib/CodeGen/SlotIndexes.cpp @@ -34,7 +34,8 @@ void SlotIndexes::releaseMemory() { mi2iMap.clear(); MBBRanges.clear(); idx2MBBMap.clear(); - clearList(); + indexList.clear(); + ileAllocator.Reset(); } bool SlotIndexes::runOnMachineFunction(MachineFunction &fn) { @@ -45,17 +46,15 @@ bool SlotIndexes::runOnMachineFunction(MachineFunction &fn) { // iterator in lock-step (though skipping it over indexes which have // null pointers in the instruction field). // At each iteration assert that the instruction pointed to in the index - // is the same one pointed to by the MI iterator. This + // is the same one pointed to by the MI iterator. This // FIXME: This can be simplified. The mi2iMap_, Idx2MBBMap, etc. should // only need to be set up once after the first numbering is computed. mf = &fn; - initList(); // Check that the list contains only the sentinal. - assert(indexListHead->getNext() == 0 && - "Index list non-empty at initial numbering?"); + assert(indexList.empty() && "Index list non-empty at initial numbering?"); assert(idx2MBBMap.empty() && "Index -> MBB mapping non-empty at initial numbering?"); assert(MBBRanges.empty() && @@ -68,7 +67,7 @@ bool SlotIndexes::runOnMachineFunction(MachineFunction &fn) { MBBRanges.resize(mf->getNumBlockIDs()); idx2MBBMap.reserve(mf->size()); - push_back(createEntry(0, index)); + indexList.push_back(createEntry(0, index)); // Iterate over the function. for (MachineFunction::iterator mbbItr = mf->begin(), mbbEnd = mf->end(); @@ -76,7 +75,7 @@ bool SlotIndexes::runOnMachineFunction(MachineFunction &fn) { MachineBasicBlock *mbb = &*mbbItr; // Insert an index for the MBB start. - SlotIndex blockStartIndex(back(), SlotIndex::Slot_Block); + SlotIndex blockStartIndex(&indexList.back(), SlotIndex::Slot_Block); for (MachineBasicBlock::iterator miItr = mbb->begin(), miEnd = mbb->end(); miItr != miEnd; ++miItr) { @@ -85,20 +84,20 @@ bool SlotIndexes::runOnMachineFunction(MachineFunction &fn) { continue; // Insert a store index for the instr. - push_back(createEntry(mi, index += SlotIndex::InstrDist)); + indexList.push_back(createEntry(mi, index += SlotIndex::InstrDist)); // Save this base index in the maps. - mi2iMap.insert(std::make_pair(mi, SlotIndex(back(), + mi2iMap.insert(std::make_pair(mi, SlotIndex(&indexList.back(), SlotIndex::Slot_Block))); - + ++functionSize; } // We insert one blank instructions between basic blocks. 
- push_back(createEntry(0, index += SlotIndex::InstrDist)); + indexList.push_back(createEntry(0, index += SlotIndex::InstrDist)); MBBRanges[mbb->getNumber()].first = blockStartIndex; - MBBRanges[mbb->getNumber()].second = SlotIndex(back(), + MBBRanges[mbb->getNumber()].second = SlotIndex(&indexList.back(), SlotIndex::Slot_Block); idx2MBBMap.push_back(IdxMBBPair(blockStartIndex, mbb)); } @@ -119,38 +118,37 @@ void SlotIndexes::renumberIndexes() { unsigned index = 0; - for (IndexListEntry *curEntry = front(); curEntry != getTail(); - curEntry = curEntry->getNext()) { - curEntry->setIndex(index); + for (IndexList::iterator I = indexList.begin(), E = indexList.end(); + I != E; ++I) { + I->setIndex(index); index += SlotIndex::InstrDist; } } -// Renumber indexes locally after curEntry was inserted, but failed to get a new +// Renumber indexes locally after curItr was inserted, but failed to get a new // index. -void SlotIndexes::renumberIndexes(IndexListEntry *curEntry) { +void SlotIndexes::renumberIndexes(IndexList::iterator curItr) { // Number indexes with half the default spacing so we can catch up quickly. const unsigned Space = SlotIndex::InstrDist/2; assert((Space & 3) == 0 && "InstrDist must be a multiple of 2*NUM"); - IndexListEntry *start = curEntry->getPrev(); - unsigned index = start->getIndex(); - IndexListEntry *tail = getTail(); + IndexList::iterator startItr = prior(curItr); + unsigned index = startItr->getIndex(); do { - curEntry->setIndex(index += Space); - curEntry = curEntry->getNext(); + curItr->setIndex(index += Space); + ++curItr; // If the next index is bigger, we have caught up. - } while (curEntry != tail && curEntry->getIndex() <= index); + } while (curItr != indexList.end() && curItr->getIndex() <= index); - DEBUG(dbgs() << "\n*** Renumbered SlotIndexes " << start->getIndex() << '-' + DEBUG(dbgs() << "\n*** Renumbered SlotIndexes " << startItr->getIndex() << '-' << index << " ***\n"); ++NumLocalRenum; } void SlotIndexes::dump() const { - for (const IndexListEntry *itr = front(); itr != getTail(); - itr = itr->getNext()) { + for (IndexList::const_iterator itr = indexList.begin(); + itr != indexList.end(); ++itr) { dbgs() << itr->getIndex() << " "; if (itr->getInstr() != 0) { @@ -168,7 +166,7 @@ void SlotIndexes::dump() const { // Print a SlotIndex to a raw_ostream. void SlotIndex::print(raw_ostream &os) const { if (isValid()) - os << entry().getIndex() << "Berd"[getSlot()]; + os << listEntry()->getIndex() << "Berd"[getSlot()]; else os << "invalid"; } diff --git a/lib/CodeGen/Spiller.cpp b/lib/CodeGen/Spiller.cpp index b72dea7..4cd22eb 100644 --- a/lib/CodeGen/Spiller.cpp +++ b/lib/CodeGen/Spiller.cpp @@ -11,8 +11,8 @@ #include "Spiller.h" #include "VirtRegMap.h" -#include "LiveRangeEdit.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveRangeEdit.h" #include "llvm/CodeGen/LiveStackAnalysis.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -116,7 +116,7 @@ protected: } // Create a new vreg & interval for this instr. - LiveInterval *newLI = &LRE.create(*lis, *vrm); + LiveInterval *newLI = &LRE.create(); newLI->weight = HUGE_VALF; // Update the reg operands & kill flags. 
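The Spiller hunk above and the SplitKit hunks below are one mechanical migration: LiveRangeEdit now captures the analyses it needs up front, so call sites stop threading LIS, VRM, and TII through every call. A minimal before/after sketch, built only from the call shapes visible in this patch (the new constructor is not part of this diff and is assumed to take those analyses):

    // Before: every helper call repeats the analyses.
    LiveInterval *newLI = &LRE.create(*lis, *vrm);
    Edit->eliminateDeadDefs(Dead, LIS, VRM, TII);

    // After: the LiveRangeEdit object already knows them.
    LiveInterval *newLI = &LRE.create();
    Edit->eliminateDeadDefs(Dead);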
diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp index ab9b524..9959f74 100644 --- a/lib/CodeGen/SplitKit.cpp +++ b/lib/CodeGen/SplitKit.cpp @@ -14,10 +14,10 @@ #define DEBUG_TYPE "regalloc" #include "SplitKit.h" -#include "LiveRangeEdit.h" #include "VirtRegMap.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveRangeEdit.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" @@ -351,7 +351,7 @@ void SplitEditor::reset(LiveRangeEdit &LRE, ComplementSpillMode SM) { // We don't need an AliasAnalysis since we will only be performing // cheap-as-a-copy remats anyway. - Edit->anyRematerializable(LIS, TII, 0); + Edit->anyRematerializable(0); } void SplitEditor::dump() const { @@ -436,8 +436,8 @@ VNInfo *SplitEditor::defFromParent(unsigned RegIdx, // Attempt cheap-as-a-copy rematerialization. LiveRangeEdit::Remat RM(ParentVNI); - if (Edit->canRematerializeAt(RM, UseIdx, true, LIS)) { - Def = Edit->rematerializeAt(MBB, I, LI->reg, RM, LIS, TII, TRI, Late); + if (Edit->canRematerializeAt(RM, UseIdx, true)) { + Def = Edit->rematerializeAt(MBB, I, LI->reg, RM, TRI, Late); ++NumRemats; } else { // Can't remat, just insert a copy from parent. @@ -456,11 +456,11 @@ VNInfo *SplitEditor::defFromParent(unsigned RegIdx, unsigned SplitEditor::openIntv() { // Create the complement as index 0. if (Edit->empty()) - Edit->create(LIS, VRM); + Edit->create(); // Create the open interval. OpenIdx = Edit->size(); - Edit->create(LIS, VRM); + Edit->create(); return OpenIdx; } @@ -1033,7 +1033,7 @@ void SplitEditor::deleteRematVictims() { if (Dead.empty()) return; - Edit->eliminateDeadDefs(Dead, LIS, VRM, TII); + Edit->eliminateDeadDefs(Dead); } void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) { @@ -1108,7 +1108,7 @@ void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) { SmallVector<LiveInterval*, 8> dups; dups.push_back(li); for (unsigned j = 1; j != NumComp; ++j) - dups.push_back(&Edit->create(LIS, VRM)); + dups.push_back(&Edit->create()); ConEQ.Distribute(&dups[0], MRI); // The new intervals all map back to i. if (LRMap) @@ -1116,7 +1116,7 @@ void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) { } // Calculate spill weight and allocation hints for new intervals. - Edit->calculateRegClassAndHint(VRM.getMachineFunction(), LIS, SA.Loops); + Edit->calculateRegClassAndHint(VRM.getMachineFunction(), SA.Loops); assert(!LRMap || LRMap->size() == Edit->size()); } diff --git a/lib/CodeGen/TargetInstrInfoImpl.cpp b/lib/CodeGen/TargetInstrInfoImpl.cpp index be25855..2beb928 100644 --- a/lib/CodeGen/TargetInstrInfoImpl.cpp +++ b/lib/CodeGen/TargetInstrInfoImpl.cpp @@ -78,6 +78,9 @@ MachineInstr *TargetInstrInfoImpl::commuteInstruction(MachineInstr *MI, unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0; unsigned Reg1 = MI->getOperand(Idx1).getReg(); unsigned Reg2 = MI->getOperand(Idx2).getReg(); + unsigned SubReg0 = HasDef ? 
MI->getOperand(0).getSubReg() : 0; + unsigned SubReg1 = MI->getOperand(Idx1).getSubReg(); + unsigned SubReg2 = MI->getOperand(Idx2).getSubReg(); bool Reg1IsKill = MI->getOperand(Idx1).isKill(); bool Reg2IsKill = MI->getOperand(Idx2).isKill(); // If destination is tied to either of the commuted source register, then @@ -86,10 +89,12 @@ MachineInstr *TargetInstrInfoImpl::commuteInstruction(MachineInstr *MI, MI->getDesc().getOperandConstraint(Idx1, MCOI::TIED_TO) == 0) { Reg2IsKill = false; Reg0 = Reg2; + SubReg0 = SubReg2; } else if (HasDef && Reg0 == Reg2 && MI->getDesc().getOperandConstraint(Idx2, MCOI::TIED_TO) == 0) { Reg1IsKill = false; Reg0 = Reg1; + SubReg0 = SubReg1; } if (NewMI) { @@ -98,19 +103,23 @@ MachineInstr *TargetInstrInfoImpl::commuteInstruction(MachineInstr *MI, MachineFunction &MF = *MI->getParent()->getParent(); if (HasDef) return BuildMI(MF, MI->getDebugLoc(), MI->getDesc()) - .addReg(Reg0, RegState::Define | getDeadRegState(Reg0IsDead)) - .addReg(Reg2, getKillRegState(Reg2IsKill)) - .addReg(Reg1, getKillRegState(Reg2IsKill)); + .addReg(Reg0, RegState::Define | getDeadRegState(Reg0IsDead), SubReg0) + .addReg(Reg2, getKillRegState(Reg2IsKill), SubReg2) + .addReg(Reg1, getKillRegState(Reg1IsKill), SubReg1); else return BuildMI(MF, MI->getDebugLoc(), MI->getDesc()) - .addReg(Reg2, getKillRegState(Reg2IsKill)) - .addReg(Reg1, getKillRegState(Reg2IsKill)); + .addReg(Reg2, getKillRegState(Reg2IsKill), SubReg2) + .addReg(Reg1, getKillRegState(Reg1IsKill), SubReg1); } - if (HasDef) + if (HasDef) { MI->getOperand(0).setReg(Reg0); + MI->getOperand(0).setSubReg(SubReg0); + } MI->getOperand(Idx2).setReg(Reg1); MI->getOperand(Idx1).setReg(Reg2); + MI->getOperand(Idx2).setSubReg(SubReg1); + MI->getOperand(Idx1).setSubReg(SubReg2); MI->getOperand(Idx2).setIsKill(Reg1IsKill); MI->getOperand(Idx1).setIsKill(Reg2IsKill); return MI; diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp index 24b8bc2..c30b133 100644 --- a/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -1183,8 +1183,9 @@ TwoAddressInstructionPass::RescheduleKillAboveMI(MachineBasicBlock *MBB, /// TryInstructionTransform - For the case where an instruction has a single /// pair of tied register operands, attempt some transformations that may /// either eliminate the tied operands or improve the opportunities for -/// coalescing away the register copy. Returns true if the tied operands -/// are eliminated altogether. +/// coalescing away the register copy. Returns true if no copy needs to be +/// inserted to untie mi's operands (either because they were untied, or +/// because mi was rescheduled, and will be visited again later). bool TwoAddressInstructionPass:: TryInstructionTransform(MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, @@ -1380,7 +1381,6 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi, /// runOnMachineFunction - Reduce two-address instructions to two operands. /// bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) { - DEBUG(dbgs() << "Machine Function\n"); const TargetMachine &TM = MF.getTarget(); MRI = &MF.getRegInfo(); TII = TM.getInstrInfo(); @@ -1595,19 +1595,19 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) { MadeChange = true; DEBUG(dbgs() << "\t\trewrite to:\t" << *mi); - } - // Rewrite INSERT_SUBREG as COPY now that we no longer need SSA form. 
- if (mi->isInsertSubreg()) { - // From %reg = INSERT_SUBREG %reg, %subreg, subidx - // To %reg:subidx = COPY %subreg - unsigned SubIdx = mi->getOperand(3).getImm(); - mi->RemoveOperand(3); - assert(mi->getOperand(0).getSubReg() == 0 && "Unexpected subreg idx"); - mi->getOperand(0).setSubReg(SubIdx); - mi->RemoveOperand(1); - mi->setDesc(TII->get(TargetOpcode::COPY)); - DEBUG(dbgs() << "\t\tconvert to:\t" << *mi); + // Rewrite INSERT_SUBREG as COPY now that we no longer need SSA form. + if (mi->isInsertSubreg()) { + // From %reg = INSERT_SUBREG %reg, %subreg, subidx + // To %reg:subidx = COPY %subreg + unsigned SubIdx = mi->getOperand(3).getImm(); + mi->RemoveOperand(3); + assert(mi->getOperand(0).getSubReg() == 0 && "Unexpected subreg idx"); + mi->getOperand(0).setSubReg(SubIdx); + mi->RemoveOperand(1); + mi->setDesc(TII->get(TargetOpcode::COPY)); + DEBUG(dbgs() << "\t\tconvert to:\t" << *mi); + } } // Clear TiedOperands here instead of at the top of the loop @@ -1833,6 +1833,7 @@ bool TwoAddressInstructionPass::EliminateRegSequences() { SmallSet<unsigned, 4> Seen; for (unsigned i = 1, e = MI->getNumOperands(); i < e; i += 2) { unsigned SrcReg = MI->getOperand(i).getReg(); + unsigned SrcSubIdx = MI->getOperand(i).getSubReg(); unsigned SubIdx = MI->getOperand(i+1).getImm(); // DefMI of NULL means the value does not have a vreg in this block // i.e., its a physical register or a subreg. @@ -1888,7 +1889,7 @@ bool TwoAddressInstructionPass::EliminateRegSequences() { MachineInstr *CopyMI = BuildMI(*MI->getParent(), InsertLoc, MI->getDebugLoc(), TII->get(TargetOpcode::COPY)) .addReg(DstReg, RegState::Define, SubIdx) - .addReg(SrcReg, getKillRegState(isKill)); + .addReg(SrcReg, getKillRegState(isKill), SrcSubIdx); MI->getOperand(i).setReg(0); if (LV && isKill && !TargetRegisterInfo::isPhysicalRegister(SrcReg)) LV->replaceKillInstruction(SrcReg, MI, CopyMI); diff --git a/lib/DebugInfo/DWARFFormValue.cpp b/lib/DebugInfo/DWARFFormValue.cpp index 1c7b9d7..ee2a3ab 100644 --- a/lib/DebugInfo/DWARFFormValue.cpp +++ b/lib/DebugInfo/DWARFFormValue.cpp @@ -348,7 +348,7 @@ DWARFFormValue::dump(raw_ostream &OS, const DWARFCompileUnit *cu) const { } if (cu_relative_offset) - OS << format(" => {0x%8.8"PRIx64"}", (uvalue + (cu ? cu->getOffset() : 0))); + OS << format(" => {0x%8.8" PRIx64 "}", uvalue + (cu ? cu->getOffset() : 0)); } const char* diff --git a/lib/ExecutionEngine/ExecutionEngine.cpp b/lib/ExecutionEngine/ExecutionEngine.cpp index 2890174..a744d0c 100644 --- a/lib/ExecutionEngine/ExecutionEngine.cpp +++ b/lib/ExecutionEngine/ExecutionEngine.cpp @@ -402,14 +402,15 @@ ExecutionEngine *ExecutionEngine::create(Module *M, std::string *ErrorStr, CodeGenOpt::Level OptLevel, bool GVsWithCode) { - return EngineBuilder(M) + EngineBuilder EB = EngineBuilder(M) .setEngineKind(ForceInterpreter ? EngineKind::Interpreter : EngineKind::JIT) .setErrorStr(ErrorStr) .setOptLevel(OptLevel) - .setAllocateGVsWithCode(GVsWithCode) - .create(); + .setAllocateGVsWithCode(GVsWithCode); + + return EB.create(); } /// createJIT - This is the factory method for creating a JIT for the current @@ -430,21 +431,25 @@ ExecutionEngine *ExecutionEngine::createJIT(Module *M, // Use the defaults for extra parameters. Users can use EngineBuilder to // set them. 
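// (A hedged sketch of the rewrite that follows: the long static
// selectTarget() call is replaced by an EngineBuilder configured field by
// field, so unspecified parameters such as MArch/MCPU/MAttrs fall back to
// EngineBuilder's own defaults. Usage, taken from the added lines below:
//
//   EngineBuilder EB(M);
//   EB.setEngineKind(EngineKind::JIT);
//   EB.setErrorStr(ErrorStr);
//   TargetMachine *TM = EB.selectTarget();
// )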
- StringRef MArch = "";
- StringRef MCPU = "";
- SmallVector<std::string, 1> MAttrs;
+ EngineBuilder EB(M);
+ EB.setEngineKind(EngineKind::JIT);
+ EB.setErrorStr(ErrorStr);
+ EB.setRelocationModel(RM);
+ EB.setCodeModel(CMM);
+ EB.setAllocateGVsWithCode(GVsWithCode);
+ EB.setOptLevel(OL);
+ EB.setJITMemoryManager(JMM);
- Triple TT(M->getTargetTriple());
// TODO: permit custom TargetOptions here
- TargetMachine *TM =
- EngineBuilder::selectTarget(TT, MArch, MCPU, MAttrs, TargetOptions(), RM,
- CMM, OL, ErrorStr);
+ TargetMachine *TM = EB.selectTarget();
if (!TM || (ErrorStr && ErrorStr->length() > 0))
return 0;
return ExecutionEngine::JITCtor(M, ErrorStr, JMM, GVsWithCode, TM);
}
-ExecutionEngine *EngineBuilder::create() {
+ExecutionEngine *EngineBuilder::create(TargetMachine *TM) {
+ OwningPtr<TargetMachine> TheTM(TM); // Take ownership.
+
// Make sure we can resolve symbols in the program as well. The zero arg
// to the function tells DynamicLibrary to load the program, not a library.
if (sys::DynamicLibrary::LoadLibraryPermanently(0, ErrorStr))
@@ -465,29 +470,24 @@ ExecutionEngine *EngineBuilder::create() {
// Unless the interpreter was explicitly selected or the JIT is not linked,
// try making a JIT.
- if (WhichEngine & EngineKind::JIT) {
+ if ((WhichEngine & EngineKind::JIT) && TheTM) {
Triple TT(M->getTargetTriple());
- if (TargetMachine *TM = EngineBuilder::selectTarget(TT, MArch, MCPU, MAttrs,
- Options,
- RelocModel, CMModel,
- OptLevel, ErrorStr)) {
- if (!TM->getTarget().hasJIT()) {
- errs() << "WARNING: This target JIT is not designed for the host"
- << " you are running. If bad things happen, please choose"
- << " a different -march switch.\n";
- }
+ if (!TM->getTarget().hasJIT()) {
+ errs() << "WARNING: This target JIT is not designed for the host"
+ << " you are running. If bad things happen, please choose"
+ << " a different -march switch.\n";
+ }
- if (UseMCJIT && ExecutionEngine::MCJITCtor) {
- ExecutionEngine *EE =
- ExecutionEngine::MCJITCtor(M, ErrorStr, JMM,
- AllocateGVsWithCode, TM);
- if (EE) return EE;
- } else if (ExecutionEngine::JITCtor) {
- ExecutionEngine *EE =
- ExecutionEngine::JITCtor(M, ErrorStr, JMM,
- AllocateGVsWithCode, TM);
- if (EE) return EE;
- }
+ if (UseMCJIT && ExecutionEngine::MCJITCtor) {
+ ExecutionEngine *EE =
+ ExecutionEngine::MCJITCtor(M, ErrorStr, JMM,
+ AllocateGVsWithCode, TheTM.take());
+ if (EE) return EE;
+ } else if (ExecutionEngine::JITCtor) {
+ ExecutionEngine *EE =
+ ExecutionEngine::JITCtor(M, ErrorStr, JMM,
+ AllocateGVsWithCode, TheTM.take());
+ if (EE) return EE;
}
}
diff --git a/lib/ExecutionEngine/JIT/JIT.cpp b/lib/ExecutionEngine/JIT/JIT.cpp
index 16b8ee2..a942299 100644
--- a/lib/ExecutionEngine/JIT/JIT.cpp
+++ b/lib/ExecutionEngine/JIT/JIT.cpp
@@ -269,7 +269,8 @@ extern "C" {
JIT::JIT(Module *M, TargetMachine &tm, TargetJITInfo &tji,
JITMemoryManager *jmm, bool GVsWithCode)
- : ExecutionEngine(M), TM(tm), TJI(tji), JMM(jmm),
+ : ExecutionEngine(M), TM(tm), TJI(tji),
+ JMM(jmm ? jmm : JITMemoryManager::CreateDefaultMemManager()),
AllocateGVsWithCode(GVsWithCode), isAlreadyCodeGenerating(false) {
setTargetData(TM.getTargetData());
@@ -323,6 +324,7 @@ JIT::~JIT() {
AllJits->Remove(this);
delete jitstate;
delete JCE;
+ // JMM is owned by JCE, so we do not need to delete JMM here.
delete &TM; } diff --git a/lib/ExecutionEngine/JIT/JIT.h b/lib/ExecutionEngine/JIT/JIT.h index c557981..2ae155b 100644 --- a/lib/ExecutionEngine/JIT/JIT.h +++ b/lib/ExecutionEngine/JIT/JIT.h @@ -118,7 +118,7 @@ public: const std::vector<GenericValue> &ArgValues); /// getPointerToNamedFunction - This method returns the address of the - /// specified function by using the dlsym function call. As such it is only + /// specified function by using the MemoryManager. As such it is only /// useful for resolving library symbols, not code generated symbols. /// /// If AbortOnFailure is false and no function with the given name is diff --git a/lib/ExecutionEngine/JIT/JITMemoryManager.cpp b/lib/ExecutionEngine/JIT/JITMemoryManager.cpp index d404d0c..2d1775c 100644 --- a/lib/ExecutionEngine/JIT/JITMemoryManager.cpp +++ b/lib/ExecutionEngine/JIT/JITMemoryManager.cpp @@ -23,10 +23,22 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/Memory.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/DynamicLibrary.h" +#include "llvm/Config/config.h" #include <vector> #include <cassert> #include <climits> #include <cstring> + +#if defined(__linux__) +#if defined(HAVE_SYS_STAT_H) +#include <sys/stat.h> +#endif +#include <fcntl.h> +#include <unistd.h> +#endif + using namespace llvm; STATISTIC(NumSlabs, "Number of slabs of memory allocated by the JIT"); @@ -315,13 +327,7 @@ namespace { static const size_t DefaultSizeThreshold; /// getPointerToNamedFunction - This method returns the address of the - /// specified function by using the dlsym function call. As such it is only - /// useful for resolving library symbols, not code generated symbols. - /// - /// If AbortOnFailure is false and no function with the given name is - /// found, this function silently returns a null pointer. Otherwise, - /// it prints a message to stderr and aborts. - /// + /// specified function by using the dlsym function call. virtual void *getPointerToNamedFunction(const std::string &Name, bool AbortOnFailure = true); @@ -771,9 +777,6 @@ bool DefaultJITMemoryManager::CheckInvariants(std::string &ErrorStr) { //===----------------------------------------------------------------------===// // getPointerToNamedFunction() implementation. //===----------------------------------------------------------------------===// -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/DynamicLibrary.h" -#include "llvm/Config/config.h" // AtExitHandlers - List of functions to call when the program exits, // registered with the atexit() library function. @@ -793,8 +796,7 @@ static void runAtExitHandlers() { //===----------------------------------------------------------------------===// // Function stubs that are invoked instead of certain library calls -//===----------------------------------------------------------------------===// - +// // Force the following functions to be linked in to anything that uses the // JIT. This is a hack designed to work around the all-too-clever Glibc // strategy of making these functions work differently when inlined vs. when @@ -802,11 +804,6 @@ static void runAtExitHandlers() { // that the dynamic linker can't see. For more info, search for // 'libc_nonshared.a' on Google, or read http://llvm.org/PR274. #if defined(__linux__) -#if defined(HAVE_SYS_STAT_H) -#include <sys/stat.h> -#endif -#include <fcntl.h> -#include <unistd.h> /* stat functions are redirecting to __xstat with a version number. 
On x86-64 * linking with libc_nonshared.a and -Wl,--export-dynamic doesn't make 'stat' * available as an exported symbol, so we have to add it explicitly. diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/lib/ExecutionEngine/MCJIT/MCJIT.cpp index cbb23d3..44f89cf 100644 --- a/lib/ExecutionEngine/MCJIT/MCJIT.cpp +++ b/lib/ExecutionEngine/MCJIT/MCJIT.cpp @@ -74,9 +74,9 @@ MCJIT::MCJIT(Module *m, TargetMachine *tm, TargetJITInfo &tji, OS.flush(); // Load the object into the dynamic linker. - // FIXME: It would be nice to avoid making yet another copy. - MemoryBuffer *MB = MemoryBuffer::getMemBufferCopy(StringRef(Buffer.data(), - Buffer.size())); + MemoryBuffer *MB = MemoryBuffer::getMemBuffer(StringRef(Buffer.data(), + Buffer.size()), + "", false); if (Dyld.loadObject(MB)) report_fatal_error(Dyld.getErrorString()); // Resolve any relocations. @@ -218,7 +218,7 @@ GenericValue MCJIT::runFunction(Function *F, void *MCJIT::getPointerToNamedFunction(const std::string &Name, bool AbortOnFailure){ - if (!isSymbolSearchingDisabled()) { + if (!isSymbolSearchingDisabled() && MemMgr) { void *ptr = MemMgr->getPointerToNamedFunction(Name, false); if (ptr) return ptr; diff --git a/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h b/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h index dac8b26..a68949a 100644 --- a/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h +++ b/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h @@ -27,7 +27,8 @@ class MCJITMemoryManager : public RTDyldMemoryManager { // FIXME: Multiple modules. Module *M; public: - MCJITMemoryManager(JITMemoryManager *jmm, Module *m) : JMM(jmm), M(m) {} + MCJITMemoryManager(JITMemoryManager *jmm, Module *m) : + JMM(jmm?jmm:JITMemoryManager::CreateDefaultMemManager()), M(m) {} // We own the JMM, so make sure to delete it. ~MCJITMemoryManager() { delete JMM; } diff --git a/lib/ExecutionEngine/RuntimeDyld/CMakeLists.txt b/lib/ExecutionEngine/RuntimeDyld/CMakeLists.txt index 002e63c..cbf7cf1 100644 --- a/lib/ExecutionEngine/RuntimeDyld/CMakeLists.txt +++ b/lib/ExecutionEngine/RuntimeDyld/CMakeLists.txt @@ -1,5 +1,6 @@ add_llvm_library(LLVMRuntimeDyld + GDBRegistrar.cpp RuntimeDyld.cpp - RuntimeDyldMachO.cpp RuntimeDyldELF.cpp + RuntimeDyldMachO.cpp ) diff --git a/lib/ExecutionEngine/RuntimeDyld/DyldELFObject.h b/lib/ExecutionEngine/RuntimeDyld/DyldELFObject.h deleted file mode 100644 index 2d777da..0000000 --- a/lib/ExecutionEngine/RuntimeDyld/DyldELFObject.h +++ /dev/null @@ -1,388 +0,0 @@ -//===-- DyldELFObject.h - Dynamically loaded ELF object ----0---*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Dynamically loaded ELF object class, a subclass of ELFObjectFile. Used -// to represent a loadable ELF image. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_RUNTIMEDYLD_DYLDELFOBJECT_H -#define LLVM_RUNTIMEDYLD_DYLDELFOBJECT_H - -#include "llvm/Object/ELF.h" - - -namespace llvm { - -using support::endianness; -using namespace llvm::object; - -template<support::endianness target_endianness, bool is64Bits> -class DyldELFObject : public ELFObjectFile<target_endianness, is64Bits> { - LLVM_ELF_IMPORT_TYPES(target_endianness, is64Bits) - - typedef Elf_Shdr_Impl<target_endianness, is64Bits> Elf_Shdr; - typedef Elf_Sym_Impl<target_endianness, is64Bits> Elf_Sym; - typedef Elf_Rel_Impl<target_endianness, is64Bits, false> Elf_Rel; - typedef Elf_Rel_Impl<target_endianness, is64Bits, true> Elf_Rela; - - typedef typename ELFObjectFile<target_endianness, is64Bits>:: - Elf_Ehdr Elf_Ehdr; - Elf_Ehdr *Header; - - // Update section headers according to the current location in memory - virtual void rebaseObject(std::vector<uint8_t*> *MemoryMap); - // Record memory addresses for cleanup - virtual void saveAddress(std::vector<uint8_t*> *MemoryMap, uint8_t *addr); - -protected: - virtual error_code getSymbolAddress(DataRefImpl Symb, uint64_t &Res) const; - -public: - DyldELFObject(MemoryBuffer *Object, std::vector<uint8_t*> *MemoryMap, - error_code &ec); - - // Methods for type inquiry through isa, cast, and dyn_cast - static inline bool classof(const Binary *v) { - return (isa<ELFObjectFile<target_endianness, is64Bits> >(v) - && classof(cast<ELFObjectFile<target_endianness, is64Bits> >(v))); - } - static inline bool classof( - const ELFObjectFile<target_endianness, is64Bits> *v) { - return v->isDyldType(); - } - static inline bool classof(const DyldELFObject *v) { - return true; - } -}; - -template<support::endianness target_endianness, bool is64Bits> -DyldELFObject<target_endianness, is64Bits>::DyldELFObject(MemoryBuffer *Object, - std::vector<uint8_t*> *MemoryMap, error_code &ec) - : ELFObjectFile<target_endianness, is64Bits>(Object, ec) - , Header(0) { - this->isDyldELFObject = true; - Header = const_cast<Elf_Ehdr *>( - reinterpret_cast<const Elf_Ehdr *>(this->base())); - if (Header->e_shoff == 0) - return; - - // Mark the image as a dynamic shared library - Header->e_type = ELF::ET_DYN; - - rebaseObject(MemoryMap); -} - -// Walk through the ELF headers, updating virtual addresses to reflect where -// the object is currently loaded in memory -template<support::endianness target_endianness, bool is64Bits> -void DyldELFObject<target_endianness, is64Bits>::rebaseObject( - std::vector<uint8_t*> *MemoryMap) { - typedef typename ELFDataTypeTypedefHelper< - target_endianness, is64Bits>::value_type addr_type; - - uint8_t *base_p = const_cast<uint8_t *>(this->base()); - Elf_Shdr *sectionTable = - reinterpret_cast<Elf_Shdr *>(base_p + Header->e_shoff); - uint64_t numSections = this->getNumSections(); - - // Allocate memory space for NOBITS sections (such as .bss), which only exist - // in memory, but don't occupy space in the object file. - // Update the address in the section headers to reflect this allocation. 
- for (uint64_t index = 0; index < numSections; index++) { - Elf_Shdr *sec = reinterpret_cast<Elf_Shdr *>( - reinterpret_cast<char *>(sectionTable) + index * Header->e_shentsize); - - // Only update sections that are meant to be present in program memory - if (sec->sh_flags & ELF::SHF_ALLOC) { - uint8_t *addr = base_p + sec->sh_offset; - if (sec->sh_type == ELF::SHT_NOBITS) { - addr = static_cast<uint8_t *>(calloc(sec->sh_size, 1)); - saveAddress(MemoryMap, addr); - } - else { - // FIXME: Currently memory with RWX permissions is allocated. In the - // future, make sure that permissions are as necessary - if (sec->sh_flags & ELF::SHF_WRITE) { - // see FIXME above - } - if (sec->sh_flags & ELF::SHF_EXECINSTR) { - // see FIXME above - } - } - assert(sizeof(addr_type) == sizeof(intptr_t) && - "Cross-architecture ELF dy-load is not supported!"); - sec->sh_addr = static_cast<addr_type>(intptr_t(addr)); - } - } - - // Now allocate actual space for COMMON symbols, which also don't occupy - // space in the object file. - // We want to allocate space for all COMMON symbols at once, so the flow is: - // 1. Go over all symbols, find those that are in COMMON. For each such - // symbol, record its size and the value field in its symbol header in a - // special vector. - // 2. Allocate memory for all COMMON symbols in one fell swoop. - // 3. Using the recorded information from (1), update the address fields in - // the symbol headers of the COMMON symbols to reflect their allocated - // address. - uint64_t TotalSize = 0; - std::vector<std::pair<Elf_Addr *, uint64_t> > SymbAddrInfo; - error_code ec = object_error::success; - for (symbol_iterator si = this->begin_symbols(), - se = this->end_symbols(); si != se; si.increment(ec)) { - uint64_t Size = 0; - ec = si->getSize(Size); - Elf_Sym* symb = const_cast<Elf_Sym*>( - this->getSymbol(si->getRawDataRefImpl())); - if (ec == object_error::success && - this->getSymbolTableIndex(symb) == ELF::SHN_COMMON && Size > 0) { - SymbAddrInfo.push_back(std::make_pair(&(symb->st_value), Size)); - TotalSize += Size; - } - } - - uint8_t* SectionPtr = (uint8_t *)calloc(TotalSize, 1); - saveAddress(MemoryMap, SectionPtr); - - typedef typename std::vector<std::pair<Elf_Addr *, uint64_t> >::iterator - AddrInfoIterator; - AddrInfoIterator EndIter = SymbAddrInfo.end(); - for (AddrInfoIterator AddrIter = SymbAddrInfo.begin(); - AddrIter != EndIter; ++AddrIter) { - assert(sizeof(addr_type) == sizeof(intptr_t) && - "Cross-architecture ELF dy-load is not supported!"); - *(AddrIter->first) = static_cast<addr_type>(intptr_t(SectionPtr)); - SectionPtr += AddrIter->second; - } -} - -// Record memory addresses for callers -template<support::endianness target_endianness, bool is64Bits> -void DyldELFObject<target_endianness, is64Bits>::saveAddress( - std::vector<uint8_t*> *MemoryMap, uint8_t* addr) { - if (MemoryMap) - MemoryMap->push_back(addr); - else - errs() << "WARNING: Memory leak - cannot record memory for ELF dyld."; -} - -template<support::endianness target_endianness, bool is64Bits> -error_code DyldELFObject<target_endianness, is64Bits>::getSymbolAddress( - DataRefImpl Symb, uint64_t &Result) const { - this->validateSymbol(Symb); - const Elf_Sym *symb = this->getSymbol(Symb); - if (this->getSymbolTableIndex(symb) == ELF::SHN_COMMON) { - Result = symb->st_value; - return object_error::success; - } - else { - return ELFObjectFile<target_endianness, is64Bits>::getSymbolAddress( - Symb, Result); - } -} - -} - -#endif - -//===-- DyldELFObject.h - Dynamically loaded ELF object 
----0---*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Dynamically loaded ELF object class, a subclass of ELFObjectFile. Used -// to represent a loadable ELF image. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_RUNTIMEDYLD_DYLDELFOBJECT_H -#define LLVM_RUNTIMEDYLD_DYLDELFOBJECT_H - -#include "llvm/Object/ELF.h" - - -namespace llvm { - -using support::endianness; -using namespace llvm::object; - -template<support::endianness target_endianness, bool is64Bits> -class DyldELFObject : public ELFObjectFile<target_endianness, is64Bits> { - LLVM_ELF_IMPORT_TYPES(target_endianness, is64Bits) - - typedef Elf_Shdr_Impl<target_endianness, is64Bits> Elf_Shdr; - typedef Elf_Sym_Impl<target_endianness, is64Bits> Elf_Sym; - typedef Elf_Rel_Impl<target_endianness, is64Bits, false> Elf_Rel; - typedef Elf_Rel_Impl<target_endianness, is64Bits, true> Elf_Rela; - - typedef typename ELFObjectFile<target_endianness, is64Bits>:: - Elf_Ehdr Elf_Ehdr; - Elf_Ehdr *Header; - - // Update section headers according to the current location in memory - virtual void rebaseObject(std::vector<uint8_t*> *MemoryMap); - // Record memory addresses for cleanup - virtual void saveAddress(std::vector<uint8_t*> *MemoryMap, uint8_t *addr); - -protected: - virtual error_code getSymbolAddress(DataRefImpl Symb, uint64_t &Res) const; - -public: - DyldELFObject(MemoryBuffer *Object, std::vector<uint8_t*> *MemoryMap, - error_code &ec); - - // Methods for type inquiry through isa, cast, and dyn_cast - static inline bool classof(const Binary *v) { - return (isa<ELFObjectFile<target_endianness, is64Bits> >(v) - && classof(cast<ELFObjectFile<target_endianness, is64Bits> >(v))); - } - static inline bool classof( - const ELFObjectFile<target_endianness, is64Bits> *v) { - return v->isDyldType(); - } - static inline bool classof(const DyldELFObject *v) { - return true; - } -}; - -template<support::endianness target_endianness, bool is64Bits> -DyldELFObject<target_endianness, is64Bits>::DyldELFObject(MemoryBuffer *Object, - std::vector<uint8_t*> *MemoryMap, error_code &ec) - : ELFObjectFile<target_endianness, is64Bits>(Object, ec) - , Header(0) { - this->isDyldELFObject = true; - Header = const_cast<Elf_Ehdr *>( - reinterpret_cast<const Elf_Ehdr *>(this->base())); - if (Header->e_shoff == 0) - return; - - // Mark the image as a dynamic shared library - Header->e_type = ELF::ET_DYN; - - rebaseObject(MemoryMap); -} - -// Walk through the ELF headers, updating virtual addresses to reflect where -// the object is currently loaded in memory -template<support::endianness target_endianness, bool is64Bits> -void DyldELFObject<target_endianness, is64Bits>::rebaseObject( - std::vector<uint8_t*> *MemoryMap) { - typedef typename ELFDataTypeTypedefHelper< - target_endianness, is64Bits>::value_type addr_type; - - uint8_t *base_p = const_cast<uint8_t *>(this->base()); - Elf_Shdr *sectionTable = - reinterpret_cast<Elf_Shdr *>(base_p + Header->e_shoff); - uint64_t numSections = this->getNumSections(); - - // Allocate memory space for NOBITS sections (such as .bss), which only exist - // in memory, but don't occupy space in the object file. - // Update the address in the section headers to reflect this allocation. 
- for (uint64_t index = 0; index < numSections; index++) { - Elf_Shdr *sec = reinterpret_cast<Elf_Shdr *>( - reinterpret_cast<char *>(sectionTable) + index * Header->e_shentsize); - - // Only update sections that are meant to be present in program memory - if (sec->sh_flags & ELF::SHF_ALLOC) { - uint8_t *addr = base_p + sec->sh_offset; - if (sec->sh_type == ELF::SHT_NOBITS) { - addr = static_cast<uint8_t *>(calloc(sec->sh_size, 1)); - saveAddress(MemoryMap, addr); - } - else { - // FIXME: Currently memory with RWX permissions is allocated. In the - // future, make sure that permissions are as necessary - if (sec->sh_flags & ELF::SHF_WRITE) { - // see FIXME above - } - if (sec->sh_flags & ELF::SHF_EXECINSTR) { - // see FIXME above - } - } - assert(sizeof(addr_type) == sizeof(intptr_t) && - "Cross-architecture ELF dy-load is not supported!"); - sec->sh_addr = static_cast<addr_type>(intptr_t(addr)); - } - } - - // Now allocate actual space for COMMON symbols, which also don't occupy - // space in the object file. - // We want to allocate space for all COMMON symbols at once, so the flow is: - // 1. Go over all symbols, find those that are in COMMON. For each such - // symbol, record its size and the value field in its symbol header in a - // special vector. - // 2. Allocate memory for all COMMON symbols in one fell swoop. - // 3. Using the recorded information from (1), update the address fields in - // the symbol headers of the COMMON symbols to reflect their allocated - // address. - uint64_t TotalSize = 0; - std::vector<std::pair<Elf_Addr *, uint64_t> > SymbAddrInfo; - error_code ec = object_error::success; - for (symbol_iterator si = this->begin_symbols(), - se = this->end_symbols(); si != se; si.increment(ec)) { - uint64_t Size = 0; - ec = si->getSize(Size); - Elf_Sym* symb = const_cast<Elf_Sym*>( - this->getSymbol(si->getRawDataRefImpl())); - if (ec == object_error::success && - this->getSymbolTableIndex(symb) == ELF::SHN_COMMON && Size > 0) { - SymbAddrInfo.push_back(std::make_pair(&(symb->st_value), Size)); - TotalSize += Size; - } - } - - uint8_t* SectionPtr = (uint8_t *)calloc(TotalSize, 1); - saveAddress(MemoryMap, SectionPtr); - - typedef typename std::vector<std::pair<Elf_Addr *, uint64_t> >::iterator - AddrInfoIterator; - AddrInfoIterator EndIter = SymbAddrInfo.end(); - for (AddrInfoIterator AddrIter = SymbAddrInfo.begin(); - AddrIter != EndIter; ++AddrIter) { - assert(sizeof(addr_type) == sizeof(intptr_t) && - "Cross-architecture ELF dy-load is not supported!"); - *(AddrIter->first) = static_cast<addr_type>(intptr_t(SectionPtr)); - SectionPtr += AddrIter->second; - } -} - -// Record memory addresses for callers -template<support::endianness target_endianness, bool is64Bits> -void DyldELFObject<target_endianness, is64Bits>::saveAddress( - std::vector<uint8_t*> *MemoryMap, uint8_t* addr) { - if (MemoryMap) - MemoryMap->push_back(addr); - else - errs() << "WARNING: Memory leak - cannot record memory for ELF dyld."; -} - -template<support::endianness target_endianness, bool is64Bits> -error_code DyldELFObject<target_endianness, is64Bits>::getSymbolAddress( - DataRefImpl Symb, uint64_t &Result) const { - this->validateSymbol(Symb); - const Elf_Sym *symb = this->getSymbol(Symb); - if (this->getSymbolTableIndex(symb) == ELF::SHN_COMMON) { - Result = symb->st_value; - return object_error::success; - } - else { - return ELFObjectFile<target_endianness, is64Bits>::getSymbolAddress( - Symb, Result); - } -} - -} - -#endif - diff --git a/lib/ExecutionEngine/RuntimeDyld/GDBRegistrar.cpp 
b/lib/ExecutionEngine/RuntimeDyld/GDBRegistrar.cpp new file mode 100644 index 0000000..8b50101 --- /dev/null +++ b/lib/ExecutionEngine/RuntimeDyld/GDBRegistrar.cpp @@ -0,0 +1,214 @@ +//===-- GDBRegistrar.cpp - Registers objects with GDB ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "JITRegistrar.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/Support/MutexGuard.h" +#include "llvm/Support/Mutex.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Compiler.h" + +using namespace llvm; + +// This must be kept in sync with gdb/gdb/jit.h . +extern "C" { + + typedef enum { + JIT_NOACTION = 0, + JIT_REGISTER_FN, + JIT_UNREGISTER_FN + } jit_actions_t; + + struct jit_code_entry { + struct jit_code_entry *next_entry; + struct jit_code_entry *prev_entry; + const char *symfile_addr; + uint64_t symfile_size; + }; + + struct jit_descriptor { + uint32_t version; + // This should be jit_actions_t, but we want to be specific about the + // bit-width. + uint32_t action_flag; + struct jit_code_entry *relevant_entry; + struct jit_code_entry *first_entry; + }; + + // We put information about the JITed function in this global, which the + // debugger reads. Make sure to specify the version statically, because the + // debugger checks the version before we can set it during runtime. + static struct jit_descriptor __jit_debug_descriptor = { 1, 0, 0, 0 }; + + // Debuggers puts a breakpoint in this function. + LLVM_ATTRIBUTE_NOINLINE void __jit_debug_register_code() { } + +} + +namespace { + +// Buffer for an in-memory object file in executable memory +typedef llvm::DenseMap< const char*, + std::pair<std::size_t, jit_code_entry*> > + RegisteredObjectBufferMap; + +/// Global access point for the JIT debugging interface designed for use with a +/// singleton toolbox. Handles thread-safe registration and deregistration of +/// object files that are in executable memory managed by the client of this +/// class. +class GDBJITRegistrar : public JITRegistrar { + /// A map of in-memory object files that have been registered with the + /// JIT interface. + RegisteredObjectBufferMap ObjectBufferMap; + +public: + /// Instantiates the JIT service. + GDBJITRegistrar() : ObjectBufferMap() {} + + /// Unregisters each object that was previously registered and releases all + /// internal resources. + virtual ~GDBJITRegistrar(); + + /// Creates an entry in the JIT registry for the buffer @p Object, + /// which must contain an object file in executable memory with any + /// debug information for the debugger. + void registerObject(const MemoryBuffer &Object); + + /// Removes the internal registration of @p Object, and + /// frees associated resources. + /// Returns true if @p Object was found in ObjectBufferMap. + bool deregisterObject(const MemoryBuffer &Object); + +private: + /// Deregister the debug info for the given object file from the debugger + /// and delete any temporary copies. This private method does not remove + /// the function from Map so that it can be called while iterating over Map. + void deregisterObjectInternal(RegisteredObjectBufferMap::iterator I); +}; + +/// Lock used to serialize all jit registration events, since they +/// modify global variables. +llvm::sys::Mutex JITDebugLock; + +/// Acquire the lock and do the registration. 
+void NotifyDebugger(jit_code_entry* JITCodeEntry) { + llvm::MutexGuard locked(JITDebugLock); + __jit_debug_descriptor.action_flag = JIT_REGISTER_FN; + + // Insert this entry at the head of the list. + JITCodeEntry->prev_entry = NULL; + jit_code_entry* NextEntry = __jit_debug_descriptor.first_entry; + JITCodeEntry->next_entry = NextEntry; + if (NextEntry != NULL) { + NextEntry->prev_entry = JITCodeEntry; + } + __jit_debug_descriptor.first_entry = JITCodeEntry; + __jit_debug_descriptor.relevant_entry = JITCodeEntry; + __jit_debug_register_code(); +} + +GDBJITRegistrar::~GDBJITRegistrar() { + // Free all registered object files. + for (RegisteredObjectBufferMap::iterator I = ObjectBufferMap.begin(), E = ObjectBufferMap.end(); + I != E; ++I) { + // Call the private method that doesn't update the map so our iterator + // doesn't break. + deregisterObjectInternal(I); + } + ObjectBufferMap.clear(); +} + +void GDBJITRegistrar::registerObject(const MemoryBuffer &Object) { + + const char *Buffer = Object.getBufferStart(); + size_t Size = Object.getBufferSize(); + + assert(Buffer && "Attempt to register a null object with a debugger."); + assert(ObjectBufferMap.find(Buffer) == ObjectBufferMap.end() && + "Second attempt to perform debug registration."); + jit_code_entry* JITCodeEntry = new jit_code_entry(); + + if (JITCodeEntry == 0) { + llvm::report_fatal_error( + "Allocation failed when registering a JIT entry!\n"); + } + else { + JITCodeEntry->symfile_addr = Buffer; + JITCodeEntry->symfile_size = Size; + + ObjectBufferMap[Buffer] = std::make_pair(Size, JITCodeEntry); + NotifyDebugger(JITCodeEntry); + } +} + +bool GDBJITRegistrar::deregisterObject(const MemoryBuffer& Object) { + const char *Buffer = Object.getBufferStart(); + RegisteredObjectBufferMap::iterator I = ObjectBufferMap.find(Buffer); + + if (I != ObjectBufferMap.end()) { + deregisterObjectInternal(I); + ObjectBufferMap.erase(I); + return true; + } + return false; +} + +void GDBJITRegistrar::deregisterObjectInternal( + RegisteredObjectBufferMap::iterator I) { + + jit_code_entry*& JITCodeEntry = I->second.second; + + // Acquire the lock and do the unregistration. + { + llvm::MutexGuard locked(JITDebugLock); + __jit_debug_descriptor.action_flag = JIT_UNREGISTER_FN; + + // Remove the jit_code_entry from the linked list. + jit_code_entry* PrevEntry = JITCodeEntry->prev_entry; + jit_code_entry* NextEntry = JITCodeEntry->next_entry; + + if (NextEntry) { + NextEntry->prev_entry = PrevEntry; + } + if (PrevEntry) { + PrevEntry->next_entry = NextEntry; + } + else { + assert(__jit_debug_descriptor.first_entry == JITCodeEntry); + __jit_debug_descriptor.first_entry = NextEntry; + } + + // Tell the debugger which entry we removed, and unregister the code. 
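// (Unregistration mirrors NotifyDebugger() above; the GDB JIT interface is
// always the same three steps: set action_flag, point relevant_entry at the
// affected jit_code_entry, then call the empty hook the debugger has set a
// breakpoint on:
//
//   __jit_debug_descriptor.action_flag = JIT_UNREGISTER_FN; // done above
//   __jit_debug_descriptor.relevant_entry = JITCodeEntry;
//   __jit_debug_register_code();
//
// GDB stops inside the hook and re-reads __jit_debug_descriptor.)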
+ __jit_debug_descriptor.relevant_entry = JITCodeEntry;
+ __jit_debug_register_code();
+ }
+
+ delete JITCodeEntry;
+ JITCodeEntry = NULL;
+}
+
+} // end namespace
+
+namespace llvm {
+
+JITRegistrar& JITRegistrar::getGDBRegistrar() {
+ static GDBJITRegistrar* sRegistrar = NULL;
+ if (sRegistrar == NULL) {
+ // The mutex is here so that it won't slow down access once the registrar
+ // is instantiated
+ llvm::MutexGuard locked(JITDebugLock);
+ // Check again to be sure another thread didn't create this while we waited
+ if (sRegistrar == NULL) {
+ sRegistrar = new GDBJITRegistrar;
+ }
+ }
+ return *sRegistrar;
+}
+
+} // namespace llvm
diff --git a/lib/ExecutionEngine/RuntimeDyld/JITRegistrar.h b/lib/ExecutionEngine/RuntimeDyld/JITRegistrar.h
new file mode 100644
index 0000000..f964bc6
--- /dev/null
+++ b/lib/ExecutionEngine/RuntimeDyld/JITRegistrar.h
@@ -0,0 +1,43 @@
+//===-- JITRegistrar.h - Registers objects with a debugger ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTION_ENGINE_JIT_REGISTRAR_H
+#define LLVM_EXECUTION_ENGINE_JIT_REGISTRAR_H
+
+#include "llvm/Support/MemoryBuffer.h"
+
+namespace llvm {
+
+/// Global access point for the JIT debugging interface.
+class JITRegistrar {
+public:
+ /// Instantiates the JIT service.
+ JITRegistrar() {}
+
+ /// Unregisters each object that was previously registered and releases all
+ /// internal resources.
+ virtual ~JITRegistrar() {}
+
+ /// Creates an entry in the JIT registry for the buffer @p Object,
+ /// which must contain an object file in executable memory with any
+ /// debug information for the debugger.
+ virtual void registerObject(const MemoryBuffer &Object) = 0;
+
+ /// Removes the internal registration of @p Object, and
+ /// frees associated resources.
+ /// Returns true if @p Object was previously registered.
+ virtual bool deregisterObject(const MemoryBuffer &Object) = 0;
+
+ /// Returns a reference to a GDB JIT registrar singleton
+ static JITRegistrar& getGDBRegistrar();
+};
+
+} // end namespace llvm
+
+#endif // LLVM_EXECUTION_ENGINE_JIT_REGISTRAR_H
diff --git a/lib/ExecutionEngine/RuntimeDyld/ObjectImage.h b/lib/ExecutionEngine/RuntimeDyld/ObjectImage.h
new file mode 100644
index 0000000..8206ead
--- /dev/null
+++ b/lib/ExecutionEngine/RuntimeDyld/ObjectImage.h
@@ -0,0 +1,59 @@
+//===---- ObjectImage.h - Format independent executable object image -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares a file format independent ObjectImage class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_RUNTIMEDYLD_OBJECT_IMAGE_H
+#define LLVM_RUNTIMEDYLD_OBJECT_IMAGE_H
+
+#include "llvm/Object/ObjectFile.h"
+
+namespace llvm {
+
+class ObjectImage {
+ ObjectImage(); // = delete
+ ObjectImage(const ObjectImage &other); // = delete
+protected:
+ object::ObjectFile *ObjFile;
+
+public:
+ ObjectImage(object::ObjectFile *Obj) { ObjFile = Obj; }
+ virtual ~ObjectImage() {}
+
+ virtual object::symbol_iterator begin_symbols() const
+ { return ObjFile->begin_symbols(); }
+ virtual object::symbol_iterator end_symbols() const
+ { return ObjFile->end_symbols(); }
+
+ virtual object::section_iterator begin_sections() const
+ { return ObjFile->begin_sections(); }
+ virtual object::section_iterator end_sections() const
+ { return ObjFile->end_sections(); }
+
+ virtual /* Triple::ArchType */ unsigned getArch() const
+ { return ObjFile->getArch(); }
+
+ // Subclasses can override these methods to update the image with loaded
+ // addresses for sections and common symbols
+ virtual void updateSectionAddress(const object::SectionRef &Sec,
+ uint64_t Addr) {}
+ virtual void updateSymbolAddress(const object::SymbolRef &Sym, uint64_t Addr)
+ {}
+
+ // Subclasses can override this method to provide JIT debugging support
+ virtual void registerWithDebugger() {}
+ virtual void deregisterWithDebugger() {}
+};
+
+} // end namespace llvm
+
+#endif // LLVM_RUNTIMEDYLD_OBJECT_IMAGE_H
+
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index ff4a2c8..1b1840a 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -26,11 +26,19 @@ RuntimeDyldImpl::~RuntimeDyldImpl() {}
namespace llvm {
-
+namespace {
+ // Helper for extensive error checking in debug builds.
+ error_code Check(error_code Err) {
+ if (Err) {
+ report_fatal_error(Err.message());
+ }
+ return Err;
+ }
+} // end anonymous namespace
// Resolve the relocations for all symbols we currently know about.
void RuntimeDyldImpl::resolveRelocations() {
- // First, resolve relocations assotiated with external symbols.
+ // First, resolve relocations associated with external symbols.
resolveSymbols();
// Just iterate over the sections we have and resolve all the relocations
@@ -51,90 +59,110 @@ void RuntimeDyldImpl::mapSectionAddress(void *LocalAddress,
llvm_unreachable("Attempting to remap address of unknown section!");
}
+// Subclasses can implement this method to create specialized image instances.
+// The caller owns the pointer that is returned.
+ObjectImage *RuntimeDyldImpl::createObjectImage(const MemoryBuffer *InputBuffer) {
+ ObjectFile *ObjFile = ObjectFile::createObjectFile(const_cast<MemoryBuffer*>
+ (InputBuffer));
+ ObjectImage *Obj = new ObjectImage(ObjFile);
+ return Obj;
+}
+
bool RuntimeDyldImpl::loadObject(const MemoryBuffer *InputBuffer) {
- // FIXME: ObjectFile don't modify MemoryBuffer.
- // It should use const MemoryBuffer as parameter.
- ObjectFile *obj = ObjectFile::
- createObjectFile(const_cast<MemoryBuffer*>(InputBuffer));
+ OwningPtr<ObjectImage> obj(createObjectImage(InputBuffer));
+ if (!obj)
+ report_fatal_error("Unable to create object image from memory buffer!");
Arch = (Triple::ArchType)obj->getArch();
LocalSymbolMap LocalSymbols; // Functions and data symbols from the
// object file.
ObjSectionToIDMap LocalSections; // Used sections from the object file + CommonSymbolMap CommonSymbols; // Common symbols requiring allocation + uint64_t CommonSize = 0; error_code err; - - // Parse symbols DEBUG(dbgs() << "Parse symbols:\n"); - for (symbol_iterator it = obj->begin_symbols(), itEnd = obj->end_symbols(); - it != itEnd; it.increment(err)) { - if (err) break; + for (symbol_iterator i = obj->begin_symbols(), e = obj->end_symbols(); + i != e; i.increment(err)) { + Check(err); object::SymbolRef::Type SymType; StringRef Name; - if ((bool)(err = it->getType(SymType))) break; - if ((bool)(err = it->getName(Name))) break; - - if (SymType == object::SymbolRef::ST_Function || - SymType == object::SymbolRef::ST_Data) { - uint64_t FileOffset; - uint32_t flags; - StringRef sData; - section_iterator sIt = obj->end_sections(); - if ((bool)(err = it->getFileOffset(FileOffset))) break; - if ((bool)(err = it->getFlags(flags))) break; - if ((bool)(err = it->getSection(sIt))) break; - if (sIt == obj->end_sections()) continue; - if ((bool)(err = sIt->getContents(sData))) break; - const uint8_t* SymPtr = (const uint8_t*)InputBuffer->getBufferStart() + - (uintptr_t)FileOffset; - uintptr_t SectOffset = (uintptr_t)(SymPtr - (const uint8_t*)sData.begin()); - unsigned SectionID = - findOrEmitSection(*sIt, - SymType == object::SymbolRef::ST_Function, - LocalSections); - bool isGlobal = flags & SymbolRef::SF_Global; - LocalSymbols[Name.data()] = SymbolLoc(SectionID, SectOffset); - DEBUG(dbgs() << "\tFileOffset: " << format("%p", (uintptr_t)FileOffset) - << " flags: " << flags - << " SID: " << SectionID - << " Offset: " << format("%p", SectOffset)); - if (isGlobal) - SymbolTable[Name] = SymbolLoc(SectionID, SectOffset); + Check(i->getType(SymType)); + Check(i->getName(Name)); + + uint32_t flags; + Check(i->getFlags(flags)); + + bool isCommon = flags & SymbolRef::SF_Common; + if (isCommon) { + // Add the common symbols to a list. We'll allocate them all below. 
+ uint64_t Size = 0; + Check(i->getSize(Size)); + CommonSize += Size; + CommonSymbols[*i] = Size; + } else { + if (SymType == object::SymbolRef::ST_Function || + SymType == object::SymbolRef::ST_Data) { + uint64_t FileOffset; + StringRef sData; + section_iterator si = obj->end_sections(); + Check(i->getFileOffset(FileOffset)); + Check(i->getSection(si)); + if (si == obj->end_sections()) continue; + Check(si->getContents(sData)); + const uint8_t* SymPtr = (const uint8_t*)InputBuffer->getBufferStart() + + (uintptr_t)FileOffset; + uintptr_t SectOffset = (uintptr_t)(SymPtr - (const uint8_t*)sData.begin()); + unsigned SectionID = + findOrEmitSection(*obj, + *si, + SymType == object::SymbolRef::ST_Function, + LocalSections); + bool isGlobal = flags & SymbolRef::SF_Global; + LocalSymbols[Name.data()] = SymbolLoc(SectionID, SectOffset); + DEBUG(dbgs() << "\tFileOffset: " << format("%p", (uintptr_t)FileOffset) + << " flags: " << flags + << " SID: " << SectionID + << " Offset: " << format("%p", SectOffset)); + if (isGlobal) + SymbolTable[Name] = SymbolLoc(SectionID, SectOffset); + } } DEBUG(dbgs() << "\tType: " << SymType << " Name: " << Name << "\n"); } - if (err) { - report_fatal_error(err.message()); - } + + // Allocate common symbols + if (CommonSize != 0) + emitCommonSymbols(*obj, CommonSymbols, CommonSize, LocalSymbols); // Parse and process relocations DEBUG(dbgs() << "Parse relocations:\n"); - for (section_iterator sIt = obj->begin_sections(), - sItEnd = obj->end_sections(); sIt != sItEnd; sIt.increment(err)) { - if (err) break; + for (section_iterator si = obj->begin_sections(), + se = obj->end_sections(); si != se; si.increment(err)) { + Check(err); bool isFirstRelocation = true; unsigned SectionID = 0; StubMap Stubs; - for (relocation_iterator it = sIt->begin_relocations(), - itEnd = sIt->end_relocations(); it != itEnd; it.increment(err)) { - if (err) break; + for (relocation_iterator i = si->begin_relocations(), + e = si->end_relocations(); i != e; i.increment(err)) { + Check(err); // If it's the first relocation in this section, find its SectionID if (isFirstRelocation) { - SectionID = findOrEmitSection(*sIt, true, LocalSections); + SectionID = findOrEmitSection(*obj, *si, true, LocalSections); DEBUG(dbgs() << "\tSectionID: " << SectionID << "\n"); isFirstRelocation = false; } ObjRelocationInfo RI; RI.SectionID = SectionID; - if ((bool)(err = it->getAdditionalInfo(RI.AdditionalInfo))) break; - if ((bool)(err = it->getOffset(RI.Offset))) break; - if ((bool)(err = it->getSymbol(RI.Symbol))) break; - if ((bool)(err = it->getType(RI.Type))) break; + Check(i->getAdditionalInfo(RI.AdditionalInfo)); + Check(i->getOffset(RI.Offset)); + Check(i->getSymbol(RI.Symbol)); + Check(i->getType(RI.Type)); DEBUG(dbgs() << "\t\tAddend: " << RI.AdditionalInfo << " Offset: " << format("%p", (uintptr_t)RI.Offset) @@ -142,61 +170,138 @@ bool RuntimeDyldImpl::loadObject(const MemoryBuffer *InputBuffer) { << "\n"); processRelocationRef(RI, *obj, LocalSections, LocalSymbols, Stubs); } - if (err) { - report_fatal_error(err.message()); - } } + + handleObjectLoaded(obj.take()); + return false; } -unsigned RuntimeDyldImpl::emitSection(const SectionRef &Section, +unsigned RuntimeDyldImpl::emitCommonSymbols(ObjectImage &Obj, + const CommonSymbolMap &Map, + uint64_t TotalSize, + LocalSymbolMap &LocalSymbols) { + // Allocate memory for the section + unsigned SectionID = Sections.size(); + uint8_t *Addr = MemMgr->allocateDataSection(TotalSize, sizeof(void*), + SectionID); + if (!Addr) + report_fatal_error("Unable to allocate memory for common symbols!"); + uint64_t Offset = 0; + Sections.push_back(SectionEntry(Addr, TotalSize, TotalSize, 0)); + memset(Addr, 0, TotalSize); + + DEBUG(dbgs() << "emitCommonSection SectionID: " << SectionID + << " new addr: " << format("%p", Addr) + << " DataSize: " << TotalSize + << "\n"); + + // Assign the address of each symbol + for (CommonSymbolMap::const_iterator it = Map.begin(), itEnd = Map.end(); + it != itEnd; it++) { + uint64_t Size = it->second; + StringRef Name; + it->first.getName(Name); + Obj.updateSymbolAddress(it->first, (uint64_t)Addr); + LocalSymbols[Name.data()] = SymbolLoc(SectionID, Offset); + Offset += Size; + Addr += Size; + } + + return SectionID; +} + +unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj, + const SectionRef &Section, + bool IsCode) { unsigned StubBufSize = 0, StubSize = getMaxStubSize(); error_code err; if (StubSize > 0) { - for (relocation_iterator it = Section.begin_relocations(), - itEnd = Section.end_relocations(); it != itEnd; it.increment(err)) + for (relocation_iterator i = Section.begin_relocations(), - e = Section.end_relocations(); i != e; i.increment(err), Check(err)) StubBufSize += StubSize; } StringRef data; uint64_t Alignment64; - if ((bool)(err = Section.getContents(data))) report_fatal_error(err.message()); - if ((bool)(err = Section.getAlignment(Alignment64))) - report_fatal_error(err.message()); + Check(Section.getContents(data)); + Check(Section.getAlignment(Alignment64)); unsigned Alignment = (unsigned)Alignment64 & 0xffffffffL; - unsigned DataSize = data.size(); - unsigned Allocate = DataSize + StubBufSize; + bool IsRequired; + bool IsVirtual; + bool IsZeroInit; + uint64_t DataSize; + Check(Section.isRequiredForExecution(IsRequired)); + Check(Section.isVirtual(IsVirtual)); + Check(Section.isZeroInit(IsZeroInit)); + Check(Section.getSize(DataSize)); + + unsigned Allocate; unsigned SectionID = Sections.size(); - const char *pData = data.data(); - uint8_t *Addr = IsCode - ? MemMgr->allocateCodeSection(Allocate, Alignment, SectionID) - : MemMgr->allocateDataSection(Allocate, Alignment, SectionID); - - memcpy(Addr, pData, DataSize); - DEBUG(dbgs() << "emitSection SectionID: " << SectionID - << " obj addr: " << format("%p", pData) - << " new addr: " << format("%p", Addr) - << " DataSize: " << DataSize - << " StubBufSize: " << StubBufSize - << " Allocate: " << Allocate - << "\n"); + uint8_t *Addr; + const char *pData = 0; + + // Some sections, such as debug info, don't need to be loaded for execution. + // Leave those where they are. + if (IsRequired) { + Allocate = DataSize + StubBufSize; + Addr = IsCode + ? MemMgr->allocateCodeSection(Allocate, Alignment, SectionID) + : MemMgr->allocateDataSection(Allocate, Alignment, SectionID); + if (!Addr) + report_fatal_error("Unable to allocate section memory!"); + + // Virtual sections have no data in the object image, so leave pData = 0 + if (!IsVirtual) + pData = data.data(); + + // Zero-initialize or copy the data from the image + if (IsZeroInit || IsVirtual) + memset(Addr, 0, DataSize); + else + memcpy(Addr, pData, DataSize); + + DEBUG(dbgs() << "emitSection SectionID: " << SectionID + << " obj addr: " << format("%p", pData) + << " new addr: " << format("%p", Addr) + << " DataSize: " << DataSize + << " StubBufSize: " << StubBufSize + << " Allocate: " << Allocate + << "\n"); + Obj.updateSectionAddress(Section, (uint64_t)Addr); + } + else { + // Even if we didn't load the section, we need to record an entry for it + // to handle later processing (and by 'handle' I mean don't do anything + // with these sections). + Allocate = 0; + Addr = 0; + DEBUG(dbgs() << "emitSection SectionID: " << SectionID + << " obj addr: " << format("%p", data.data()) + << " new addr: 0" + << " DataSize: " << DataSize + << " StubBufSize: " << StubBufSize + << " Allocate: " << Allocate + << "\n"); + } + Sections.push_back(SectionEntry(Addr, Allocate, DataSize,(uintptr_t)pData)); return SectionID; } -unsigned RuntimeDyldImpl:: -findOrEmitSection(const SectionRef &Section, bool IsCode, - ObjSectionToIDMap &LocalSections) { +unsigned RuntimeDyldImpl::findOrEmitSection(ObjectImage &Obj, + const SectionRef &Section, + bool IsCode, + ObjSectionToIDMap &LocalSections) { unsigned SectionID = 0; - ObjSectionToIDMap::iterator sIDIt = LocalSections.find(Section); - if (sIDIt != LocalSections.end()) - SectionID = sIDIt->second; + ObjSectionToIDMap::iterator i = LocalSections.find(Section); + if (i != LocalSections.end()) + SectionID = i->second; else { - SectionID = emitSection(Section, IsCode); + SectionID = emitSection(Obj, Section, IsCode); LocalSections[Section] = SectionID; } return SectionID; @@ -259,15 +364,18 @@ void RuntimeDyldImpl::reassignSectionAddress(unsigned SectionID, void RuntimeDyldImpl::resolveRelocationEntry(const RelocationEntry &RE, uint64_t Value) { - uint8_t *Target = Sections[RE.SectionID].Address + RE.Offset; - DEBUG(dbgs() << "\tSectionID: " << RE.SectionID - << " + " << RE.Offset << " (" << format("%p", Target) << ")" - << " Data: " << RE.Data - << " Addend: " << RE.Addend - << "\n"); - - resolveRelocation(Target, Sections[RE.SectionID].LoadAddress + RE.Offset, - Value, RE.Data, RE.Addend); + // Ignore relocations for sections that were not loaded + if (Sections[RE.SectionID].Address != 0) { + uint8_t *Target = Sections[RE.SectionID].Address + RE.Offset; + DEBUG(dbgs() << "\tSectionID: " << RE.SectionID + << " + " << RE.Offset << " (" << format("%p", Target) << ")" + << " Data: " << RE.Data + << " Addend: " << RE.Addend + << "\n"); + + resolveRelocation(Target, Sections[RE.SectionID].LoadAddress + RE.Offset, + Value, RE.Data, RE.Addend); + } } void RuntimeDyldImpl::resolveRelocationList(const RelocationList &Relocs, @@ -280,11 +388,11 @@ void RuntimeDyldImpl::resolveRelocationList(const RelocationList &Relocs, // resolveSymbols - Resolve any relocations to the specified symbols if // we know where they live.
void RuntimeDyldImpl::resolveSymbols() { - StringMap<RelocationList>::iterator it = SymbolRelocations.begin(), - itEnd = SymbolRelocations.end(); - for (; it != itEnd; it++) { - StringRef Name = it->first(); - RelocationList &Relocs = it->second; + StringMap<RelocationList>::iterator i = SymbolRelocations.begin(), + e = SymbolRelocations.end(); + for (; i != e; i++) { + StringRef Name = i->first(); + RelocationList &Relocs = i->second; StringMap<SymbolLoc>::const_iterator Loc = SymbolTable.find(Name); if (Loc == SymbolTable.end()) { // This is an external symbol, try to get its address from diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index 9351b6c..db6da8c 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -20,11 +20,176 @@ #include "llvm/Object/ObjectFile.h" #include "llvm/Support/ELF.h" #include "llvm/ADT/Triple.h" +#include "llvm/Object/ELF.h" +#include "JITRegistrar.h" using namespace llvm; using namespace llvm::object; +namespace { + +template<support::endianness target_endianness, bool is64Bits> +class DyldELFObject : public ELFObjectFile<target_endianness, is64Bits> { + LLVM_ELF_IMPORT_TYPES(target_endianness, is64Bits) + + typedef Elf_Shdr_Impl<target_endianness, is64Bits> Elf_Shdr; + typedef Elf_Sym_Impl<target_endianness, is64Bits> Elf_Sym; + typedef Elf_Rel_Impl<target_endianness, is64Bits, false> Elf_Rel; + typedef Elf_Rel_Impl<target_endianness, is64Bits, true> Elf_Rela; + + typedef typename ELFObjectFile<target_endianness, is64Bits>:: Elf_Ehdr Elf_Ehdr; + + typedef typename ELFDataTypeTypedefHelper< target_endianness, is64Bits>::value_type addr_type; + +protected: + // This duplicates the 'Data' member in the 'Binary' base class + // but it is necessary to work around a bug in gcc 4.2 + MemoryBuffer *InputData; + +public: + DyldELFObject(MemoryBuffer *Object, error_code &ec); + + void updateSectionAddress(const SectionRef &Sec, uint64_t Addr); + void updateSymbolAddress(const SymbolRef &Sym, uint64_t Addr); + + const MemoryBuffer& getBuffer() const { return *InputData; } + + // Methods for type inquiry through isa, cast, and dyn_cast + static inline bool classof(const Binary *v) { + return (isa<ELFObjectFile<target_endianness, is64Bits> >(v) + && classof(cast<ELFObjectFile<target_endianness, is64Bits> >(v))); + } + static inline bool classof( + const ELFObjectFile<target_endianness, is64Bits> *v) { + return v->isDyldType(); + } + static inline bool classof(const DyldELFObject *v) { + return true; + } +}; + +template<support::endianness target_endianness, bool is64Bits> +class ELFObjectImage : public ObjectImage { + protected: + DyldELFObject<target_endianness, is64Bits> *DyldObj; + bool Registered; + + public: + ELFObjectImage(DyldELFObject<target_endianness, is64Bits> *Obj) + : ObjectImage(Obj), + DyldObj(Obj), + Registered(false) {} + + virtual ~ELFObjectImage() { + if (Registered) + deregisterWithDebugger(); + } + + // Subclasses can override these methods to update the image with loaded + // addresses for sections and common symbols + virtual void updateSectionAddress(const SectionRef &Sec, uint64_t Addr) + { + DyldObj->updateSectionAddress(Sec, Addr); + } + + virtual void updateSymbolAddress(const SymbolRef &Sym, uint64_t Addr) + { + DyldObj->updateSymbolAddress(Sym, Addr); + } + + virtual void registerWithDebugger() + { + JITRegistrar::getGDBRegistrar().registerObject(DyldObj->getBuffer()); + Registered = true; + } + virtual
void deregisterWithDebugger() + { + JITRegistrar::getGDBRegistrar().deregisterObject(DyldObj->getBuffer()); + } +}; + +template<support::endianness target_endianness, bool is64Bits> +DyldELFObject<target_endianness, is64Bits>::DyldELFObject(MemoryBuffer *Object, + error_code &ec) + : ELFObjectFile<target_endianness, is64Bits>(Object, ec), + InputData(Object) { + this->isDyldELFObject = true; +} + +template<support::endianness target_endianness, bool is64Bits> +void DyldELFObject<target_endianness, is64Bits>::updateSectionAddress( + const SectionRef &Sec, + uint64_t Addr) { + DataRefImpl ShdrRef = Sec.getRawDataRefImpl(); + Elf_Shdr *shdr = const_cast<Elf_Shdr*>( + reinterpret_cast<const Elf_Shdr *>(ShdrRef.p)); + + // This assumes the address passed in matches the target address bitness + // The template-based type cast handles everything else. + shdr->sh_addr = static_cast<addr_type>(Addr); +} + +template<support::endianness target_endianness, bool is64Bits> +void DyldELFObject<target_endianness, is64Bits>::updateSymbolAddress( + const SymbolRef &SymRef, + uint64_t Addr) { + + Elf_Sym *sym = const_cast<Elf_Sym*>( + ELFObjectFile<target_endianness, is64Bits>:: + getSymbol(SymRef.getRawDataRefImpl())); + + // This assumes the address passed in matches the target address bitness + // The template-based type cast handles everything else. + sym->st_value = static_cast<addr_type>(Addr); +} + +} // namespace + + namespace llvm { +ObjectImage *RuntimeDyldELF::createObjectImage( + const MemoryBuffer *ConstInputBuffer) { + MemoryBuffer *InputBuffer = const_cast<MemoryBuffer*>(ConstInputBuffer); + std::pair<unsigned char, unsigned char> Ident = getElfArchType(InputBuffer); + error_code ec; + + if (Ident.first == ELF::ELFCLASS32 && Ident.second == ELF::ELFDATA2LSB) { + DyldELFObject<support::little, false> *Obj = + new DyldELFObject<support::little, false>(InputBuffer, ec); + return new ELFObjectImage<support::little, false>(Obj); + } + else if (Ident.first == ELF::ELFCLASS32 && Ident.second == ELF::ELFDATA2MSB) { + DyldELFObject<support::big, false> *Obj = + new DyldELFObject<support::big, false>(InputBuffer, ec); + return new ELFObjectImage<support::big, false>(Obj); + } + else if (Ident.first == ELF::ELFCLASS64 && Ident.second == ELF::ELFDATA2MSB) { + DyldELFObject<support::big, true> *Obj = + new DyldELFObject<support::big, true>(InputBuffer, ec); + return new ELFObjectImage<support::big, true>(Obj); + } + else if (Ident.first == ELF::ELFCLASS64 && Ident.second == ELF::ELFDATA2LSB) { + DyldELFObject<support::little, true> *Obj = + new DyldELFObject<support::little, true>(InputBuffer, ec); + return new ELFObjectImage<support::little, true>(Obj); + } + else + llvm_unreachable("Unexpected ELF format"); +} + +void RuntimeDyldELF::handleObjectLoaded(ObjectImage *Obj) +{ + Obj->registerWithDebugger(); + // Save the loaded object. 
It will deregister itself when deleted + LoadedObject = Obj; +} + +RuntimeDyldELF::~RuntimeDyldELF() { + if (LoadedObject) + delete LoadedObject; +} void RuntimeDyldELF::resolveX86_64Relocation(uint8_t *LocalAddress, uint64_t FinalAddress, @@ -71,7 +236,8 @@ void RuntimeDyldELF::resolveX86Relocation(uint8_t *LocalAddress, switch (Type) { case ELF::R_386_32: { uint32_t *Target = (uint32_t*)(LocalAddress); - *Target = Value + Addend; + uint32_t Placeholder = *Target; + *Target = Placeholder + Value + Addend; break; } case ELF::R_386_PC32: { @@ -165,10 +331,11 @@ void RuntimeDyldELF::resolveRelocation(uint8_t *LocalAddress, } } -void RuntimeDyldELF:: -processRelocationRef(const ObjRelocationInfo &Rel, const ObjectFile &Obj, - ObjSectionToIDMap &ObjSectionToID, - LocalSymbolMap &Symbols, StubMap &Stubs) { +void RuntimeDyldELF::processRelocationRef(const ObjRelocationInfo &Rel, + ObjectImage &Obj, + ObjSectionToIDMap &ObjSectionToID, + LocalSymbolMap &Symbols, + StubMap &Stubs) { uint32_t RelType = (uint32_t)(Rel.Type & 0xffffffffL); intptr_t Addend = (intptr_t)Rel.AdditionalInfo; @@ -181,16 +348,16 @@ processRelocationRef(const ObjRelocationInfo &Rel, const ObjectFile &Obj, << " TargetName: " << TargetName << "\n"); // First look the symbol in object file symbols. - LocalSymbolMap::iterator it = Symbols.find(TargetName.data()); - if (it != Symbols.end()) { - Value.SectionID = it->second.first; - Value.Addend = it->second.second; + LocalSymbolMap::iterator lsi = Symbols.find(TargetName.data()); + if (lsi != Symbols.end()) { + Value.SectionID = lsi->second.first; + Value.Addend = lsi->second.second; } else { // Second look the symbol in global symbol table. - StringMap<SymbolLoc>::iterator itS = SymbolTable.find(TargetName.data()); - if (itS != SymbolTable.end()) { - Value.SectionID = itS->second.first; - Value.Addend = itS->second.second; + StringMap<SymbolLoc>::iterator gsi = SymbolTable.find(TargetName.data()); + if (gsi != SymbolTable.end()) { + Value.SectionID = gsi->second.first; + Value.Addend = gsi->second.second; } else { SymbolRef::Type SymType; Symbol.getType(SymType); @@ -199,12 +366,12 @@ processRelocationRef(const ObjRelocationInfo &Rel, const ObjectFile &Obj, // TODO: Now ELF SymbolRef::ST_Debug = STT_SECTION, it's not obviously // and can be changed by another developers. Maybe best way is add // a new symbol type ST_Section to SymbolRef and use it. - section_iterator sIt = Obj.end_sections(); - Symbol.getSection(sIt); - if (sIt == Obj.end_sections()) + section_iterator si = Obj.end_sections(); + Symbol.getSection(si); + if (si == Obj.end_sections()) llvm_unreachable("Symbol section not found, bad object file format!"); DEBUG(dbgs() << "\t\tThis is section symbol\n"); - Value.SectionID = findOrEmitSection((*sIt), true, ObjSectionToID); + Value.SectionID = findOrEmitSection(Obj, (*si), true, ObjSectionToID); Value.Addend = Addend; break; } @@ -232,10 +399,10 @@ processRelocationRef(const ObjRelocationInfo &Rel, const ObjectFile &Obj, uint8_t *Target = Section.Address + Rel.Offset; // Look up for existing stub. - StubMap::const_iterator stubIt = Stubs.find(Value); - if (stubIt != Stubs.end()) { - resolveRelocation(Target, Section.LoadAddress, (uint64_t)Section.Address + - stubIt->second, RelType, 0); + StubMap::const_iterator i = Stubs.find(Value); + if (i != Stubs.end()) { + resolveRelocation(Target, (uint64_t)Target, (uint64_t)Section.Address + + i->second, RelType, 0); DEBUG(dbgs() << " Stub function found\n"); } else { // Create a new stub function. 
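For reference: createObjectImage above picks one of four DyldELFObject instantiations from the first interesting bytes of the ELF identification header. A minimal standalone sketch of that classification, mirroring the getElfArchType helper this patch relies on; the name elfArchType and the raw-buffer signature are illustrative, not LLVM's API:

#include <cstddef>
#include <utility>

// Sketch: classify an ELF image by class (32- vs 64-bit) and data encoding
// (little- vs big-endian) from the e_ident bytes. The constants are the ELF
// spec values: EI_NIDENT = 16, EI_CLASS = 4, EI_DATA = 5.
static std::pair<unsigned char, unsigned char>
elfArchType(const char *Buf, std::size_t Size) {
  if (Size < 16)                                 // shorter than e_ident
    return std::make_pair((unsigned char)0,      // ELFCLASSNONE
                          (unsigned char)0);     // ELFDATANONE
  return std::make_pair((unsigned char)Buf[4],   // EI_CLASS: 1 = 32-bit, 2 = 64-bit
                        (unsigned char)Buf[5]);  // EI_DATA:  1 = LSB, 2 = MSB
}

A (class, data) result of (1, 1) selects DyldELFObject<support::little, false>, (2, 2) selects DyldELFObject<support::big, true>, and so on; any other combination hits the llvm_unreachable above.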
@@ -245,7 +412,7 @@ processRelocationRef(const ObjRelocationInfo &Rel, const ObjectFile &Obj, Section.StubOffset); AddRelocation(Value, Rel.SectionID, StubTargetAddr - Section.Address, ELF::R_ARM_ABS32); - resolveRelocation(Target, Section.LoadAddress, (uint64_t)Section.Address + + resolveRelocation(Target, (uint64_t)Target, (uint64_t)Section.Address + Section.StubOffset, RelType, 0); Section.StubOffset += getMaxStubSize(); } diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h index 36566da..e7f6fab 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h @@ -22,6 +22,8 @@ using namespace llvm; namespace llvm { class RuntimeDyldELF : public RuntimeDyldImpl { protected: + ObjectImage *LoadedObject; + void resolveX86_64Relocation(uint8_t *LocalAddress, uint64_t FinalAddress, uint64_t Value, @@ -47,12 +49,18 @@ protected: int64_t Addend); virtual void processRelocationRef(const ObjRelocationInfo &Rel, - const ObjectFile &Obj, + ObjectImage &Obj, ObjSectionToIDMap &ObjSectionToID, LocalSymbolMap &Symbols, StubMap &Stubs); + virtual ObjectImage *createObjectImage(const MemoryBuffer *InputBuffer); + virtual void handleObjectLoaded(ObjectImage *Obj); + public: - RuntimeDyldELF(RTDyldMemoryManager *mm) : RuntimeDyldImpl(mm) {} + RuntimeDyldELF(RTDyldMemoryManager *mm) + : RuntimeDyldImpl(mm), LoadedObject(0) {} + + virtual ~RuntimeDyldELF(); bool isCompatibleFormat(const MemoryBuffer *InputBuffer) const; }; diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h index d6430a9..2dea13f 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h @@ -29,6 +29,7 @@ #include "llvm/ADT/Triple.h" #include <map> #include "llvm/Support/Format.h" +#include "ObjectImage.h" using namespace llvm; using namespace llvm::object; @@ -110,6 +111,9 @@ protected: StringMap<SymbolLoc> SymbolTable; typedef DenseMap<const char*, SymbolLoc> LocalSymbolMap; + // Keep a map of common symbols to their sizes + typedef std::map<SymbolRef, unsigned> CommonSymbolMap; + // For each symbol, keep a list of relocations based on it. Anytime // its address is reassigned (the JIT re-compiled the function, e.g.), // the relocations get re-resolved. @@ -149,18 +153,29 @@ protected: return (uint8_t*)Sections[SectionID].Address; } + /// \brief Emits a section containing common symbols. + /// \return SectionID. + unsigned emitCommonSymbols(ObjectImage &Obj, + const CommonSymbolMap &Map, + uint64_t TotalSize, + LocalSymbolMap &Symbols); + /// \brief Emits section data from the object file to the MemoryManager. /// \param IsCode if it's true then allocateCodeSection() will be /// used for emission, else allocateDataSection() will be used. /// \return SectionID. - unsigned emitSection(const SectionRef &Section, bool IsCode); + unsigned emitSection(ObjectImage &Obj, + const SectionRef &Section, + bool IsCode); /// \brief Find Section in LocalSections. If the section is not found, emit /// it and store it in LocalSections. /// \param IsCode if it's true then allocateCodeSection() will be /// used for emission, else allocateDataSection() will be used. /// \return SectionID. - unsigned findOrEmitSection(const SectionRef &Section, bool IsCode, + unsigned findOrEmitSection(ObjectImage &Obj, + const SectionRef &Section, + bool IsCode, ObjSectionToIDMap &LocalSections); /// \brief If Value.SymbolName is NULL then store relocation to the @@ -191,11 +206,18 @@ protected: /// \brief Parses the object file relocation and stores it in Relocations /// or SymbolRelocations. The behavior depends on the object file type. virtual void processRelocationRef(const ObjRelocationInfo &Rel, - const ObjectFile &Obj, + ObjectImage &Obj, ObjSectionToIDMap &ObjSectionToID, LocalSymbolMap &Symbols, StubMap &Stubs) = 0; void resolveSymbols(); + virtual ObjectImage *createObjectImage(const MemoryBuffer *InputBuffer); + virtual void handleObjectLoaded(ObjectImage *Obj) + { + // Subclasses may choose to retain this image if they have a use for it. + delete Obj; + } + public: RuntimeDyldImpl(RTDyldMemoryManager *mm) : MemMgr(mm), HasError(false) {} diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp index 24437e0..b7f515d 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp @@ -42,8 +42,7 @@ void RuntimeDyldMachO::resolveRelocation(uint8_t *LocalAddress, // This just dispatches to the proper target specific routine. switch (Arch) { default: llvm_unreachable("Unsupported CPU type!"); - case Triple::x86_64: // Fall through. - case Triple::x86: + case Triple::x86_64: resolveX86_64Relocation(LocalAddress, FinalAddress, (uintptr_t)Value, @@ -52,6 +51,15 @@ void RuntimeDyldMachO::resolveRelocation(uint8_t *LocalAddress, Size, Addend); break; + case Triple::x86: + resolveI386Relocation(LocalAddress, + FinalAddress, + (uintptr_t)Value, + isPCRel, + Type, + Size, + Addend); + break; case Triple::arm: // Fall through. case Triple::thumb: resolveARMRelocation(LocalAddress, @@ -66,6 +74,35 @@ void RuntimeDyldMachO::resolveRelocation(uint8_t *LocalAddress, } bool RuntimeDyldMachO:: +resolveI386Relocation(uint8_t *LocalAddress, + uint64_t FinalAddress, + uint64_t Value, + bool isPCRel, + unsigned Type, + unsigned Size, + int64_t Addend) { + if (isPCRel) + Value -= FinalAddress + 4; // see resolveX86_64Relocation + + switch (Type) { + default: + llvm_unreachable("Invalid relocation type!"); + case macho::RIT_Vanilla: { + uint8_t *p = LocalAddress; + uint64_t ValueToWrite = Value + Addend; + for (unsigned i = 0; i < Size; ++i) { + *p++ = (uint8_t)(ValueToWrite & 0xff); + ValueToWrite >>= 8; + } + return false; + } + case macho::RIT_Difference: + case macho::RIT_Generic_LocalDifference: + case macho::RIT_Generic_PreboundLazyPointer: + return Error("Relocation type not implemented yet!"); + } +} + +bool RuntimeDyldMachO:: resolveX86_64Relocation(uint8_t *LocalAddress, uint64_t FinalAddress, uint64_t Value, @@ -167,10 +204,11 @@ resolveARMRelocation(uint8_t *LocalAddress, return false; } -void RuntimeDyldMachO:: -processRelocationRef(const ObjRelocationInfo &Rel, const ObjectFile &Obj, - ObjSectionToIDMap &ObjSectionToID, - LocalSymbolMap &Symbols, StubMap &Stubs) { +void RuntimeDyldMachO::processRelocationRef(const ObjRelocationInfo &Rel, + ObjectImage &Obj, + ObjSectionToIDMap &ObjSectionToID, + LocalSymbolMap &Symbols, + StubMap &Stubs) { uint32_t RelType = (uint32_t) (Rel.Type & 0xffffffffL); RelocationValueRef Value; @@ -183,32 +221,32 @@ processRelocationRef(const ObjRelocationInfo &Rel, const ObjectFile &Obj, const SymbolRef &Symbol = Rel.Symbol; Symbol.getName(TargetName); // First look the symbol in object file symbols. - LocalSymbolMap::iterator it = Symbols.find(TargetName.data()); - if (it != Symbols.end()) { - Value.SectionID = it->second.first; - Value.Addend = it->second.second; + LocalSymbolMap::iterator lsi = Symbols.find(TargetName.data()); + if (lsi != Symbols.end()) { + Value.SectionID = lsi->second.first; + Value.Addend = lsi->second.second; } else { // Second look the symbol in global symbol table. - StringMap<SymbolLoc>::iterator itS = SymbolTable.find(TargetName.data()); - if (itS != SymbolTable.end()) { - Value.SectionID = itS->second.first; - Value.Addend = itS->second.second; + StringMap<SymbolLoc>::iterator gsi = SymbolTable.find(TargetName.data()); + if (gsi != SymbolTable.end()) { + Value.SectionID = gsi->second.first; + Value.Addend = gsi->second.second; } else Value.SymbolName = TargetName.data(); } } else { error_code err; - uint8_t sIdx = static_cast<uint8_t>(RelType & 0xFF); - section_iterator sIt = Obj.begin_sections(), - sItEnd = Obj.end_sections(); - for (uint8_t i = 1; i < sIdx; i++) { + uint8_t sectionIndex = static_cast<uint8_t>(RelType & 0xFF); + section_iterator si = Obj.begin_sections(), + se = Obj.end_sections(); + for (uint8_t i = 1; i < sectionIndex; i++) { error_code err; - sIt.increment(err); - if (sIt == sItEnd) + si.increment(err); + if (si == se) break; } - assert(sIt != sItEnd && "No section containing relocation!"); - Value.SectionID = findOrEmitSection(*sIt, true, ObjSectionToID); + assert(si != se && "No section containing relocation!"); + Value.SectionID = findOrEmitSection(Obj, *si, true, ObjSectionToID); Value.Addend = *(const intptr_t *)Target; if (Value.Addend) { // The MachO addend is an offset from the current section; we need to set it @@ -221,10 +259,10 @@ processRelocationRef(const ObjRelocationInfo &Rel, const ObjectFile &Obj, // This is an ARM branch relocation, need to use a stub function. // Look up for existing stub. - StubMap::const_iterator stubIt = Stubs.find(Value); - if (stubIt != Stubs.end()) + StubMap::const_iterator i = Stubs.find(Value); + if (i != Stubs.end()) resolveRelocation(Target, (uint64_t)Target, - (uint64_t)Section.Address + stubIt->second, + (uint64_t)Section.Address + i->second, RelType, 0); else { // Create a new stub function.
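The RIT_Vanilla case in resolveI386Relocation above patches the image one byte at a time rather than through a wider pointer cast. A standalone sketch of that store, with a hypothetical helper name, showing why it works for any alignment of the patch address:

#include <cstdint>

// Sketch: store the low `Size` bytes of `Value` at `P`, least significant
// byte first. Byte stores avoid unaligned word writes and make the
// truncation to the relocation width (1, 2, or 4 bytes) explicit.
static void writeLittleEndian(uint8_t *P, uint64_t Value, unsigned Size) {
  for (unsigned i = 0; i < Size; ++i) {
    *P++ = (uint8_t)(Value & 0xff);
    Value >>= 8;
  }
}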
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h index 36b39dd..418d130 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h @@ -26,6 +26,13 @@ using namespace llvm::object; namespace llvm { class RuntimeDyldMachO : public RuntimeDyldImpl { protected: + bool resolveI386Relocation(uint8_t *LocalAddress, + uint64_t FinalAddress, + uint64_t Value, + bool isPCRel, + unsigned Type, + unsigned Size, + int64_t Addend); bool resolveX86_64Relocation(uint8_t *LocalAddress, uint64_t FinalAddress, uint64_t Value, @@ -42,7 +49,7 @@ protected: int64_t Addend); virtual void processRelocationRef(const ObjRelocationInfo &Rel, - const ObjectFile &Obj, + ObjectImage &Obj, ObjSectionToIDMap &ObjSectionToID, LocalSymbolMap &Symbols, StubMap &Stubs); @@ -52,7 +59,7 @@ public: uint64_t Value, uint32_t Type, int64_t Addend); - + RuntimeDyldMachO(RTDyldMemoryManager *mm) : RuntimeDyldImpl(mm) {} bool isCompatibleFormat(const MemoryBuffer *InputBuffer) const; diff --git a/lib/ExecutionEngine/TargetSelect.cpp b/lib/ExecutionEngine/TargetSelect.cpp index 3937fe5..42364f9 100644 --- a/lib/ExecutionEngine/TargetSelect.cpp +++ b/lib/ExecutionEngine/TargetSelect.cpp @@ -15,6 +15,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ExecutionEngine/ExecutionEngine.h" +#include "llvm/Module.h" #include "llvm/ADT/Triple.h" #include "llvm/MC/SubtargetFeature.h" #include "llvm/Target/TargetMachine.h" @@ -24,17 +25,21 @@ using namespace llvm; +TargetMachine *EngineBuilder::selectTarget() { + StringRef MArch = ""; + StringRef MCPU = ""; + SmallVector<std::string, 1> MAttrs; + Triple TT(M->getTargetTriple()); + + return selectTarget(TT, MArch, MCPU, MAttrs); +} + /// selectTarget - Pick a target either via -march or by guessing the native /// arch. Add any CPU features specified via -mcpu or -mattr. TargetMachine *EngineBuilder::selectTarget(const Triple &TargetTriple, StringRef MArch, StringRef MCPU, - const SmallVectorImpl<std::string>& MAttrs, - const TargetOptions &Options, - Reloc::Model RM, - CodeModel::Model CM, - CodeGenOpt::Level OL, - std::string *ErrorStr) { + const SmallVectorImpl<std::string>& MAttrs) { Triple TheTriple(TargetTriple); if (TheTriple.getTriple().empty()) TheTriple.setTriple(sys::getDefaultTargetTriple()); @@ -84,7 +89,8 @@ TargetMachine *EngineBuilder::selectTarget(const Triple &TargetTriple, TargetMachine *Target = TheTarget->createTargetMachine(TheTriple.getTriple(), MCPU, FeaturesStr, Options, - RM, CM, OL); + RelocModel, CMModel, + OptLevel); assert(Target && "Could not allocate target machine!"); return Target; } diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index f19e6e3..765fcc8 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -20,8 +20,9 @@ #include "llvm/ADT/Optional.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/Path.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include <cctype> @@ -50,8 +51,8 @@ class TypeMapTy : public ValueMapTypeRemapper { /// DstResolvedOpaqueTypes - This is the set of opaque types in the /// destination modules who are getting a body from the source module. 
SmallPtrSet<StructType*, 16> DstResolvedOpaqueTypes; + public: - /// addTypeMapping - Indicate that the specified type in the destination /// module is conceptually equivalent to the specified type in the source /// module. @@ -67,6 +68,18 @@ public: FunctionType *get(FunctionType *T) {return cast<FunctionType>(get((Type*)T));} + /// dump - Dump out the type map for debugging purposes. + void dump() const { + for (DenseMap<Type*, Type*>::const_iterator + I = MappedTypes.begin(), E = MappedTypes.end(); I != E; ++I) { + dbgs() << "TypeMap: "; + I->first->dump(); + dbgs() << " => "; + I->second->dump(); + dbgs() << '\n'; + } + } + private: Type *getImpl(Type *T); /// remapType - Implement the ValueMapTypeRemapper interface. @@ -224,7 +237,6 @@ void TypeMapTy::linkDefinedTypeBodies() { DstResolvedOpaqueTypes.clear(); } - /// get - Return the mapped type to use for the specified input type from the /// source module. Type *TypeMapTy::get(Type *Ty) { @@ -328,8 +340,6 @@ Type *TypeMapTy::getImpl(Type *Ty) { return *Entry = DTy; } - - //===----------------------------------------------------------------------===// // ModuleLinker implementation. //===----------------------------------------------------------------------===// @@ -431,8 +441,6 @@ namespace { }; } - - /// forceRenaming - The LLVM SymbolTable class autorenames globals that conflict /// in the symbol table. This is good for all clients except for us. Go /// through the trouble to force this back. @@ -454,9 +462,9 @@ static void forceRenaming(GlobalValue *GV, StringRef Name) { } } -/// CopyGVAttributes - copy additional attributes (those not needed to construct +/// copyGVAttributes - copy additional attributes (those not needed to construct /// a GlobalValue) from the SrcGV to the DestGV. -static void CopyGVAttributes(GlobalValue *DestGV, const GlobalValue *SrcGV) { +static void copyGVAttributes(GlobalValue *DestGV, const GlobalValue *SrcGV) { // Use the maximum alignment, rather than just copying the alignment of SrcGV. unsigned Alignment = std::max(DestGV->getAlignment(), SrcGV->getAlignment()); DestGV->copyAttributesFrom(SrcGV); @@ -586,14 +594,16 @@ void ModuleLinker::computeTypeMapping() { // At this point, the destination module may have a type "%foo = { i32 }" for // example. When the source module got loaded into the same LLVMContext, if // it had the same type, it would have been renamed to "%foo.42 = { i32 }". - // Though it isn't required for correctness, attempt to link these up to clean - // up the IR. std::vector<StructType*> SrcStructTypes; SrcM->findUsedStructTypes(SrcStructTypes); - SmallPtrSet<StructType*, 32> SrcStructTypesSet(SrcStructTypes.begin(), SrcStructTypes.end()); - + + std::vector<StructType*> DstStructTypes; + DstM->findUsedStructTypes(DstStructTypes); + SmallPtrSet<StructType*, 32> DstStructTypesSet(DstStructTypes.begin(), + DstStructTypes.end()); + for (unsigned i = 0, e = SrcStructTypes.size(); i != e; ++i) { StructType *ST = SrcStructTypes[i]; if (!ST->hasName()) continue; @@ -606,9 +616,24 @@ void ModuleLinker::computeTypeMapping() { // Check to see if the destination module has a struct with the prefix name. if (StructType *DST = DstM->getTypeByName(ST->getName().substr(0, DotPos))) - // Don't use it if this actually came from the source module. They're in - // the same LLVMContext after all. - if (!SrcStructTypesSet.count(DST)) + // Don't use it if this actually came from the source module. They're in + // the same LLVMContext after all. 
Also don't use it unless the type is + // actually used in the destination module. This can happen in situations + // like this: + // + // Module A Module B + // -------- -------- + // %Z = type { %A } %B = type { %C.1 } + // %A = type { %B.1, [7 x i8] } %C.1 = type { i8* } + // %B.1 = type { %C } %A.2 = type { %B.3, [5 x i8] } + // %C = type { i8* } %B.3 = type { %C.1 } + // + // When we link Module B with Module A, the '%B' in Module B is + // used. However, that would then use '%C.1'. But when we process '%C.1', + // we prefer to take the '%C' version. So we are then left with both + // '%C.1' and '%C' being used for the same types. This leads to some + // variables using one type and some using the other. + if (!SrcStructTypesSet.count(DST) && DstStructTypesSet.count(DST)) TypeMap.addTypeMapping(DST, ST); } @@ -662,7 +687,7 @@ bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV, DstGV->getType()->getAddressSpace()); // Propagate alignment, visibility and section info. - CopyGVAttributes(NG, DstGV); + copyGVAttributes(NG, DstGV); AppendingVarInfo AVI; AVI.NewGV = NG; @@ -736,7 +761,7 @@ bool ModuleLinker::linkGlobalProto(GlobalVariable *SGV) { SGV->isThreadLocal(), SGV->getType()->getAddressSpace()); // Propagate alignment, visibility and section info. - CopyGVAttributes(NewDGV, SGV); + copyGVAttributes(NewDGV, SGV); if (NewVisibility) NewDGV->setVisibility(*NewVisibility); @@ -784,7 +809,7 @@ bool ModuleLinker::linkFunctionProto(Function *SF) { // bring SF over. Function *NewDF = Function::Create(TypeMap.get(SF->getFunctionType()), SF->getLinkage(), SF->getName(), DstM); - CopyGVAttributes(NewDF, SF); + copyGVAttributes(NewDF, SF); if (NewVisibility) NewDF->setVisibility(*NewVisibility); @@ -839,7 +864,7 @@ bool ModuleLinker::linkAliasProto(GlobalAlias *SGA) { GlobalAlias *NewDA = new GlobalAlias(TypeMap.get(SGA->getType()), SGA->getLinkage(), SGA->getName(), /*aliasee*/0, DstM); - CopyGVAttributes(NewDA, SGA); + copyGVAttributes(NewDA, SGA); if (NewVisibility) NewDA->setVisibility(*NewVisibility); @@ -872,9 +897,8 @@ void ModuleLinker::linkAppendingVarInit(const AppendingVarInfo &AVI) { AVI.NewGV->setInitializer(ConstantArray::get(NewType, Elements)); } - -// linkGlobalInits - Update the initializers in the Dest module now that all -// globals that may be referenced are in Dest. +/// linkGlobalInits - Update the initializers in the Dest module now that all +/// globals that may be referenced are in Dest. void ModuleLinker::linkGlobalInits() { // Loop over all of the globals in the src module, mapping them over as we go for (Module::const_global_iterator I = SrcM->global_begin(), @@ -891,9 +915,9 @@ void ModuleLinker::linkGlobalInits() { } } -// linkFunctionBody - Copy the source function over into the dest function and -// fix up references to values. At this point we know that Dest is an external -// function, and that Src is not. +/// linkFunctionBody - Copy the source function over into the dest function and +/// fix up references to values. At this point we know that Dest is an external +/// function, and that Src is not. void ModuleLinker::linkFunctionBody(Function *Dst, Function *Src) { assert(Src && Dst && Dst->isDeclaration() && !Src->isDeclaration()); @@ -932,7 +956,7 @@ void ModuleLinker::linkFunctionBody(Function *Dst, Function *Src) { } - +/// linkAliasBodies - Insert all of the aliases in Src into the Dest module. 
void ModuleLinker::linkAliasBodies() { for (Module::alias_iterator I = SrcM->alias_begin(), E = SrcM->alias_end(); I != E; ++I) { @@ -945,7 +969,7 @@ void ModuleLinker::linkAliasBodies() { } } -/// linkNamedMDNodes - Insert all of the named mdnodes in Src into the Dest +/// linkNamedMDNodes - Insert all of the named MDNodes in Src into the Dest /// module. void ModuleLinker::linkNamedMDNodes() { const NamedMDNode *SrcModFlags = SrcM->getModuleFlagsMetadata(); @@ -961,7 +985,8 @@ void ModuleLinker::linkNamedMDNodes() { } } -/// categorizeModuleFlagNodes - +/// categorizeModuleFlagNodes - Categorize the module flags according to their +/// type: Error, Warning, Override, and Require. bool ModuleLinker:: categorizeModuleFlagNodes(const NamedMDNode *ModFlags, DenseMap<MDString*, MDNode*> &ErrorNode, @@ -1220,6 +1245,7 @@ bool ModuleLinker::run() { } linkFunctionBody(cast<Function>(ValueMap[SF]), SF); + SF->Dematerialize(); } // Resolve all uses of aliases with aliasees. @@ -1259,7 +1285,8 @@ bool ModuleLinker::run() { // Link in function body. linkFunctionBody(DF, SF); - + SF->Dematerialize(); + // "Remove" from vector by setting the element to 0. *I = 0; @@ -1293,11 +1320,11 @@ bool ModuleLinker::run() { // LinkModules entrypoint. //===----------------------------------------------------------------------===// -// LinkModules - This function links two modules together, with the resulting -// left module modified to be the composite of the two input modules. If an -// error occurs, true is returned and ErrorMsg (if not null) is set to indicate -// the problem. Upon failure, the Dest module could be in a modified state, and -// shouldn't be relied on to be consistent. +/// LinkModules - This function links two modules together, with the resulting +/// left module modified to be the composite of the two input modules. If an +/// error occurs, true is returned and ErrorMsg (if not null) is set to indicate +/// the problem. Upon failure, the Dest module could be in a modified state, +/// and shouldn't be relied on to be consistent. 
bool Linker::LinkModules(Module *Dest, Module *Src, unsigned Mode, std::string *ErrorMsg) { ModuleLinker TheLinker(Dest, Src, Mode); @@ -1305,6 +1332,6 @@ bool Linker::LinkModules(Module *Dest, Module *Src, unsigned Mode, if (ErrorMsg) *ErrorMsg = TheLinker.ErrorMsg; return true; } - + return false; } diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp index 36f94b4..9fc33b6 100644 --- a/lib/MC/ELFObjectWriter.cpp +++ b/lib/MC/ELFObjectWriter.cpp @@ -17,7 +17,6 @@ #include "llvm/ADT/SmallString.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringMap.h" -#include "llvm/ADT/Twine.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCAssembler.h" @@ -25,13 +24,13 @@ #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCELFSymbolFlags.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCValue.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ELF.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/ADT/StringSwitch.h" #include <vector> using namespace llvm; @@ -84,32 +83,6 @@ class ELFObjectWriter : public MCObjectWriter { } }; - /// @name Relocation Data - /// @{ - - struct ELFRelocationEntry { - // Make these big enough for both 32-bit and 64-bit - uint64_t r_offset; - int Index; - unsigned Type; - const MCSymbol *Symbol; - uint64_t r_addend; - - ELFRelocationEntry() - : r_offset(0), Index(0), Type(0), Symbol(0), r_addend(0) {} - - ELFRelocationEntry(uint64_t RelocOffset, int Idx, - unsigned RelType, const MCSymbol *Sym, - uint64_t Addend) - : r_offset(RelocOffset), Index(Idx), Type(RelType), - Symbol(Sym), r_addend(Addend) {} - - // Support lexicographic sorting. - bool operator<(const ELFRelocationEntry &RE) const { - return RE.r_offset < r_offset; - } - }; - /// The target specific ELF writer instance. llvm::OwningPtr<MCELFObjectTargetWriter> TargetObjectWriter; @@ -786,7 +759,7 @@ void ELFObjectWriter::RecordRelocation(const MCAssembler &Asm, else assert(isInt<32>(Addend)); - ELFRelocationEntry ERE(RelocOffset, Index, Type, RelocSymbol, Addend); + ELFRelocationEntry ERE(RelocOffset, Index, Type, RelocSymbol, Addend, Fixup); Relocations[Fragment->getParent()].push_back(ERE); } @@ -1072,8 +1045,10 @@ void ELFObjectWriter::WriteRelocationsFragment(const MCAssembler &Asm, MCDataFragment *F, const MCSectionData *SD) { std::vector<ELFRelocationEntry> &Relocs = Relocations[SD]; - // sort by the r_offset just like gnu as does - array_pod_sort(Relocs.begin(), Relocs.end()); + + // Sort the relocation entries. Most targets just sort by r_offset, but some + // (e.g., MIPS) have additional constraints. 
+ TargetObjectWriter->sortRelocs(Asm, Relocs); for (unsigned i = 0, e = Relocs.size(); i != e; ++i) { ELFRelocationEntry entry = Relocs[e - i - 1]; diff --git a/lib/MC/MCAsmBackend.cpp b/lib/MC/MCAsmBackend.cpp index 51c3977..0b2e4ae 100644 --- a/lib/MC/MCAsmBackend.cpp +++ b/lib/MC/MCAsmBackend.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCFixupKindInfo.h" using namespace llvm; MCAsmBackend::MCAsmBackend() diff --git a/lib/MC/MCAsmInfo.cpp b/lib/MC/MCAsmInfo.cpp index 582d21f..8286c1d 100644 --- a/lib/MC/MCAsmInfo.cpp +++ b/lib/MC/MCAsmInfo.cpp @@ -49,6 +49,7 @@ MCAsmInfo::MCAsmInfo() { AllowQuotesInName = false; AllowNameToStartWithDigit = false; AllowPeriodsInName = true; + AllowUTF8 = true; ZeroDirective = "\t.zero\t"; AsciiDirective = "\t.ascii\t"; AscizDirective = "\t.asciz\t"; diff --git a/lib/MC/MCAsmInfoCOFF.cpp b/lib/MC/MCAsmInfoCOFF.cpp index 767ac29..881d992 100644 --- a/lib/MC/MCAsmInfoCOFF.cpp +++ b/lib/MC/MCAsmInfoCOFF.cpp @@ -13,7 +13,6 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCAsmInfoCOFF.h" -#include "llvm/ADT/SmallVector.h" using namespace llvm; void MCAsmInfoCOFF::anchor() { } diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp index 5ffb6f2..bb67868 100644 --- a/lib/MC/MCAssembler.cpp +++ b/lib/MC/MCAssembler.cpp @@ -13,13 +13,13 @@ #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCValue.h" #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCAsmBackend.h" -#include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Twine.h" diff --git a/lib/MC/MCDisassembler/Disassembler.cpp b/lib/MC/MCDisassembler/Disassembler.cpp index 572a5a5..35f675d 100644 --- a/lib/MC/MCDisassembler/Disassembler.cpp +++ b/lib/MC/MCDisassembler/Disassembler.cpp @@ -15,7 +15,9 @@ #include "llvm/MC/MCDisassembler.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/MemoryObject.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/TargetSelect.h" @@ -57,6 +59,9 @@ LLVMDisasmContextRef LLVMCreateDisasm(const char *TripleName, void *DisInfo, const MCAsmInfo *MAI = TheTarget->createMCAsmInfo(TripleName); assert(MAI && "Unable to create target asm info!"); + const MCInstrInfo *MII = TheTarget->createMCInstrInfo(); + assert(MII && "Unable to create target instruction info!"); + const MCRegisterInfo *MRI = TheTarget->createMCRegInfo(TripleName); assert(MRI && "Unable to create target register info!"); @@ -80,13 +85,13 @@ LLVMDisasmContextRef LLVMCreateDisasm(const char *TripleName, void *DisInfo, // Set up the instruction printer. 
int AsmPrinterVariant = MAI->getAssemblerDialect(); MCInstPrinter *IP = TheTarget->createMCInstPrinter(AsmPrinterVariant, - *MAI, *MRI, *STI); + *MAI, *MII, *MRI, *STI); assert(IP && "Unable to create instruction printer!"); LLVMDisasmContext *DC = new LLVMDisasmContext(TripleName, DisInfo, TagType, GetOpInfo, SymbolLookUp, TheTarget, MAI, MRI, - Ctx, DisAsm, IP); + STI, MII, Ctx, DisAsm, IP); assert(DC && "Allocation failure!"); return DC; diff --git a/lib/MC/MCDisassembler/Disassembler.h b/lib/MC/MCDisassembler/Disassembler.h index 238ff7d..880a31a 100644 --- a/lib/MC/MCDisassembler/Disassembler.h +++ b/lib/MC/MCDisassembler/Disassembler.h @@ -28,7 +28,9 @@ class MCContext; class MCAsmInfo; class MCDisassembler; class MCInstPrinter; +class MCInstrInfo; class MCRegisterInfo; +class MCSubtargetInfo; class Target; // @@ -61,6 +63,10 @@ private: llvm::OwningPtr<const llvm::MCAsmInfo> MAI; // The register information for the target architecture. llvm::OwningPtr<const llvm::MCRegisterInfo> MRI; + // The subtarget information for the target architecture. + llvm::OwningPtr<const llvm::MCSubtargetInfo> MSI; + // The instruction information for the target architecture. + llvm::OwningPtr<const llvm::MCInstrInfo> MII; // The assembly context for creating symbols and MCExprs. llvm::OwningPtr<const llvm::MCContext> Ctx; // The disassembler for the target architecture. @@ -78,6 +84,8 @@ public: LLVMSymbolLookupCallback symbolLookUp, const Target *theTarget, const MCAsmInfo *mAI, const MCRegisterInfo *mRI, + const MCSubtargetInfo *mSI, + const MCInstrInfo *mII, llvm::MCContext *ctx, const MCDisassembler *disAsm, MCInstPrinter *iP) : TripleName(tripleName), DisInfo(disInfo), TagType(tagType), GetOpInfo(getOpInfo), @@ -85,6 +93,8 @@ public: CommentStream(CommentsToEmit) { MAI.reset(mAI); MRI.reset(mRI); + MSI.reset(mSI); + MII.reset(mII); Ctx.reset(ctx); DisAsm.reset(disAsm); IP.reset(iP); diff --git a/lib/MC/MCDisassembler/EDDisassembler.cpp b/lib/MC/MCDisassembler/EDDisassembler.cpp index 4c2dae8..b2672ca 100644 --- a/lib/MC/MCDisassembler/EDDisassembler.cpp +++ b/lib/MC/MCDisassembler/EDDisassembler.cpp @@ -22,6 +22,7 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" @@ -165,11 +166,16 @@ EDDisassembler::EDDisassembler(CPUKey &key) : return; InstInfos = Disassembler->getEDInfo(); - + + MII.reset(Tgt->createMCInstrInfo()); + + if (!MII) + return; + InstString.reset(new std::string); InstStream.reset(new raw_string_ostream(*InstString)); InstPrinter.reset(Tgt->createMCInstPrinter(LLVMSyntaxVariant, *AsmInfo, - *MRI, *STI)); + *MII, *MRI, *STI)); if (!InstPrinter) return; diff --git a/lib/MC/MCDisassembler/EDDisassembler.h b/lib/MC/MCDisassembler/EDDisassembler.h index e97f11d..6f71908 100644 --- a/lib/MC/MCDisassembler/EDDisassembler.h +++ b/lib/MC/MCDisassembler/EDDisassembler.h @@ -36,8 +36,9 @@ class MCContext; class MCAsmInfo; class MCAsmLexer; class MCDisassembler; -class MCInstPrinter; class MCInst; +class MCInstPrinter; +class MCInstrInfo; class MCParsedAsmOperand; class MCRegisterInfo; class MCStreamer; @@ -137,6 +138,8 @@ struct EDDisassembler { llvm::OwningPtr<const llvm::MCAsmInfo> AsmInfo; /// The subtarget information for the target architecture llvm::OwningPtr<const llvm::MCSubtargetInfo> STI; + // The instruction information for the target architecture. 
+ llvm::OwningPtr<const llvm::MCInstrInfo> MII; // The register information for the target architecture. llvm::OwningPtr<const llvm::MCRegisterInfo> MRI; /// The disassembler for the target architecture diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp index e16f7ae..84a34f1 100644 --- a/lib/MC/MCDwarf.cpp +++ b/lib/MC/MCDwarf.cpp @@ -21,7 +21,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Support/Path.h" #include "llvm/Support/SourceMgr.h" -#include "llvm/ADT/FoldingSet.h" +#include "llvm/ADT/Hashing.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/Twine.h" #include "llvm/Config/config.h" @@ -1361,12 +1361,10 @@ namespace llvm { return CIEKey::getTombstoneKey(); } static unsigned getHashValue(const CIEKey &Key) { - FoldingSetNodeID ID; - ID.AddPointer(Key.Personality); - ID.AddInteger(Key.PersonalityEncoding); - ID.AddInteger(Key.LsdaEncoding); - ID.AddBoolean(Key.IsSignalFrame); - return ID.ComputeHash(); + return static_cast<unsigned>(hash_combine(Key.Personality, + Key.PersonalityEncoding, + Key.LsdaEncoding, + Key.IsSignalFrame)); } static bool isEqual(const CIEKey &LHS, const CIEKey &RHS) { diff --git a/lib/MC/MCELFObjectTargetWriter.cpp b/lib/MC/MCELFObjectTargetWriter.cpp index 15bf476..171ab4d 100644 --- a/lib/MC/MCELFObjectTargetWriter.cpp +++ b/lib/MC/MCELFObjectTargetWriter.cpp @@ -7,6 +7,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/STLExtras.h" #include "llvm/MC/MCELFObjectWriter.h" using namespace llvm; @@ -36,3 +37,10 @@ const MCSymbol *MCELFObjectTargetWriter::ExplicitRelSym(const MCAssembler &Asm, void MCELFObjectTargetWriter::adjustFixupOffset(const MCFixup &Fixup, uint64_t &RelocOffset) { } + +void +MCELFObjectTargetWriter::sortRelocs(const MCAssembler &Asm, + std::vector<ELFRelocationEntry> &Relocs) { + // Sort by the r_offset, just like gnu as does. + array_pod_sort(Relocs.begin(), Relocs.end()); +} diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp index ceaecd0..7880155 100644 --- a/lib/MC/MCExpr.cpp +++ b/lib/MC/MCExpr.cpp @@ -14,6 +14,7 @@ #include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCValue.h" #include "llvm/Support/Debug.h" diff --git a/lib/MC/MCInstPrinter.cpp b/lib/MC/MCInstPrinter.cpp index 3060ad6..847bcc0 100644 --- a/lib/MC/MCInstPrinter.cpp +++ b/lib/MC/MCInstPrinter.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/ErrorHandling.h" @@ -20,7 +21,7 @@ MCInstPrinter::~MCInstPrinter() { /// getOpcodeName - Return the name of the specified opcode enum (e.g. /// "MOV32ri") or empty if we can't resolve it. StringRef MCInstPrinter::getOpcodeName(unsigned Opcode) const { - return ""; + return MII.getName(Opcode); } void MCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp index 7dd06e7..b22ae33 100644 --- a/lib/MC/MCObjectFileInfo.cpp +++ b/lib/MC/MCObjectFileInfo.cpp @@ -260,9 +260,14 @@ void MCObjectFileInfo::InitELFMCObjectFileInfo(Triple T) { // Solaris requires different flags for .eh_frame to seemingly every other // platform. 
+ EHSectionType = ELF::SHT_PROGBITS; EHSectionFlags = ELF::SHF_ALLOC; - if (T.getOS() == Triple::Solaris) - EHSectionFlags |= ELF::SHF_WRITE; + if (T.getOS() == Triple::Solaris) { + if (T.getArch() == Triple::x86_64) + EHSectionType = ELF::SHT_X86_64_UNWIND; + else + EHSectionFlags |= ELF::SHF_WRITE; + } // ELF @@ -575,7 +580,7 @@ void MCObjectFileInfo::InitEHFrameSection() { SectionKind::getReadOnly()); else if (Env == IsELF) EHFrameSection = - Ctx->getELFSection(".eh_frame", ELF::SHT_PROGBITS, + Ctx->getELFSection(".eh_frame", EHSectionType, EHSectionFlags, SectionKind::getDataRel()); else diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp index 906bdc3..bad7cfe 100644 --- a/lib/MC/MCObjectStreamer.cpp +++ b/lib/MC/MCObjectStreamer.cpp @@ -7,17 +7,17 @@ // //===----------------------------------------------------------------------===// -#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCObjectStreamer.h" - -#include "llvm/Support/ErrorHandling.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MCAsmBackend.h" +#include "llvm/Support/ErrorHandling.h" using namespace llvm; MCObjectStreamer::MCObjectStreamer(MCContext &Context, MCAsmBackend &TAB, diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp index ab5ddaf..8aef43c 100644 --- a/lib/MC/MCParser/AsmParser.cpp +++ b/lib/MC/MCParser/AsmParser.cpp @@ -14,7 +14,6 @@ #include "llvm/ADT/APFloat.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringMap.h" -#include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" @@ -1528,11 +1527,11 @@ bool AsmParser::HandleMacroEntry(StringRef Name, SMLoc NameLoc, } Lex(); } - // If there weren't any arguments, erase the token vector so everything - // else knows that. Leaving around the vestigial empty token list confuses - // things. - if (MacroArguments.size() == 1 && MacroArguments.back().empty()) - MacroArguments.clear(); + // If the last argument didn't end up with any tokens, it's not a real + // argument and we should remove it from the list. This happens with either + // a trailing comma or an empty argument list. + if (MacroArguments.back().empty()) + MacroArguments.pop_back(); // Macro instantiation is lexical, unfortunately. We construct a new buffer // to hold the macro body with substitutions.
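The HandleMacroEntry hunk above replaces a special case (erase the vector when it holds exactly one empty argument) with a general rule (drop a trailing empty argument). A small illustration, using a plain vector of token lists as a simplified stand-in for the parser's MacroArguments; the helper name is hypothetical:

#include <string>
#include <vector>

typedef std::vector<std::string> TokenList;

// Sketch: after lexing, both `foo a, b,` (trailing comma) and a bare `foo`
// (no arguments) leave one empty token list at the end. Dropping just that
// entry keeps the two real arguments in the first case and yields an empty
// argument list in the second.
static void dropTrailingEmptyArgument(std::vector<TokenList> &MacroArguments) {
  if (!MacroArguments.empty() && MacroArguments.back().empty())
    MacroArguments.pop_back();
}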
diff --git a/lib/MC/MachObjectWriter.cpp b/lib/MC/MachObjectWriter.cpp index 57f90d9..8e4066c 100644 --- a/lib/MC/MachObjectWriter.cpp +++ b/lib/MC/MachObjectWriter.cpp @@ -8,13 +8,13 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCMachObjectWriter.h" -#include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/Twine.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCSymbol.h" diff --git a/lib/MC/WinCOFFStreamer.cpp b/lib/MC/WinCOFFStreamer.cpp index 7238c9e..67dc649 100644 --- a/lib/MC/WinCOFFStreamer.cpp +++ b/lib/MC/WinCOFFStreamer.cpp @@ -25,7 +25,6 @@ #include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCWin64EH.h" #include "llvm/MC/MCAsmBackend.h" -#include "llvm/ADT/StringMap.h" #include "llvm/Support/COFF.h" #include "llvm/Support/Debug.h" diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp index b8ba905..bd27a56 100644 --- a/lib/Object/COFFObjectFile.cpp +++ b/lib/Object/COFFObjectFile.cpp @@ -282,7 +282,6 @@ error_code COFFObjectFile::getSymbolSection(DataRefImpl Symb, const coff_section *sec = 0; if (error_code ec = getSection(symb->SectionNumber, sec)) return ec; DataRefImpl Sec; - std::memset(&Sec, 0, sizeof(Sec)); Sec.p = reinterpret_cast<uintptr_t>(sec); Result = section_iterator(SectionRef(Sec, this)); } @@ -357,6 +356,27 @@ error_code COFFObjectFile::isSectionBSS(DataRefImpl Sec, return object_error::success; } +error_code COFFObjectFile::isSectionRequiredForExecution(DataRefImpl Sec, + bool &Result) const { + // FIXME: Unimplemented + Result = true; + return object_error::success; +} + +error_code COFFObjectFile::isSectionVirtual(DataRefImpl Sec, + bool &Result) const { + const coff_section *sec = toSec(Sec); + Result = sec->Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA; + return object_error::success; +} + +error_code COFFObjectFile::isSectionZeroInit(DataRefImpl Sec, + bool &Result) const { + // FIXME: Unimplemented + Result = false; + return object_error::success; +} + error_code COFFObjectFile::sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb, bool &Result) const { @@ -374,7 +394,6 @@ error_code COFFObjectFile::sectionContainsSymbol(DataRefImpl Sec, relocation_iterator COFFObjectFile::getSectionRelBegin(DataRefImpl Sec) const { const coff_section *sec = toSec(Sec); DataRefImpl ret; - std::memset(&ret, 0, sizeof(ret)); if (sec->NumberOfRelocations == 0) ret.p = 0; else @@ -386,7 +405,6 @@ relocation_iterator COFFObjectFile::getSectionRelBegin(DataRefImpl Sec) const { relocation_iterator COFFObjectFile::getSectionRelEnd(DataRefImpl Sec) const { const coff_section *sec = toSec(Sec); DataRefImpl ret; - std::memset(&ret, 0, sizeof(ret)); if (sec->NumberOfRelocations == 0) ret.p = 0; else @@ -470,7 +488,6 @@ COFFObjectFile::COFFObjectFile(MemoryBuffer *Object, error_code &ec) symbol_iterator COFFObjectFile::begin_symbols() const { DataRefImpl ret; - std::memset(&ret, 0, sizeof(DataRefImpl)); ret.p = reinterpret_cast<intptr_t>(SymbolTable); return symbol_iterator(SymbolRef(ret, this)); } @@ -478,7 +495,6 @@ symbol_iterator COFFObjectFile::begin_symbols() const { symbol_iterator COFFObjectFile::end_symbols() const { // The symbol table ends where the string table begins. 
DataRefImpl ret; - std::memset(&ret, 0, sizeof(DataRefImpl)); ret.p = reinterpret_cast<intptr_t>(StringTable); return symbol_iterator(SymbolRef(ret, this)); } @@ -511,14 +527,12 @@ StringRef COFFObjectFile::getLoadName() const { section_iterator COFFObjectFile::begin_sections() const { DataRefImpl ret; - std::memset(&ret, 0, sizeof(DataRefImpl)); ret.p = reinterpret_cast<intptr_t>(SectionTable); return section_iterator(SectionRef(ret, this)); } section_iterator COFFObjectFile::end_sections() const { DataRefImpl ret; - std::memset(&ret, 0, sizeof(DataRefImpl)); ret.p = reinterpret_cast<intptr_t>(SectionTable + Header->NumberOfSections); return section_iterator(SectionRef(ret, this)); } @@ -669,7 +683,6 @@ error_code COFFObjectFile::getRelocationSymbol(DataRefImpl Rel, SymbolRef &Res) const { const coff_relocation* R = toRel(Rel); DataRefImpl Symb; - std::memset(&Symb, 0, sizeof(Symb)); Symb.p = reinterpret_cast<uintptr_t>(SymbolTable + R->SymbolTableIndex); Res = SymbolRef(Symb, this); return object_error::success; @@ -749,7 +762,6 @@ error_code COFFObjectFile::getRelocationValueString(DataRefImpl Rel, const coff_symbol *symb = 0; if (error_code ec = getSymbol(reloc->SymbolTableIndex, symb)) return ec; DataRefImpl sym; - ::memset(&sym, 0, sizeof(sym)); sym.p = reinterpret_cast<uintptr_t>(symb); StringRef symname; if (error_code ec = getSymbolName(sym, symname)) return ec; diff --git a/lib/Object/ELFObjectFile.cpp b/lib/Object/ELFObjectFile.cpp index ab5f810..663b84e 100644 --- a/lib/Object/ELFObjectFile.cpp +++ b/lib/Object/ELFObjectFile.cpp @@ -17,16 +17,6 @@ namespace llvm { using namespace object; -namespace { - std::pair<unsigned char, unsigned char> - getElfArchType(MemoryBuffer *Object) { - if (Object->getBufferSize() < ELF::EI_NIDENT) - return std::make_pair((uint8_t)ELF::ELFCLASSNONE,(uint8_t)ELF::ELFDATANONE); - return std::make_pair( (uint8_t)Object->getBufferStart()[ELF::EI_CLASS] - , (uint8_t)Object->getBufferStart()[ELF::EI_DATA]); - } -} - // Creates an in-memory object-file by default: createELFObjectFile(Buffer) ObjectFile *ObjectFile::createELFObjectFile(MemoryBuffer *Object) { std::pair<unsigned char, unsigned char> Ident = getElfArchType(Object); diff --git a/lib/Object/LLVMBuild.txt b/lib/Object/LLVMBuild.txt index 0041acd..69610f9 100644 --- a/lib/Object/LLVMBuild.txt +++ b/lib/Object/LLVMBuild.txt @@ -19,4 +19,4 @@ type = Library name = Object parent = Libraries -required_libraries = Core Support +required_libraries = Support diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp index 819409e..3bcda17 100644 --- a/lib/Object/MachOObjectFile.cpp +++ b/lib/Object/MachOObjectFile.cpp @@ -34,7 +34,6 @@ MachOObjectFile::MachOObjectFile(MemoryBuffer *Object, MachOObject *MOO, MachOObj(MOO), RegisteredStringTable(std::numeric_limits<uint32_t>::max()) { DataRefImpl DRI; - DRI.d.a = DRI.d.b = 0; moveToNextSection(DRI); uint32_t LoadCommandCount = MachOObj->getHeader().NumLoadCommands; while (DRI.d.a < LoadCommandCount) { @@ -176,7 +175,12 @@ error_code MachOObjectFile::getSymbolSize(DataRefImpl DRI, BeginOffset = Entry->Value; SectionIndex = Entry->SectionIndex; if (!SectionIndex) { - Result = UnknownAddressOrSize; + uint32_t flags = SymbolRef::SF_None; + getSymbolFlags(DRI, flags); + if (flags & SymbolRef::SF_Common) + Result = Entry->Value; + else + Result = UnknownAddressOrSize; return object_error::success; } // Unfortunately symbols are unsorted so we need to touch all @@ -199,7 +203,12 @@ error_code MachOObjectFile::getSymbolSize(DataRefImpl DRI, 
BeginOffset = Entry->Value; SectionIndex = Entry->SectionIndex; if (!SectionIndex) { - Result = UnknownAddressOrSize; + uint32_t flags = SymbolRef::SF_None; + getSymbolFlags(DRI, flags); + if (flags & SymbolRef::SF_Common) + Result = Entry->Value; + else + Result = UnknownAddressOrSize; return object_error::success; } // Unfortunately symbols are unsorted so we need to touch all @@ -278,7 +287,7 @@ error_code MachOObjectFile::getSymbolFlags(DataRefImpl DRI, MachOType = Entry->Type; } - // TODO: Correctly set SF_ThreadLocal and SF_Common. + // TODO: Correctly set SF_ThreadLocal Result = SymbolRef::SF_None; if ((MachOType & MachO::NlistMaskType) == MachO::NListTypeUndefined) @@ -287,8 +296,11 @@ error_code MachOObjectFile::getSymbolFlags(DataRefImpl DRI, if (MachOFlags & macho::STF_StabsEntryMask) Result |= SymbolRef::SF_FormatSpecific; - if (MachOType & MachO::NlistMaskExternal) + if (MachOType & MachO::NlistMaskExternal) { Result |= SymbolRef::SF_Global; + if ((MachOType & MachO::NlistMaskType) == MachO::NListTypeUndefined) + Result |= SymbolRef::SF_Common; + } if (MachOFlags & (MachO::NListDescWeakRef | MachO::NListDescWeakDef)) Result |= SymbolRef::SF_Weak; @@ -355,7 +367,6 @@ error_code MachOObjectFile::getSymbolType(DataRefImpl Symb, symbol_iterator MachOObjectFile::begin_symbols() const { // DRI.d.a = segment number; DRI.d.b = symbol index. DataRefImpl DRI; - DRI.d.a = DRI.d.b = 0; moveToNextSymbol(DRI); return symbol_iterator(SymbolRef(DRI, this)); } @@ -363,7 +374,6 @@ symbol_iterator MachOObjectFile::begin_symbols() const { symbol_iterator MachOObjectFile::end_symbols() const { DataRefImpl DRI; DRI.d.a = MachOObj->getHeader().NumLoadCommands; - DRI.d.b = 0; return symbol_iterator(SymbolRef(DRI, this)); } @@ -569,6 +579,37 @@ error_code MachOObjectFile::isSectionBSS(DataRefImpl DRI, return object_error::success; } +error_code MachOObjectFile::isSectionRequiredForExecution(DataRefImpl Sec, + bool &Result) const { + // FIXME: Unimplemented + Result = true; + return object_error::success; +} + +error_code MachOObjectFile::isSectionVirtual(DataRefImpl Sec, + bool &Result) const { + // FIXME: Unimplemented + Result = false; + return object_error::success; +} + +error_code MachOObjectFile::isSectionZeroInit(DataRefImpl DRI, + bool &Result) const { + if (MachOObj->is64Bit()) { + InMemoryStruct<macho::Section64> Sect; + getSection64(DRI, Sect); + Result = (Sect->Flags & MachO::SectionTypeZeroFill || + Sect->Flags & MachO::SectionTypeZeroFillLarge); + } else { + InMemoryStruct<macho::Section> Sect; + getSection(DRI, Sect); + Result = (Sect->Flags & MachO::SectionTypeZeroFill || + Sect->Flags & MachO::SectionTypeZeroFillLarge); + } + + return object_error::success; +} + error_code MachOObjectFile::sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb, bool &Result) const { @@ -601,7 +642,6 @@ error_code MachOObjectFile::sectionContainsSymbol(DataRefImpl Sec, relocation_iterator MachOObjectFile::getSectionRelBegin(DataRefImpl Sec) const { DataRefImpl ret; - ret.d.a = 0; ret.d.b = getSectionIndex(Sec); return relocation_iterator(RelocationRef(ret, this)); } @@ -624,7 +664,6 @@ relocation_iterator MachOObjectFile::getSectionRelEnd(DataRefImpl Sec) const { section_iterator MachOObjectFile::begin_sections() const { DataRefImpl DRI; - DRI.d.a = DRI.d.b = 0; moveToNextSection(DRI); return section_iterator(SectionRef(DRI, this)); } @@ -632,7 +671,6 @@ section_iterator MachOObjectFile::begin_sections() const { section_iterator MachOObjectFile::end_sections() const { DataRefImpl DRI; DRI.d.a = 
MachOObj->getHeader().NumLoadCommands; - DRI.d.b = 0; return section_iterator(SectionRef(DRI, this)); } @@ -708,7 +746,6 @@ error_code MachOObjectFile::getRelocationSymbol(DataRefImpl Rel, bool isExtern = (RE->Word1 >> 27) & 1; DataRefImpl Sym; - Sym.d.a = Sym.d.b = 0; moveToNextSymbol(Sym); if (isExtern) { for (unsigned i = 0; i < SymbolIdx; i++) { diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt index 0b69238..9103327 100644 --- a/lib/Support/CMakeLists.txt +++ b/lib/Support/CMakeLists.txt @@ -32,7 +32,7 @@ add_llvm_library(LLVMSupport IntrusiveRefCntPtr.cpp IsInf.cpp IsNAN.cpp - JSONParser.cpp + Locale.cpp LockFileManager.cpp ManagedStatic.cpp MemoryBuffer.cpp @@ -54,6 +54,7 @@ add_llvm_library(LLVMSupport ToolOutputFile.cpp Triple.cpp Twine.cpp + YAMLParser.cpp raw_os_ostream.cpp raw_ostream.cpp regcomp.c diff --git a/lib/Support/Dwarf.cpp b/lib/Support/Dwarf.cpp index b317e49..5c59a3e 100644 --- a/lib/Support/Dwarf.cpp +++ b/lib/Support/Dwarf.cpp @@ -95,7 +95,7 @@ const char *llvm::dwarf::TagString(unsigned Tag) { return "DW_TAG_GNU_template_parameter_pack"; case DW_TAG_GNU_formal_parameter_pack: return "DW_TAG_GNU_formal_parameter_pack"; - case DW_TAG_APPLE_Property: return "DW_TAG_APPLE_property"; + case DW_TAG_APPLE_property: return "DW_TAG_APPLE_property"; } return 0; } diff --git a/lib/Support/FoldingSet.cpp b/lib/Support/FoldingSet.cpp index e029970..c6282c6 100644 --- a/lib/Support/FoldingSet.cpp +++ b/lib/Support/FoldingSet.cpp @@ -265,15 +265,15 @@ void FoldingSetImpl::GrowHashTable() { FoldingSetImpl::Node *FoldingSetImpl::FindNodeOrInsertPos(const FoldingSetNodeID &ID, void *&InsertPos) { - - void **Bucket = GetBucketFor(ID.ComputeHash(), Buckets, NumBuckets); + unsigned IDHash = ID.ComputeHash(); + void **Bucket = GetBucketFor(IDHash, Buckets, NumBuckets); void *Probe = *Bucket; InsertPos = 0; FoldingSetNodeID TempID; while (Node *NodeInBucket = GetNextPtr(Probe)) { - if (NodeEquals(NodeInBucket, ID, TempID)) + if (NodeEquals(NodeInBucket, ID, IDHash, TempID)) return NodeInBucket; TempID.clear(); diff --git a/lib/Support/JSONParser.cpp b/lib/Support/JSONParser.cpp deleted file mode 100644 index 5dfcf29..0000000 --- a/lib/Support/JSONParser.cpp +++ /dev/null @@ -1,302 +0,0 @@ -//===--- JSONParser.cpp - Simple JSON parser ------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements a JSON parser. 
-// -//===----------------------------------------------------------------------===// - -#include "llvm/Support/JSONParser.h" - -#include "llvm/ADT/Twine.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/MemoryBuffer.h" - -using namespace llvm; - -JSONParser::JSONParser(StringRef Input, SourceMgr *SM) - : SM(SM), Failed(false) { - InputBuffer = MemoryBuffer::getMemBuffer(Input, "JSON"); - SM->AddNewSourceBuffer(InputBuffer, SMLoc()); - End = InputBuffer->getBuffer().end(); - Position = InputBuffer->getBuffer().begin(); -} - -JSONValue *JSONParser::parseRoot() { - if (Position != InputBuffer->getBuffer().begin()) - report_fatal_error("Cannot reuse JSONParser."); - if (isWhitespace()) - nextNonWhitespace(); - if (errorIfAtEndOfFile("'[' or '{' at start of JSON text")) - return 0; - switch (*Position) { - case '[': - return new (ValueAllocator.Allocate<JSONArray>(1)) JSONArray(this); - case '{': - return new (ValueAllocator.Allocate<JSONObject>(1)) JSONObject(this); - default: - setExpectedError("'[' or '{' at start of JSON text", *Position); - return 0; - } -} - -bool JSONParser::validate() { - JSONValue *Root = parseRoot(); - if (Root == NULL) { - return false; - } - return skip(*Root); -} - -bool JSONParser::skip(const JSONAtom &Atom) { - switch(Atom.getKind()) { - case JSONAtom::JK_Array: - case JSONAtom::JK_Object: - return skipContainer(*cast<JSONContainer>(&Atom)); - case JSONAtom::JK_String: - return true; - case JSONAtom::JK_KeyValuePair: - return skip(*cast<JSONKeyValuePair>(&Atom)->Value); - } - llvm_unreachable("Impossible enum value."); -} - -// Sets the current error to: -// "expected <Expected>, but found <Found>". -void JSONParser::setExpectedError(StringRef Expected, StringRef Found) { - SM->PrintMessage(SMLoc::getFromPointer(Position), SourceMgr::DK_Error, - "expected " + Expected + ", but found " + Found + ".", ArrayRef<SMRange>()); - Failed = true; -} - -// Sets the current error to: -// "expected <Expected>, but found <Found>". -void JSONParser::setExpectedError(StringRef Expected, char Found) { - setExpectedError(Expected, ("'" + StringRef(&Found, 1) + "'").str()); -} - -// If there is no character available, returns true and sets the current error -// to: "expected <Expected>, but found EOF.". -bool JSONParser::errorIfAtEndOfFile(StringRef Expected) { - if (Position == End) { - setExpectedError(Expected, "EOF"); - return true; - } - return false; -} - -// Sets the current error if the current character is not C to: -// "expected 'C', but got <current character>". -bool JSONParser::errorIfNotAt(char C, StringRef Message) { - if (*Position != C) { - std::string Expected = - ("'" + StringRef(&C, 1) + "' " + Message).str(); - if (Position == End) - setExpectedError(Expected, "EOF"); - else - setExpectedError(Expected, *Position); - return true; - } - return false; -} - -// Forbidding inlining improves performance by roughly 20%. -// FIXME: Remove once llvm optimizes this to the faster version without hints. -LLVM_ATTRIBUTE_NOINLINE static bool -wasEscaped(StringRef::iterator First, StringRef::iterator Position); - -// Returns whether a character at 'Position' was escaped with a leading '\'. -// 'First' specifies the position of the first character in the string. -static bool wasEscaped(StringRef::iterator First, - StringRef::iterator Position) { - assert(Position - 1 >= First); - StringRef::iterator I = Position - 1; - // We calulate the number of consecutive '\'s before the current position - // by iterating backwards through our string. 
- while (I >= First && *I == '\\') --I; - // (Position - 1 - I) now contains the number of '\'s before the current - // position. If it is odd, the character at 'Positon' was escaped. - return (Position - 1 - I) % 2 == 1; -} - -// Parses a JSONString, assuming that the current position is on a quote. -JSONString *JSONParser::parseString() { - assert(Position != End); - assert(!isWhitespace()); - if (errorIfNotAt('"', "at start of string")) - return 0; - StringRef::iterator First = Position + 1; - - // Benchmarking shows that this loop is the hot path of the application with - // about 2/3rd of the runtime cycles. Since escaped quotes are not the common - // case, and multiple escaped backslashes before escaped quotes are very rare, - // we pessimize this case to achieve a smaller inner loop in the common case. - // We're doing that by having a quick inner loop that just scans for the next - // quote. Once we find the quote we check the last character to see whether - // the quote might have been escaped. If the last character is not a '\', we - // know the quote was not escaped and have thus found the end of the string. - // If the immediately preceding character was a '\', we have to scan backwards - // to see whether the previous character was actually an escaped backslash, or - // an escape character for the quote. If we find that the current quote was - // escaped, we continue parsing for the next quote and repeat. - // This optimization brings around 30% performance improvements. - do { - // Step over the current quote. - ++Position; - // Find the next quote. - while (Position != End && *Position != '"') - ++Position; - if (errorIfAtEndOfFile("'\"' at end of string")) - return 0; - // Repeat until the previous character was not a '\' or was an escaped - // backslash. - } while (*(Position - 1) == '\\' && wasEscaped(First, Position)); - - return new (ValueAllocator.Allocate<JSONString>()) - JSONString(StringRef(First, Position - First)); -} - - -// Advances the position to the next non-whitespace position. -void JSONParser::nextNonWhitespace() { - do { - ++Position; - } while (isWhitespace()); -} - -// Checks if there is a whitespace character at the current position. -bool JSONParser::isWhitespace() { - return *Position == ' ' || *Position == '\t' || - *Position == '\n' || *Position == '\r'; -} - -bool JSONParser::failed() const { - return Failed; -} - -// Parses a JSONValue, assuming that the current position is at the first -// character of the value. -JSONValue *JSONParser::parseValue() { - assert(Position != End); - assert(!isWhitespace()); - switch (*Position) { - case '[': - return new (ValueAllocator.Allocate<JSONArray>(1)) JSONArray(this); - case '{': - return new (ValueAllocator.Allocate<JSONObject>(1)) JSONObject(this); - case '"': - return parseString(); - default: - setExpectedError("'[', '{' or '\"' at start of value", *Position); - return 0; - } -} - -// Parses a JSONKeyValuePair, assuming that the current position is at the first -// character of the key, value pair. 
-JSONKeyValuePair *JSONParser::parseKeyValuePair() { - assert(Position != End); - assert(!isWhitespace()); - - JSONString *Key = parseString(); - if (Key == 0) - return 0; - - nextNonWhitespace(); - if (errorIfNotAt(':', "between key and value")) - return 0; - - nextNonWhitespace(); - const JSONValue *Value = parseValue(); - if (Value == 0) - return 0; - - return new (ValueAllocator.Allocate<JSONKeyValuePair>(1)) - JSONKeyValuePair(Key, Value); -} - -/// \brief Parses the first element of a JSON array or object, or closes the -/// array. -/// -/// The method assumes that the current position is before the first character -/// of the element, with possible white space in between. When successful, it -/// returns the new position after parsing the element. Otherwise, if there is -/// no next value, it returns a default constructed StringRef::iterator. -StringRef::iterator JSONParser::parseFirstElement(JSONAtom::Kind ContainerKind, - char StartChar, char EndChar, - const JSONAtom *&Element) { - assert(*Position == StartChar); - Element = 0; - nextNonWhitespace(); - if (errorIfAtEndOfFile("value or end of container at start of container")) - return StringRef::iterator(); - - if (*Position == EndChar) - return StringRef::iterator(); - - Element = parseElement(ContainerKind); - if (Element == 0) - return StringRef::iterator(); - - return Position; -} - -/// \brief Parses the next element of a JSON array or object, or closes the -/// array. -/// -/// The method assumes that the current position is before the ',' which -/// separates the next element from the current element. When successful, it -/// returns the new position after parsing the element. Otherwise, if there is -/// no next value, it returns a default constructed StringRef::iterator. -StringRef::iterator JSONParser::parseNextElement(JSONAtom::Kind ContainerKind, - char EndChar, - const JSONAtom *&Element) { - Element = 0; - nextNonWhitespace(); - if (errorIfAtEndOfFile("',' or end of container for next element")) - return 0; - - if (*Position == ',') { - nextNonWhitespace(); - if (errorIfAtEndOfFile("element in container")) - return StringRef::iterator(); - - Element = parseElement(ContainerKind); - if (Element == 0) - return StringRef::iterator(); - - return Position; - } else if (*Position == EndChar) { - return StringRef::iterator(); - } else { - setExpectedError("',' or end of container for next element", *Position); - return StringRef::iterator(); - } -} - -const JSONAtom *JSONParser::parseElement(JSONAtom::Kind ContainerKind) { - switch (ContainerKind) { - case JSONAtom::JK_Array: - return parseValue(); - case JSONAtom::JK_Object: - return parseKeyValuePair(); - default: - llvm_unreachable("Impossible code path"); - } -} - -bool JSONParser::skipContainer(const JSONContainer &Container) { - for (JSONContainer::AtomIterator I = Container.atom_current(), - E = Container.atom_end(); - I != E; ++I) { - assert(*I != 0); - if (!skip(**I)) - return false; - } - return !failed(); -} diff --git a/lib/Support/Locale.cpp b/lib/Support/Locale.cpp new file mode 100644 index 0000000..17b9b6c --- /dev/null +++ b/lib/Support/Locale.cpp @@ -0,0 +1,10 @@ +#include "llvm/Support/Locale.h" +#include "llvm/Config/config.h" + +#ifdef __APPLE__ +#include "LocaleXlocale.inc" +#elif LLVM_ON_WIN32 +#include "LocaleWindows.inc" +#else +#include "LocaleGeneric.inc" +#endif diff --git a/lib/Support/LocaleGeneric.inc b/lib/Support/LocaleGeneric.inc new file mode 100644 index 0000000..278deee --- /dev/null +++ b/lib/Support/LocaleGeneric.inc @@ -0,0 +1,17 @@ 
+#include <cwctype> + +namespace llvm { +namespace sys { +namespace locale { + +int columnWidth(StringRef s) { + return s.size(); +} + +bool isPrint(int c) { + return iswprint(c); +} + +} +} +} diff --git a/lib/Support/LocaleWindows.inc b/lib/Support/LocaleWindows.inc new file mode 100644 index 0000000..6827ac1 --- /dev/null +++ b/lib/Support/LocaleWindows.inc @@ -0,0 +1,15 @@ +namespace llvm { +namespace sys { +namespace locale { + +int columnWidth(StringRef s) { + return s.size(); +} + +bool isPrint(int c) { + return ' ' <= c && c <= '~'; +} + +} +} +}
\ No newline at end of file
diff --git a/lib/Support/LocaleXlocale.inc b/lib/Support/LocaleXlocale.inc
new file mode 100644
index 0000000..f595e7c
--- /dev/null
+++ b/lib/Support/LocaleXlocale.inc
@@ -0,0 +1,61 @@
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/ManagedStatic.h"
+#include <cassert>
+#include <xlocale.h>
+
+
+namespace {
+  struct locale_holder {
+    locale_holder()
+      : l(newlocale(LC_CTYPE_MASK,"en_US.UTF-8",LC_GLOBAL_LOCALE))
+    {
+      assert(NULL!=l);
+    }
+    ~locale_holder() {
+      freelocale(l);
+    }
+
+    int mbswidth(llvm::SmallString<16> s) const {
+      // this implementation assumes no '\0' in s
+      assert(s.size()==strlen(s.c_str()));
+
+      size_t size = mbstowcs_l(NULL,s.c_str(),0,l);
+      assert(size!=(size_t)-1);
+      if (size==0)
+        return 0;
+      llvm::SmallVector<wchar_t,200> ws(size);
+      size = mbstowcs_l(&ws[0],s.c_str(),ws.size(),l);
+      assert(ws.size()==size);
+      return wcswidth_l(&ws[0],ws.size(),l);
+    }
+
+    int isprint(int c) const {
+      return iswprint_l(c,l);
+    }
+
+  private:
+
+    locale_t l;
+  };
+
+  llvm::ManagedStatic<locale_holder> l;
+}
+
+namespace llvm {
+namespace sys {
+namespace locale {
+
+int columnWidth(StringRef s) {
+  int width = l->mbswidth(s);
+  assert(width>=0);
+  return width;
+}
+
+bool isPrint(int c) {
+  return l->isprint(c);
+}
+
+}
+}
+}
diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp
index 911a03f..16e5c7a 100644
--- a/lib/Support/MemoryBuffer.cpp
+++ b/lib/Support/MemoryBuffer.cpp
@@ -304,6 +304,16 @@ error_code MemoryBuffer::getOpenFile(int FD, const char *Filename,
                                                 RealMapOffset)) {
       result.reset(GetNamedBuffer<MemoryBufferMMapFile>(
         StringRef(Pages + Delta, MapSize), Filename, RequiresNullTerminator));
+
+      if (RequiresNullTerminator && result->getBufferEnd()[0] != '\0') {
+        // There could be a race that resulted in the file being larger than
+        // the FileSize passed by the caller. We already have an assertion
+        // for this in MemoryBuffer::init(), but we need a runtime guarantee
+        // here that the buffer is null-terminated, so make a copy that adds
+        // a null terminator.
+        result.reset(MemoryBuffer::getMemBufferCopy(result->getBuffer(),
+                                                    Filename));
+      }
       return error_code::success();
     }
   }
@@ -339,6 +349,7 @@ error_code MemoryBuffer::getOpenFile(int FD, const char *Filename,
     if (NumRead == 0) {
       assert(0 && "We got inaccurate FileSize value or fstat reported an "
                   "invalid file size.");
+      *BufPtr = '\0'; // null-terminate at the actual size.
break; } BytesLeft -= NumRead; diff --git a/lib/Support/PathV2.cpp b/lib/Support/PathV2.cpp index 786e1a1..e2a69a6 100644 --- a/lib/Support/PathV2.cpp +++ b/lib/Support/PathV2.cpp @@ -654,12 +654,13 @@ error_code create_directories(const Twine &path, bool &existed) { StringRef p = path.toStringRef(path_storage); StringRef parent = path::parent_path(p); - bool parent_exists; + if (!parent.empty()) { + bool parent_exists; + if (error_code ec = fs::exists(parent, parent_exists)) return ec; - if (error_code ec = fs::exists(parent, parent_exists)) return ec; - - if (!parent_exists) - if (error_code ec = create_directories(parent, existed)) return ec; + if (!parent_exists) + if (error_code ec = create_directories(parent, existed)) return ec; + } return create_directory(p, existed); } diff --git a/lib/Support/SmallPtrSet.cpp b/lib/Support/SmallPtrSet.cpp index 68d9c29..3b53e9f 100644 --- a/lib/Support/SmallPtrSet.cpp +++ b/lib/Support/SmallPtrSet.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/DenseMapInfo.h" #include "llvm/Support/MathExtras.h" #include <algorithm> #include <cstdlib> @@ -102,7 +103,7 @@ bool SmallPtrSetImpl::erase_imp(const void * Ptr) { } const void * const *SmallPtrSetImpl::FindBucketFor(const void *Ptr) const { - unsigned Bucket = Hash(Ptr); + unsigned Bucket = DenseMapInfo<void *>::getHashValue(Ptr) & (CurArraySize-1); unsigned ArraySize = CurArraySize; unsigned ProbeAmt = 1; const void *const *Array = CurArray; diff --git a/lib/Support/SourceMgr.cpp b/lib/Support/SourceMgr.cpp index bbe36b2..15278c5 100644 --- a/lib/Support/SourceMgr.cpp +++ b/lib/Support/SourceMgr.cpp @@ -193,7 +193,8 @@ SMDiagnostic SourceMgr::GetMessage(SMLoc Loc, SourceMgr::DiagKind Kind, } void SourceMgr::PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind, - const Twine &Msg, ArrayRef<SMRange> Ranges) const { + const Twine &Msg, ArrayRef<SMRange> Ranges, + bool ShowColors) const { SMDiagnostic Diagnostic = GetMessage(Loc, Kind, Msg, Ranges); // Report the message with the diagnostic handler if present. @@ -208,7 +209,7 @@ void SourceMgr::PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind, assert(CurBuf != -1 && "Invalid or unspecified location!"); PrintIncludeStack(getBufferInfo(CurBuf).IncludeLoc, OS); - Diagnostic.print(0, OS); + Diagnostic.print(0, OS, ShowColors); } //===----------------------------------------------------------------------===// @@ -225,7 +226,14 @@ SMDiagnostic::SMDiagnostic(const SourceMgr &sm, SMLoc L, const std::string &FN, } -void SMDiagnostic::print(const char *ProgName, raw_ostream &S) const { +void SMDiagnostic::print(const char *ProgName, raw_ostream &S, + bool ShowColors) const { + // Display colors only if OS goes to a tty. 
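+  // (raw_ostream::is_displayed() reports whether the stream is attached to a
+  // console or terminal, so redirected or piped output stays uncolored.)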
+ ShowColors &= S.is_displayed(); + + if (ShowColors) + S.changeColor(raw_ostream::SAVEDCOLOR, true); + if (ProgName && ProgName[0]) S << ProgName << ": "; @@ -244,13 +252,33 @@ void SMDiagnostic::print(const char *ProgName, raw_ostream &S) const { } switch (Kind) { - case SourceMgr::DK_Error: S << "error: "; break; - case SourceMgr::DK_Warning: S << "warning: "; break; - case SourceMgr::DK_Note: S << "note: "; break; + case SourceMgr::DK_Error: + if (ShowColors) + S.changeColor(raw_ostream::RED, true); + S << "error: "; + break; + case SourceMgr::DK_Warning: + if (ShowColors) + S.changeColor(raw_ostream::MAGENTA, true); + S << "warning: "; + break; + case SourceMgr::DK_Note: + if (ShowColors) + S.changeColor(raw_ostream::BLACK, true); + S << "note: "; + break; } - + + if (ShowColors) { + S.resetColor(); + S.changeColor(raw_ostream::SAVEDCOLOR, true); + } + S << Message << '\n'; + if (ShowColors) + S.resetColor(); + if (LineNo == -1 || ColumnNo == -1) return; @@ -292,6 +320,9 @@ void SMDiagnostic::print(const char *ProgName, raw_ostream &S) const { } S << '\n'; + if (ShowColors) + S.changeColor(raw_ostream::GREEN, true); + // Print out the caret line, matching tabs in the source line. for (unsigned i = 0, e = CaretLine.size(), OutCol = 0; i != e; ++i) { if (i >= LineContents.size() || LineContents[i] != '\t') { @@ -306,6 +337,9 @@ void SMDiagnostic::print(const char *ProgName, raw_ostream &S) const { ++OutCol; } while (OutCol & 7); } + + if (ShowColors) + S.resetColor(); S << '\n'; } diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp index d261c53..44a1b38 100644 --- a/lib/Support/Triple.cpp +++ b/lib/Support/Triple.cpp @@ -88,6 +88,8 @@ const char *Triple::getVendorTypeName(VendorType Kind) { case Apple: return "apple"; case PC: return "pc"; case SCEI: return "scei"; + case BGP: return "bgp"; + case BGQ: return "bgq"; } llvm_unreachable("Invalid VendorType!"); @@ -116,6 +118,7 @@ const char *Triple::getOSTypeName(OSType Kind) { case Minix: return "minix"; case RTEMS: return "rtems"; case NativeClient: return "nacl"; + case CNK: return "cnk"; } llvm_unreachable("Invalid OSType"); @@ -258,6 +261,8 @@ static Triple::VendorType parseVendor(StringRef VendorName) { .Case("apple", Triple::Apple) .Case("pc", Triple::PC) .Case("scei", Triple::SCEI) + .Case("bgp", Triple::BGP) + .Case("bgq", Triple::BGQ) .Default(Triple::UnknownVendor); } @@ -282,6 +287,7 @@ static Triple::OSType parseOS(StringRef OSName) { .StartsWith("minix", Triple::Minix) .StartsWith("rtems", Triple::RTEMS) .StartsWith("nacl", Triple::NativeClient) + .StartsWith("cnk", Triple::CNK) .Default(Triple::UnknownOS); } diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc index 418dc07..ddc1e0f 100644 --- a/lib/Support/Unix/Path.inc +++ b/lib/Support/Unix/Path.inc @@ -60,6 +60,11 @@ #include <mach-o/dyld.h> #endif +// For GNU Hurd +#if defined(__GNU__) && !defined(MAXPATHLEN) +# define MAXPATHLEN 4096 +#endif + // Put in a hack for Cygwin which falsely reports that the mkdtemp function // is available when it is not. 
#ifdef __CYGWIN__ @@ -256,7 +261,7 @@ Path::GetCurrentDirectory() { } #if defined(__FreeBSD__) || defined (__NetBSD__) || \ - defined(__OpenBSD__) || defined(__minix) + defined(__OpenBSD__) || defined(__minix) || defined(__FreeBSD_kernel__) static int test_dir(char buf[PATH_MAX], char ret[PATH_MAX], const char *dir, const char *bin) @@ -308,7 +313,7 @@ getprogpath(char ret[PATH_MAX], const char *bin) free(pv); return (NULL); } -#endif // __FreeBSD__ || __NetBSD__ +#endif // __FreeBSD__ || __NetBSD__ || __FreeBSD_kernel__ /// GetMainExecutable - Return the path to the main executable, given the /// value of argv[0] from program startup. @@ -325,7 +330,7 @@ Path Path::GetMainExecutable(const char *argv0, void *MainAddr) { return Path(link_path); } #elif defined(__FreeBSD__) || defined (__NetBSD__) || \ - defined(__OpenBSD__) || defined(__minix) + defined(__OpenBSD__) || defined(__minix) || defined(__FreeBSD_kernel__) char exe_path[PATH_MAX]; if (getprogpath(exe_path, argv0) != NULL) diff --git a/lib/Support/Unix/PathV2.inc b/lib/Support/Unix/PathV2.inc index 7d79947..7d259a3 100644 --- a/lib/Support/Unix/PathV2.inc +++ b/lib/Support/Unix/PathV2.inc @@ -46,6 +46,11 @@ #include <limits.h> #endif +// For GNU Hurd +#if defined(__GNU__) && !defined(PATH_MAX) +# define PATH_MAX 4096 +#endif + extern "C" int truncate (const char*, off_t); using namespace llvm; @@ -98,7 +103,12 @@ namespace sys { namespace fs { error_code current_path(SmallVectorImpl<char> &result) { +#ifdef MAXPATHLEN result.reserve(MAXPATHLEN); +#else +// For GNU Hurd + result.reserve(1024); +#endif while (true) { if (::getcwd(result.data(), result.capacity()) == 0) { diff --git a/lib/Support/Unix/Process.inc b/lib/Support/Unix/Process.inc index 5cdb11c..f640462 100644 --- a/lib/Support/Unix/Process.inc +++ b/lib/Support/Unix/Process.inc @@ -136,7 +136,7 @@ int Process::GetCurrentGroupId() { return getgid(); } -#ifdef HAVE_MACH_MACH_H +#if defined(HAVE_MACH_MACH_H) && !defined(__GNU__) #include <mach/mach.h> #endif @@ -150,7 +150,7 @@ void Process::PreventCoreFiles() { setrlimit(RLIMIT_CORE, &rlim); #endif -#ifdef HAVE_MACH_MACH_H +#if defined(HAVE_MACH_MACH_H) && !defined(__GNU__) // Disable crash reporting on Mac OS X 10.0-10.4 // get information about the original set of exception ports for the task @@ -290,6 +290,10 @@ const char *Process::OutputBold(bool bg) { return "\033[1m"; } +const char *Process::OutputReverse() { + return "\033[7m"; +} + const char *Process::ResetColor() { return "\033[0m"; } diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc index c3885e1..399f686 100644 --- a/lib/Support/Unix/Signals.inc +++ b/lib/Support/Unix/Signals.inc @@ -275,9 +275,9 @@ void llvm::sys::PrintStackTraceOnErrorSignal() { kern_return_t ret = task_set_exception_ports(self, mask, - NULL, + MACH_PORT_NULL, EXCEPTION_STATE_IDENTITY | MACH_EXCEPTION_CODES, - NULL); + THREAD_STATE_NONE); (void)ret; } #endif diff --git a/lib/Support/Windows/Process.inc b/lib/Support/Windows/Process.inc index 913b073..9a388b4 100644 --- a/lib/Support/Windows/Process.inc +++ b/lib/Support/Windows/Process.inc @@ -215,6 +215,38 @@ const char *Process::OutputColor(char code, bool bold, bool bg) { return 0; } +static WORD GetConsoleTextAttribute(HANDLE hConsoleOutput) { + CONSOLE_SCREEN_BUFFER_INFO info; + GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &info); + return info.wAttributes; +} + +const char *Process::OutputReverse() { + const WORD attributes + = GetConsoleTextAttribute(GetStdHandle(STD_OUTPUT_HANDLE)); + + const 
WORD foreground_mask = FOREGROUND_BLUE | FOREGROUND_GREEN | + FOREGROUND_RED | FOREGROUND_INTENSITY; + const WORD background_mask = BACKGROUND_BLUE | BACKGROUND_GREEN | + BACKGROUND_RED | BACKGROUND_INTENSITY; + const WORD color_mask = foreground_mask | background_mask; + + WORD new_attributes = + ((attributes & FOREGROUND_BLUE )?BACKGROUND_BLUE :0) | + ((attributes & FOREGROUND_GREEN )?BACKGROUND_GREEN :0) | + ((attributes & FOREGROUND_RED )?BACKGROUND_RED :0) | + ((attributes & FOREGROUND_INTENSITY)?BACKGROUND_INTENSITY:0) | + ((attributes & BACKGROUND_BLUE )?FOREGROUND_BLUE :0) | + ((attributes & BACKGROUND_GREEN )?FOREGROUND_GREEN :0) | + ((attributes & BACKGROUND_RED )?FOREGROUND_RED :0) | + ((attributes & BACKGROUND_INTENSITY)?FOREGROUND_INTENSITY:0) | + 0; + new_attributes = (attributes & ~color_mask) | (new_attributes & color_mask); + + SetConsoleTextAttribute(GetStdHandle(STD_OUTPUT_HANDLE), new_attributes); + return 0; +} + const char *Process::ResetColor() { SetConsoleTextAttribute(GetStdHandle(STD_OUTPUT_HANDLE), defaultColors()); return 0; diff --git a/lib/Support/YAMLParser.cpp b/lib/Support/YAMLParser.cpp new file mode 100644 index 0000000..330519f --- /dev/null +++ b/lib/Support/YAMLParser.cpp @@ -0,0 +1,2117 @@ +//===--- YAMLParser.cpp - Simple YAML parser ------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a YAML parser. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/YAMLParser.h" + +#include "llvm/ADT/ilist.h" +#include "llvm/ADT/ilist_node.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/SourceMgr.h" + +using namespace llvm; +using namespace yaml; + +enum UnicodeEncodingForm { + UEF_UTF32_LE, //< UTF-32 Little Endian + UEF_UTF32_BE, //< UTF-32 Big Endian + UEF_UTF16_LE, //< UTF-16 Little Endian + UEF_UTF16_BE, //< UTF-16 Big Endian + UEF_UTF8, //< UTF-8 or ascii. + UEF_Unknown //< Not a valid Unicode encoding. +}; + +/// EncodingInfo - Holds the encoding type and length of the byte order mark if +/// it exists. Length is in {0, 2, 3, 4}. +typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo; + +/// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode +/// encoding form of \a Input. +/// +/// @param Input A string of length 0 or more. +/// @returns An EncodingInfo indicating the Unicode encoding form of the input +/// and how long the byte order mark is if one exists. 
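+/// For example, input beginning with the bytes EF BB BF is reported as
+/// (UEF_UTF8, 3), while plain ASCII input is reported as (UEF_UTF8, 0).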
+static EncodingInfo getUnicodeEncoding(StringRef Input) { + if (Input.size() == 0) + return std::make_pair(UEF_Unknown, 0); + + switch (uint8_t(Input[0])) { + case 0x00: + if (Input.size() >= 4) { + if ( Input[1] == 0 + && uint8_t(Input[2]) == 0xFE + && uint8_t(Input[3]) == 0xFF) + return std::make_pair(UEF_UTF32_BE, 4); + if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) + return std::make_pair(UEF_UTF32_BE, 0); + } + + if (Input.size() >= 2 && Input[1] != 0) + return std::make_pair(UEF_UTF16_BE, 0); + return std::make_pair(UEF_Unknown, 0); + case 0xFF: + if ( Input.size() >= 4 + && uint8_t(Input[1]) == 0xFE + && Input[2] == 0 + && Input[3] == 0) + return std::make_pair(UEF_UTF32_LE, 4); + + if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) + return std::make_pair(UEF_UTF16_LE, 2); + return std::make_pair(UEF_Unknown, 0); + case 0xFE: + if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) + return std::make_pair(UEF_UTF16_BE, 2); + return std::make_pair(UEF_Unknown, 0); + case 0xEF: + if ( Input.size() >= 3 + && uint8_t(Input[1]) == 0xBB + && uint8_t(Input[2]) == 0xBF) + return std::make_pair(UEF_UTF8, 3); + return std::make_pair(UEF_Unknown, 0); + } + + // It could still be utf-32 or utf-16. + if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) + return std::make_pair(UEF_UTF32_LE, 0); + + if (Input.size() >= 2 && Input[1] == 0) + return std::make_pair(UEF_UTF16_LE, 0); + + return std::make_pair(UEF_UTF8, 0); +} + +namespace llvm { +namespace yaml { +/// Token - A single YAML token. +struct Token : ilist_node<Token> { + enum TokenKind { + TK_Error, // Uninitialized token. + TK_StreamStart, + TK_StreamEnd, + TK_VersionDirective, + TK_TagDirective, + TK_DocumentStart, + TK_DocumentEnd, + TK_BlockEntry, + TK_BlockEnd, + TK_BlockSequenceStart, + TK_BlockMappingStart, + TK_FlowEntry, + TK_FlowSequenceStart, + TK_FlowSequenceEnd, + TK_FlowMappingStart, + TK_FlowMappingEnd, + TK_Key, + TK_Value, + TK_Scalar, + TK_Alias, + TK_Anchor, + TK_Tag + } Kind; + + /// A string of length 0 or more whose begin() points to the logical location + /// of the token in the input. + StringRef Range; + + Token() : Kind(TK_Error) {} +}; +} +} + +namespace llvm { +template<> +struct ilist_sentinel_traits<Token> { + Token *createSentinel() const { + return &Sentinel; + } + static void destroySentinel(Token*) {} + + Token *provideInitialHead() const { return createSentinel(); } + Token *ensureHead(Token*) const { return createSentinel(); } + static void noteHead(Token*, Token*) {} + +private: + mutable Token Sentinel; +}; + +template<> +struct ilist_node_traits<Token> { + Token *createNode(const Token &V) { + return new (Alloc.Allocate<Token>()) Token(V); + } + static void deleteNode(Token *V) {} + + void addNodeToList(Token *) {} + void removeNodeFromList(Token *) {} + void transferNodesFromList(ilist_node_traits & /*SrcTraits*/, + ilist_iterator<Token> /*first*/, + ilist_iterator<Token> /*last*/) {} + + BumpPtrAllocator Alloc; +}; +} + +typedef ilist<Token> TokenQueueT; + +namespace { +/// @brief This struct is used to track simple keys. +/// +/// Simple keys are handled by creating an entry in SimpleKeys for each Token +/// which could legally be the start of a simple key. When peekNext is called, +/// if the Token To be returned is referenced by a SimpleKey, we continue +/// tokenizing until that potential simple key has either been found to not be +/// a simple key (we moved on to the next line or went further than 1024 chars). 
+/// Or when we run into a Value, and then insert a Key token (and possibly +/// others) before the SimpleKey's Tok. +struct SimpleKey { + TokenQueueT::iterator Tok; + unsigned Column; + unsigned Line; + unsigned FlowLevel; + bool IsRequired; + + bool operator ==(const SimpleKey &Other) { + return Tok == Other.Tok; + } +}; +} + +/// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit +/// subsequence and the subsequence's length in code units (uint8_t). +/// A length of 0 represents an error. +typedef std::pair<uint32_t, unsigned> UTF8Decoded; + +static UTF8Decoded decodeUTF8(StringRef Range) { + StringRef::iterator Position= Range.begin(); + StringRef::iterator End = Range.end(); + // 1 byte: [0x00, 0x7f] + // Bit pattern: 0xxxxxxx + if ((*Position & 0x80) == 0) { + return std::make_pair(*Position, 1); + } + // 2 bytes: [0x80, 0x7ff] + // Bit pattern: 110xxxxx 10xxxxxx + if (Position + 1 != End && + ((*Position & 0xE0) == 0xC0) && + ((*(Position + 1) & 0xC0) == 0x80)) { + uint32_t codepoint = ((*Position & 0x1F) << 6) | + (*(Position + 1) & 0x3F); + if (codepoint >= 0x80) + return std::make_pair(codepoint, 2); + } + // 3 bytes: [0x8000, 0xffff] + // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx + if (Position + 2 != End && + ((*Position & 0xF0) == 0xE0) && + ((*(Position + 1) & 0xC0) == 0x80) && + ((*(Position + 2) & 0xC0) == 0x80)) { + uint32_t codepoint = ((*Position & 0x0F) << 12) | + ((*(Position + 1) & 0x3F) << 6) | + (*(Position + 2) & 0x3F); + // Codepoints between 0xD800 and 0xDFFF are invalid, as + // they are high / low surrogate halves used by UTF-16. + if (codepoint >= 0x800 && + (codepoint < 0xD800 || codepoint > 0xDFFF)) + return std::make_pair(codepoint, 3); + } + // 4 bytes: [0x10000, 0x10FFFF] + // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + if (Position + 3 != End && + ((*Position & 0xF8) == 0xF0) && + ((*(Position + 1) & 0xC0) == 0x80) && + ((*(Position + 2) & 0xC0) == 0x80) && + ((*(Position + 3) & 0xC0) == 0x80)) { + uint32_t codepoint = ((*Position & 0x07) << 18) | + ((*(Position + 1) & 0x3F) << 12) | + ((*(Position + 2) & 0x3F) << 6) | + (*(Position + 3) & 0x3F); + if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) + return std::make_pair(codepoint, 4); + } + return std::make_pair(0, 0); +} + +namespace llvm { +namespace yaml { +/// @brief Scans YAML tokens from a MemoryBuffer. +class Scanner { +public: + Scanner(const StringRef Input, SourceMgr &SM); + + /// @brief Parse the next token and return it without popping it. + Token &peekNext(); + + /// @brief Parse the next token and pop it from the queue. + Token getNext(); + + void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, + ArrayRef<SMRange> Ranges = ArrayRef<SMRange>()) { + SM.PrintMessage(Loc, Kind, Message, Ranges); + } + + void setError(const Twine &Message, StringRef::iterator Position) { + if (Current >= End) + Current = End - 1; + + // Don't print out more errors after the first one we encounter. The rest + // are just the result of the first, and have no meaning. + if (!Failed) + printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message); + Failed = true; + } + + void setError(const Twine &Message) { + setError(Message, Current); + } + + /// @brief Returns true if an error occurred while parsing. + bool failed() { + return Failed; + } + +private: + StringRef currentInput() { + return StringRef(Current, End - Current); + } + + /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting + /// at \a Position. 
+  ///
+  /// If the UTF-8 code units starting at Position do not form a well-formed
+  /// code unit subsequence, then the Unicode scalar value is 0, and the length
+  /// is 0.
+  UTF8Decoded decodeUTF8(StringRef::iterator Position) {
+    return ::decodeUTF8(StringRef(Position, End - Position));
+  }
+
+  // The following functions are based on the grammar rules in the YAML spec.
+  // The style of the function names is meant to closely match how they are
+  // written in the spec. The number within the [] is the number of the
+  // grammar rule in the spec.
+  //
+  // See 4.2 [Production Naming Conventions] for the meaning of the prefixes.
+  //
+  // c-
+  //   A production starting and ending with a special character.
+  // b-
+  //   A production matching a single line break.
+  // nb-
+  //   A production starting and ending with a non-break character.
+  // s-
+  //   A production starting and ending with a white space character.
+  // ns-
+  //   A production starting and ending with a non-space character.
+  // l-
+  //   A production matching complete line(s).
+
+  /// @brief Skip a single nb-char[27] starting at Position.
+  ///
+  /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE]
+  ///            | [0xFF00-0xFFFD] | [0x10000-0x10FFFF]
+  ///
+  /// @returns The code unit after the nb-char, or Position if it's not an
+  ///          nb-char.
+  StringRef::iterator skip_nb_char(StringRef::iterator Position);
+
+  /// @brief Skip a single b-break[28] starting at Position.
+  ///
+  /// A b-break is 0xD 0xA | 0xD | 0xA
+  ///
+  /// @returns The code unit after the b-break, or Position if it's not a
+  ///          b-break.
+  StringRef::iterator skip_b_break(StringRef::iterator Position);
+
+  /// @brief Skip a single s-white[33] starting at Position.
+  ///
+  /// A s-white is 0x20 | 0x9
+  ///
+  /// @returns The code unit after the s-white, or Position if it's not a
+  ///          s-white.
+  StringRef::iterator skip_s_white(StringRef::iterator Position);
+
+  /// @brief Skip a single ns-char[34] starting at Position.
+  ///
+  /// A ns-char is nb-char - s-white
+  ///
+  /// @returns The code unit after the ns-char, or Position if it's not a
+  ///          ns-char.
+  StringRef::iterator skip_ns_char(StringRef::iterator Position);
+
+  typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator);
+  /// @brief Skip minimal well-formed code unit subsequences until Func
+  ///        returns its input.
+  ///
+  /// @returns The code unit after the last minimal well-formed code unit
+  ///          subsequence that Func accepted.
+  StringRef::iterator skip_while( SkipWhileFunc Func
+                                , StringRef::iterator Position);
+
+  /// @brief Scan ns-uri-char[39]s starting at Cur.
+  ///
+  /// This updates Cur and Column while scanning.
+  ///
+  /// @returns A StringRef starting at Cur which covers the longest contiguous
+  ///          sequence of ns-uri-char.
+  StringRef scan_ns_uri_char();
+
+  /// @brief Scan ns-plain-one-line[133] starting at \a Cur.
+  StringRef scan_ns_plain_one_line();
+
+  /// @brief Consume a minimal well-formed code unit subsequence starting at
+  ///        \a Cur. Return false if it is not the same Unicode scalar value as
+  ///        \a Expected. This updates \a Column.
+  bool consume(uint32_t Expected);
+
+  /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column.
+  void skip(uint32_t Distance);
+
+  /// @brief Return true if the minimal well-formed code unit subsequence at
+  ///        Pos is whitespace or a new line.
+  bool isBlankOrBreak(StringRef::iterator Position);
+
+  /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey.
+ void saveSimpleKeyCandidate( TokenQueueT::iterator Tok + , unsigned AtColumn + , bool IsRequired); + + /// @brief Remove simple keys that can no longer be valid simple keys. + /// + /// Invalid simple keys are not on the current line or are further than 1024 + /// columns back. + void removeStaleSimpleKeyCandidates(); + + /// @brief Remove all simple keys on FlowLevel \a Level. + void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); + + /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd + /// tokens if needed. + bool unrollIndent(int ToColumn); + + /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint + /// if needed. + bool rollIndent( int ToColumn + , Token::TokenKind Kind + , TokenQueueT::iterator InsertPoint); + + /// @brief Skip whitespace and comments until the start of the next token. + void scanToNextToken(); + + /// @brief Must be the first token generated. + bool scanStreamStart(); + + /// @brief Generate tokens needed to close out the stream. + bool scanStreamEnd(); + + /// @brief Scan a %BLAH directive. + bool scanDirective(); + + /// @brief Scan a ... or ---. + bool scanDocumentIndicator(bool IsStart); + + /// @brief Scan a [ or { and generate the proper flow collection start token. + bool scanFlowCollectionStart(bool IsSequence); + + /// @brief Scan a ] or } and generate the proper flow collection end token. + bool scanFlowCollectionEnd(bool IsSequence); + + /// @brief Scan the , that separates entries in a flow collection. + bool scanFlowEntry(); + + /// @brief Scan the - that starts block sequence entries. + bool scanBlockEntry(); + + /// @brief Scan an explicit ? indicating a key. + bool scanKey(); + + /// @brief Scan an explicit : indicating a value. + bool scanValue(); + + /// @brief Scan a quoted scalar. + bool scanFlowScalar(bool IsDoubleQuoted); + + /// @brief Scan an unquoted scalar. + bool scanPlainScalar(); + + /// @brief Scan an Alias or Anchor starting with * or &. + bool scanAliasOrAnchor(bool IsAlias); + + /// @brief Scan a block scalar starting with | or >. + bool scanBlockScalar(bool IsLiteral); + + /// @brief Scan a tag of the form !stuff. + bool scanTag(); + + /// @brief Dispatch to the next scanning function based on \a *Cur. + bool fetchMoreTokens(); + + /// @brief The SourceMgr used for diagnostics and buffer management. + SourceMgr &SM; + + /// @brief The original input. + MemoryBuffer *InputBuffer; + + /// @brief The current position of the scanner. + StringRef::iterator Current; + + /// @brief The end of the input (one past the last character). + StringRef::iterator End; + + /// @brief Current YAML indentation level in spaces. + int Indent; + + /// @brief Current column number in Unicode code points. + unsigned Column; + + /// @brief Current line number. + unsigned Line; + + /// @brief How deep we are in flow style containers. 0 Means at block level. + unsigned FlowLevel; + + /// @brief Are we at the start of the stream? + bool IsStartOfStream; + + /// @brief Can the next token be the start of a simple key? + bool IsSimpleKeyAllowed; + + /// @brief Is the next token required to start a simple key? + bool IsSimpleKeyRequired; + + /// @brief True if an error has occurred. + bool Failed; + + /// @brief Queue of tokens. This is required to queue up tokens while looking + /// for the end of a simple key. And for cases where a single character + /// can produce multiple tokens (e.g. BlockEnd). + TokenQueueT TokenQueue; + + /// @brief Indentation levels. 
+ SmallVector<int, 4> Indents; + + /// @brief Potential simple keys. + SmallVector<SimpleKey, 4> SimpleKeys; +}; + +} // end namespace yaml +} // end namespace llvm + +/// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. +static void encodeUTF8( uint32_t UnicodeScalarValue + , SmallVectorImpl<char> &Result) { + if (UnicodeScalarValue <= 0x7F) { + Result.push_back(UnicodeScalarValue & 0x7F); + } else if (UnicodeScalarValue <= 0x7FF) { + uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); + uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); + Result.push_back(FirstByte); + Result.push_back(SecondByte); + } else if (UnicodeScalarValue <= 0xFFFF) { + uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); + uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); + uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); + Result.push_back(FirstByte); + Result.push_back(SecondByte); + Result.push_back(ThirdByte); + } else if (UnicodeScalarValue <= 0x10FFFF) { + uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); + uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); + uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); + uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); + Result.push_back(FirstByte); + Result.push_back(SecondByte); + Result.push_back(ThirdByte); + Result.push_back(FourthByte); + } +} + +bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { + SourceMgr SM; + Scanner scanner(Input, SM); + while (true) { + Token T = scanner.getNext(); + switch (T.Kind) { + case Token::TK_StreamStart: + OS << "Stream-Start: "; + break; + case Token::TK_StreamEnd: + OS << "Stream-End: "; + break; + case Token::TK_VersionDirective: + OS << "Version-Directive: "; + break; + case Token::TK_TagDirective: + OS << "Tag-Directive: "; + break; + case Token::TK_DocumentStart: + OS << "Document-Start: "; + break; + case Token::TK_DocumentEnd: + OS << "Document-End: "; + break; + case Token::TK_BlockEntry: + OS << "Block-Entry: "; + break; + case Token::TK_BlockEnd: + OS << "Block-End: "; + break; + case Token::TK_BlockSequenceStart: + OS << "Block-Sequence-Start: "; + break; + case Token::TK_BlockMappingStart: + OS << "Block-Mapping-Start: "; + break; + case Token::TK_FlowEntry: + OS << "Flow-Entry: "; + break; + case Token::TK_FlowSequenceStart: + OS << "Flow-Sequence-Start: "; + break; + case Token::TK_FlowSequenceEnd: + OS << "Flow-Sequence-End: "; + break; + case Token::TK_FlowMappingStart: + OS << "Flow-Mapping-Start: "; + break; + case Token::TK_FlowMappingEnd: + OS << "Flow-Mapping-End: "; + break; + case Token::TK_Key: + OS << "Key: "; + break; + case Token::TK_Value: + OS << "Value: "; + break; + case Token::TK_Scalar: + OS << "Scalar: "; + break; + case Token::TK_Alias: + OS << "Alias: "; + break; + case Token::TK_Anchor: + OS << "Anchor: "; + break; + case Token::TK_Tag: + OS << "Tag: "; + break; + case Token::TK_Error: + break; + } + OS << T.Range << "\n"; + if (T.Kind == Token::TK_StreamEnd) + break; + else if (T.Kind == Token::TK_Error) + return false; + } + return true; +} + +bool yaml::scanTokens(StringRef Input) { + llvm::SourceMgr SM; + llvm::yaml::Scanner scanner(Input, SM); + for (;;) { + llvm::yaml::Token T = scanner.getNext(); + if (T.Kind == Token::TK_StreamEnd) + break; + else if (T.Kind == Token::TK_Error) + return false; + } + return true; +} + +std::string yaml::escape(StringRef Input) { + std::string EscapedInput; + for (StringRef::iterator i = Input.begin(), e = 
Input.end(); i != e; ++i) { + if (*i == '\\') + EscapedInput += "\\\\"; + else if (*i == '"') + EscapedInput += "\\\""; + else if (*i == 0) + EscapedInput += "\\0"; + else if (*i == 0x07) + EscapedInput += "\\a"; + else if (*i == 0x08) + EscapedInput += "\\b"; + else if (*i == 0x09) + EscapedInput += "\\t"; + else if (*i == 0x0A) + EscapedInput += "\\n"; + else if (*i == 0x0B) + EscapedInput += "\\v"; + else if (*i == 0x0C) + EscapedInput += "\\f"; + else if (*i == 0x0D) + EscapedInput += "\\r"; + else if (*i == 0x1B) + EscapedInput += "\\e"; + else if (*i >= 0 && *i < 0x20) { // Control characters not handled above. + std::string HexStr = utohexstr(*i); + EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; + } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. + UTF8Decoded UnicodeScalarValue + = decodeUTF8(StringRef(i, Input.end() - i)); + if (UnicodeScalarValue.second == 0) { + // Found invalid char. + SmallString<4> Val; + encodeUTF8(0xFFFD, Val); + EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end()); + // FIXME: Error reporting. + return EscapedInput; + } + if (UnicodeScalarValue.first == 0x85) + EscapedInput += "\\N"; + else if (UnicodeScalarValue.first == 0xA0) + EscapedInput += "\\_"; + else if (UnicodeScalarValue.first == 0x2028) + EscapedInput += "\\L"; + else if (UnicodeScalarValue.first == 0x2029) + EscapedInput += "\\P"; + else { + std::string HexStr = utohexstr(UnicodeScalarValue.first); + if (HexStr.size() <= 2) + EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; + else if (HexStr.size() <= 4) + EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; + else if (HexStr.size() <= 8) + EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; + } + i += UnicodeScalarValue.second - 1; + } else + EscapedInput.push_back(*i); + } + return EscapedInput; +} + +Scanner::Scanner(StringRef Input, SourceMgr &sm) + : SM(sm) + , Indent(-1) + , Column(0) + , Line(0) + , FlowLevel(0) + , IsStartOfStream(true) + , IsSimpleKeyAllowed(true) + , IsSimpleKeyRequired(false) + , Failed(false) { + InputBuffer = MemoryBuffer::getMemBuffer(Input, "YAML"); + SM.AddNewSourceBuffer(InputBuffer, SMLoc()); + Current = InputBuffer->getBufferStart(); + End = InputBuffer->getBufferEnd(); +} + +Token &Scanner::peekNext() { + // If the current token is a possible simple key, keep parsing until we + // can confirm. + bool NeedMore = false; + while (true) { + if (TokenQueue.empty() || NeedMore) { + if (!fetchMoreTokens()) { + TokenQueue.clear(); + TokenQueue.push_back(Token()); + return TokenQueue.front(); + } + } + assert(!TokenQueue.empty() && + "fetchMoreTokens lied about getting tokens!"); + + removeStaleSimpleKeyCandidates(); + SimpleKey SK; + SK.Tok = TokenQueue.front(); + if (std::find(SimpleKeys.begin(), SimpleKeys.end(), SK) + == SimpleKeys.end()) + break; + else + NeedMore = true; + } + return TokenQueue.front(); +} + +Token Scanner::getNext() { + Token Ret = peekNext(); + // TokenQueue can be empty if there was an error getting the next token. + if (!TokenQueue.empty()) + TokenQueue.pop_front(); + + // There cannot be any referenced Token's if the TokenQueue is empty. So do a + // quick deallocation of them all. + if (TokenQueue.empty()) { + TokenQueue.Alloc.Reset(); + } + + return Ret; +} + +StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { + // Check 7 bit c-printable - b-char. 
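+  // (That is, TAB or a printable ASCII byte in [0x20, 0x7E]; line breaks
+  // are handled separately by skip_b_break.)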
+  if (   *Position == 0x09
+      || (*Position >= 0x20 && *Position <= 0x7E))
+    return Position + 1;
+
+  // Check for valid UTF-8.
+  if (uint8_t(*Position) & 0x80) {
+    UTF8Decoded u8d = decodeUTF8(Position);
+    if (   u8d.second != 0
+        && u8d.first != 0xFEFF
+        && (   u8d.first == 0x85
+            || (   u8d.first >= 0xA0
+                && u8d.first <= 0xD7FF)
+            || (   u8d.first >= 0xE000
+                && u8d.first <= 0xFFFD)
+            || (   u8d.first >= 0x10000
+                && u8d.first <= 0x10FFFF)))
+      return Position + u8d.second;
+  }
+  return Position;
+}
+
+StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) {
+  if (*Position == 0x0D) {
+    if (Position + 1 != End && *(Position + 1) == 0x0A)
+      return Position + 2;
+    return Position + 1;
+  }
+
+  if (*Position == 0x0A)
+    return Position + 1;
+  return Position;
+}
+
+
+StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) {
+  if (Position == End)
+    return Position;
+  if (*Position == ' ' || *Position == '\t')
+    return Position + 1;
+  return Position;
+}
+
+StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) {
+  if (Position == End)
+    return Position;
+  if (*Position == ' ' || *Position == '\t')
+    return Position;
+  return skip_nb_char(Position);
+}
+
+StringRef::iterator Scanner::skip_while( SkipWhileFunc Func
+                                       , StringRef::iterator Position) {
+  while (true) {
+    StringRef::iterator i = (this->*Func)(Position);
+    if (i == Position)
+      break;
+    Position = i;
+  }
+  return Position;
+}
+
+// ns-hex-digit is a hexadecimal digit only: [0-9a-fA-F].
+static bool is_ns_hex_digit(const char C) {
+  return    (C >= '0' && C <= '9')
+         || (C >= 'a' && C <= 'f')
+         || (C >= 'A' && C <= 'F');
+}
+
+static bool is_ns_word_char(const char C) {
+  return    C == '-'
+         || (C >= 'a' && C <= 'z')
+         || (C >= 'A' && C <= 'Z');
+}
+
+StringRef Scanner::scan_ns_uri_char() {
+  StringRef::iterator Start = Current;
+  while (true) {
+    if (Current == End)
+      break;
+    if ((   *Current == '%'
+         && Current + 2 < End
+         && is_ns_hex_digit(*(Current + 1))
+         && is_ns_hex_digit(*(Current + 2)))
+        || is_ns_word_char(*Current)
+        || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]")
+           != StringRef::npos) {
+      ++Current;
+      ++Column;
+    } else
+      break;
+  }
+  return StringRef(Start, Current - Start);
+}
+
+StringRef Scanner::scan_ns_plain_one_line() {
+  StringRef::iterator start = Current;
+  // The first character must already be verified.
+  ++Current;
+  while (true) {
+    if (Current == End) {
+      break;
+    } else if (*Current == ':') {
+      // Check if the next character is a ns-char.
+      if (Current + 1 == End)
+        break;
+      StringRef::iterator i = skip_ns_char(Current + 1);
+      if (Current + 1 != i) {
+        Current = i;
+        Column += 2; // Consume both the ':' and ns-char.
+      } else
+        break;
+    } else if (*Current == '#') {
+      // Check if the previous character was a ns-char.
+ // The & 0x80 check is to check for the trailing byte of a utf-8 + if (*(Current - 1) & 0x80 || skip_ns_char(Current - 1) == Current) { + ++Current; + ++Column; + } else + break; + } else { + StringRef::iterator i = skip_nb_char(Current); + if (i == Current) + break; + Current = i; + ++Column; + } + } + return StringRef(start, Current - start); +} + +bool Scanner::consume(uint32_t Expected) { + if (Expected >= 0x80) + report_fatal_error("Not dealing with this yet"); + if (Current == End) + return false; + if (uint8_t(*Current) >= 0x80) + report_fatal_error("Not dealing with this yet"); + if (uint8_t(*Current) == Expected) { + ++Current; + ++Column; + return true; + } + return false; +} + +void Scanner::skip(uint32_t Distance) { + Current += Distance; + Column += Distance; +} + +bool Scanner::isBlankOrBreak(StringRef::iterator Position) { + if (Position == End) + return false; + if ( *Position == ' ' || *Position == '\t' + || *Position == '\r' || *Position == '\n') + return true; + return false; +} + +void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok + , unsigned AtColumn + , bool IsRequired) { + if (IsSimpleKeyAllowed) { + SimpleKey SK; + SK.Tok = Tok; + SK.Line = Line; + SK.Column = AtColumn; + SK.IsRequired = IsRequired; + SK.FlowLevel = FlowLevel; + SimpleKeys.push_back(SK); + } +} + +void Scanner::removeStaleSimpleKeyCandidates() { + for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); + i != SimpleKeys.end();) { + if (i->Line != Line || i->Column + 1024 < Column) { + if (i->IsRequired) + setError( "Could not find expected : for simple key" + , i->Tok->Range.begin()); + i = SimpleKeys.erase(i); + } else + ++i; + } +} + +void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { + if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) + SimpleKeys.pop_back(); +} + +bool Scanner::unrollIndent(int ToColumn) { + Token T; + // Indentation is ignored in flow. + if (FlowLevel != 0) + return true; + + while (Indent > ToColumn) { + T.Kind = Token::TK_BlockEnd; + T.Range = StringRef(Current, 1); + TokenQueue.push_back(T); + Indent = Indents.pop_back_val(); + } + + return true; +} + +bool Scanner::rollIndent( int ToColumn + , Token::TokenKind Kind + , TokenQueueT::iterator InsertPoint) { + if (FlowLevel) + return true; + if (Indent < ToColumn) { + Indents.push_back(Indent); + Indent = ToColumn; + + Token T; + T.Kind = Kind; + T.Range = StringRef(Current, 0); + TokenQueue.insert(InsertPoint, T); + } + return true; +} + +void Scanner::scanToNextToken() { + while (true) { + while (*Current == ' ' || *Current == '\t') { + skip(1); + } + + // Skip comment. + if (*Current == '#') { + while (true) { + // This may skip more than one byte, thus Column is only incremented + // for code points. + StringRef::iterator i = skip_nb_char(Current); + if (i == Current) + break; + Current = i; + ++Column; + } + } + + // Skip EOL. + StringRef::iterator i = skip_b_break(Current); + if (i == Current) + break; + Current = i; + ++Line; + Column = 0; + // New lines may start a simple key. + if (!FlowLevel) + IsSimpleKeyAllowed = true; + } +} + +bool Scanner::scanStreamStart() { + IsStartOfStream = false; + + EncodingInfo EI = getUnicodeEncoding(currentInput()); + + Token T; + T.Kind = Token::TK_StreamStart; + T.Range = StringRef(Current, EI.second); + TokenQueue.push_back(T); + Current += EI.second; + return true; +} + +bool Scanner::scanStreamEnd() { + // Force an ending new line if one isn't present. 
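+  // The stream-end token should begin its own line at column 0, so
+  // synthesize a final line break if the input did not end with one.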
+ if (Column != 0) { + Column = 0; + ++Line; + } + + unrollIndent(-1); + SimpleKeys.clear(); + IsSimpleKeyAllowed = false; + + Token T; + T.Kind = Token::TK_StreamEnd; + T.Range = StringRef(Current, 0); + TokenQueue.push_back(T); + return true; +} + +bool Scanner::scanDirective() { + // Reset the indentation level. + unrollIndent(-1); + SimpleKeys.clear(); + IsSimpleKeyAllowed = false; + + StringRef::iterator Start = Current; + consume('%'); + StringRef::iterator NameStart = Current; + Current = skip_while(&Scanner::skip_ns_char, Current); + StringRef Name(NameStart, Current - NameStart); + Current = skip_while(&Scanner::skip_s_white, Current); + + if (Name == "YAML") { + Current = skip_while(&Scanner::skip_ns_char, Current); + Token T; + T.Kind = Token::TK_VersionDirective; + T.Range = StringRef(Start, Current - Start); + TokenQueue.push_back(T); + return true; + } + return false; +} + +bool Scanner::scanDocumentIndicator(bool IsStart) { + unrollIndent(-1); + SimpleKeys.clear(); + IsSimpleKeyAllowed = false; + + Token T; + T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; + T.Range = StringRef(Current, 3); + skip(3); + TokenQueue.push_back(T); + return true; +} + +bool Scanner::scanFlowCollectionStart(bool IsSequence) { + Token T; + T.Kind = IsSequence ? Token::TK_FlowSequenceStart + : Token::TK_FlowMappingStart; + T.Range = StringRef(Current, 1); + skip(1); + TokenQueue.push_back(T); + + // [ and { may begin a simple key. + saveSimpleKeyCandidate(TokenQueue.back(), Column - 1, false); + + // And may also be followed by a simple key. + IsSimpleKeyAllowed = true; + ++FlowLevel; + return true; +} + +bool Scanner::scanFlowCollectionEnd(bool IsSequence) { + removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); + IsSimpleKeyAllowed = false; + Token T; + T.Kind = IsSequence ? Token::TK_FlowSequenceEnd + : Token::TK_FlowMappingEnd; + T.Range = StringRef(Current, 1); + skip(1); + TokenQueue.push_back(T); + if (FlowLevel) + --FlowLevel; + return true; +} + +bool Scanner::scanFlowEntry() { + removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); + IsSimpleKeyAllowed = true; + Token T; + T.Kind = Token::TK_FlowEntry; + T.Range = StringRef(Current, 1); + skip(1); + TokenQueue.push_back(T); + return true; +} + +bool Scanner::scanBlockEntry() { + rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); + removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); + IsSimpleKeyAllowed = true; + Token T; + T.Kind = Token::TK_BlockEntry; + T.Range = StringRef(Current, 1); + skip(1); + TokenQueue.push_back(T); + return true; +} + +bool Scanner::scanKey() { + if (!FlowLevel) + rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); + + removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); + IsSimpleKeyAllowed = !FlowLevel; + + Token T; + T.Kind = Token::TK_Key; + T.Range = StringRef(Current, 1); + skip(1); + TokenQueue.push_back(T); + return true; +} + +bool Scanner::scanValue() { + // If the previous token could have been a simple key, insert the key token + // into the token queue. + if (!SimpleKeys.empty()) { + SimpleKey SK = SimpleKeys.pop_back_val(); + Token T; + T.Kind = Token::TK_Key; + T.Range = SK.Tok->Range; + TokenQueueT::iterator i, e; + for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { + if (i == SK.Tok) + break; + } + assert(i != e && "SimpleKey not in token queue!"); + i = TokenQueue.insert(i, T); + + // We may also need to add a Block-Mapping-Start token. 
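+      // The mapping begins at the saved key's column, not at the ':'
+      // currently being scanned, so the indent is rolled to SK.Column
+      // rather than Column.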
+ rollIndent(SK.Column, Token::TK_BlockMappingStart, i); + + IsSimpleKeyAllowed = false; + } else { + if (!FlowLevel) + rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); + IsSimpleKeyAllowed = !FlowLevel; + } + + Token T; + T.Kind = Token::TK_Value; + T.Range = StringRef(Current, 1); + skip(1); + TokenQueue.push_back(T); + return true; +} + +// Forbidding inlining improves performance by roughly 20%. +// FIXME: Remove once llvm optimizes this to the faster version without hints. +LLVM_ATTRIBUTE_NOINLINE static bool +wasEscaped(StringRef::iterator First, StringRef::iterator Position); + +// Returns whether a character at 'Position' was escaped with a leading '\'. +// 'First' specifies the position of the first character in the string. +static bool wasEscaped(StringRef::iterator First, + StringRef::iterator Position) { + assert(Position - 1 >= First); + StringRef::iterator I = Position - 1; + // We calculate the number of consecutive '\'s before the current position + // by iterating backwards through our string. + while (I >= First && *I == '\\') --I; + // (Position - 1 - I) now contains the number of '\'s before the current + // position. If it is odd, the character at 'Position' was escaped. + return (Position - 1 - I) % 2 == 1; +} + +bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { + StringRef::iterator Start = Current; + unsigned ColStart = Column; + if (IsDoubleQuoted) { + do { + ++Current; + while (Current != End && *Current != '"') + ++Current; + // Repeat until the previous character was not a '\' or was an escaped + // backslash. + } while (*(Current - 1) == '\\' && wasEscaped(Start + 1, Current)); + } else { + skip(1); + while (true) { + // Skip a ' followed by another '. + if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { + skip(2); + continue; + } else if (*Current == '\'') + break; + StringRef::iterator i = skip_nb_char(Current); + if (i == Current) { + i = skip_b_break(Current); + if (i == Current) + break; + Current = i; + Column = 0; + ++Line; + } else { + if (i == End) + break; + Current = i; + ++Column; + } + } + } + skip(1); // Skip ending quote. + Token T; + T.Kind = Token::TK_Scalar; + T.Range = StringRef(Start, Current - Start); + TokenQueue.push_back(T); + + saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); + + IsSimpleKeyAllowed = false; + + return true; +} + +bool Scanner::scanPlainScalar() { + StringRef::iterator Start = Current; + unsigned ColStart = Column; + unsigned LeadingBlanks = 0; + assert(Indent >= -1 && "Indent must be >= -1 !"); + unsigned indent = static_cast<unsigned>(Indent + 1); + while (true) { + if (*Current == '#') + break; + + while (!isBlankOrBreak(Current)) { + if ( FlowLevel && *Current == ':' + && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) { + setError("Found unexpected ':' while scanning a plain scalar", Current); + return false; + } + + // Check for the end of the plain scalar. + if ( (*Current == ':' && isBlankOrBreak(Current + 1)) + || ( FlowLevel + && (StringRef(Current, 1).find_first_of(",:?[]{}") + != StringRef::npos))) + break; + + StringRef::iterator i = skip_nb_char(Current); + if (i == Current) + break; + Current = i; + ++Column; + } + + // Are we at the end? + if (!isBlankOrBreak(Current)) + break; + + // Eat blanks. 
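+    // Scan ahead using Tmp rather than Current so the whitespace is only
+    // consumed if the scalar actually continues past it.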
+ StringRef::iterator Tmp = Current; + while (isBlankOrBreak(Tmp)) { + StringRef::iterator i = skip_s_white(Tmp); + if (i != Tmp) { + if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { + setError("Found invalid tab character in indentation", Tmp); + return false; + } + Tmp = i; + ++Column; + } else { + i = skip_b_break(Tmp); + if (!LeadingBlanks) + LeadingBlanks = 1; + Tmp = i; + Column = 0; + ++Line; + } + } + + if (!FlowLevel && Column < indent) + break; + + Current = Tmp; + } + if (Start == Current) { + setError("Got empty plain scalar", Start); + return false; + } + Token T; + T.Kind = Token::TK_Scalar; + T.Range = StringRef(Start, Current - Start); + TokenQueue.push_back(T); + + // Plain scalars can be simple keys. + saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); + + IsSimpleKeyAllowed = false; + + return true; +} + +bool Scanner::scanAliasOrAnchor(bool IsAlias) { + StringRef::iterator Start = Current; + unsigned ColStart = Column; + skip(1); + while(true) { + if ( *Current == '[' || *Current == ']' + || *Current == '{' || *Current == '}' + || *Current == ',' + || *Current == ':') + break; + StringRef::iterator i = skip_ns_char(Current); + if (i == Current) + break; + Current = i; + ++Column; + } + + if (Start == Current) { + setError("Got empty alias or anchor", Start); + return false; + } + + Token T; + T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; + T.Range = StringRef(Start, Current - Start); + TokenQueue.push_back(T); + + // Alias and anchors can be simple keys. + saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); + + IsSimpleKeyAllowed = false; + + return true; +} + +bool Scanner::scanBlockScalar(bool IsLiteral) { + StringRef::iterator Start = Current; + skip(1); // Eat | or > + while(true) { + StringRef::iterator i = skip_nb_char(Current); + if (i == Current) { + if (Column == 0) + break; + i = skip_b_break(Current); + if (i != Current) { + // We got a line break. + Column = 0; + ++Line; + Current = i; + continue; + } else { + // There was an error, which should already have been printed out. + return false; + } + } + Current = i; + ++Column; + } + + if (Start == Current) { + setError("Got empty block scalar", Start); + return false; + } + + Token T; + T.Kind = Token::TK_Scalar; + T.Range = StringRef(Start, Current - Start); + TokenQueue.push_back(T); + return true; +} + +bool Scanner::scanTag() { + StringRef::iterator Start = Current; + unsigned ColStart = Column; + skip(1); // Eat !. + if (Current == End || isBlankOrBreak(Current)); // An empty tag. + else if (*Current == '<') { + skip(1); + scan_ns_uri_char(); + if (!consume('>')) + return false; + } else { + // FIXME: Actually parse the c-ns-shorthand-tag rule. + Current = skip_while(&Scanner::skip_ns_char, Current); + } + + Token T; + T.Kind = Token::TK_Tag; + T.Range = StringRef(Start, Current - Start); + TokenQueue.push_back(T); + + // Tags can be simple keys. 
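+  // (e.g. "!!str foo: bar", where the key node begins at the tag.)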
+ saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); + + IsSimpleKeyAllowed = false; + + return true; +} + +bool Scanner::fetchMoreTokens() { + if (IsStartOfStream) + return scanStreamStart(); + + scanToNextToken(); + + if (Current == End) + return scanStreamEnd(); + + removeStaleSimpleKeyCandidates(); + + unrollIndent(Column); + + if (Column == 0 && *Current == '%') + return scanDirective(); + + if (Column == 0 && Current + 4 <= End + && *Current == '-' + && *(Current + 1) == '-' + && *(Current + 2) == '-' + && (Current + 3 == End || isBlankOrBreak(Current + 3))) + return scanDocumentIndicator(true); + + if (Column == 0 && Current + 4 <= End + && *Current == '.' + && *(Current + 1) == '.' + && *(Current + 2) == '.' + && (Current + 3 == End || isBlankOrBreak(Current + 3))) + return scanDocumentIndicator(false); + + if (*Current == '[') + return scanFlowCollectionStart(true); + + if (*Current == '{') + return scanFlowCollectionStart(false); + + if (*Current == ']') + return scanFlowCollectionEnd(true); + + if (*Current == '}') + return scanFlowCollectionEnd(false); + + if (*Current == ',') + return scanFlowEntry(); + + if (*Current == '-' && isBlankOrBreak(Current + 1)) + return scanBlockEntry(); + + if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1))) + return scanKey(); + + if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) + return scanValue(); + + if (*Current == '*') + return scanAliasOrAnchor(true); + + if (*Current == '&') + return scanAliasOrAnchor(false); + + if (*Current == '!') + return scanTag(); + + if (*Current == '|' && !FlowLevel) + return scanBlockScalar(true); + + if (*Current == '>' && !FlowLevel) + return scanBlockScalar(false); + + if (*Current == '\'') + return scanFlowScalar(false); + + if (*Current == '"') + return scanFlowScalar(true); + + // Get a plain scalar. + StringRef FirstChar(Current, 1); + if (!(isBlankOrBreak(Current) + || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) + || (*Current == '-' && !isBlankOrBreak(Current + 1)) + || (!FlowLevel && (*Current == '?' || *Current == ':') + && isBlankOrBreak(Current + 1)) + || (!FlowLevel && *Current == ':' + && Current + 2 < End + && *(Current + 1) == ':' + && !isBlankOrBreak(Current + 2))) + return scanPlainScalar(); + + setError("Unrecognized character while tokenizing."); + return false; +} + +Stream::Stream(StringRef Input, SourceMgr &SM) + : scanner(new Scanner(Input, SM)) + , CurrentDoc(0) {} + +Stream::~Stream() {} + +bool Stream::failed() { return scanner->failed(); } + +void Stream::printError(Node *N, const Twine &Msg) { + SmallVector<SMRange, 1> Ranges; + Ranges.push_back(N->getSourceRange()); + scanner->printError( N->getSourceRange().Start + , SourceMgr::DK_Error + , Msg + , Ranges); +} + +void Stream::handleYAMLDirective(const Token &t) { + // TODO: Ensure version is 1.x. +} + +document_iterator Stream::begin() { + if (CurrentDoc) + report_fatal_error("Can only iterate over the stream once"); + + // Skip Stream-Start. 
+ scanner->getNext(); + + CurrentDoc.reset(new Document(*this)); + return document_iterator(CurrentDoc); +} + +document_iterator Stream::end() { + return document_iterator(); +} + +void Stream::skip() { + for (document_iterator i = begin(), e = end(); i != e; ++i) + i->skip(); +} + +Node::Node(unsigned int Type, OwningPtr<Document> &D, StringRef A) + : Doc(D) + , TypeID(Type) + , Anchor(A) { + SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); + SourceRange = SMRange(Start, Start); +} + +Token &Node::peekNext() { + return Doc->peekNext(); +} + +Token Node::getNext() { + return Doc->getNext(); +} + +Node *Node::parseBlockNode() { + return Doc->parseBlockNode(); +} + +BumpPtrAllocator &Node::getAllocator() { + return Doc->NodeAllocator; +} + +void Node::setError(const Twine &Msg, Token &Tok) const { + Doc->setError(Msg, Tok); +} + +bool Node::failed() const { + return Doc->failed(); +} + + + +StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { + // TODO: Handle newlines properly. We need to remove leading whitespace. + if (Value[0] == '"') { // Double quoted. + // Pull off the leading and trailing "s. + StringRef UnquotedValue = Value.substr(1, Value.size() - 2); + // Search for characters that would require unescaping the value. + StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n"); + if (i != StringRef::npos) + return unescapeDoubleQuoted(UnquotedValue, i, Storage); + return UnquotedValue; + } else if (Value[0] == '\'') { // Single quoted. + // Pull off the leading and trailing 's. + StringRef UnquotedValue = Value.substr(1, Value.size() - 2); + StringRef::size_type i = UnquotedValue.find('\''); + if (i != StringRef::npos) { + // We're going to need Storage. + Storage.clear(); + Storage.reserve(UnquotedValue.size()); + for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { + StringRef Valid(UnquotedValue.begin(), i); + Storage.insert(Storage.end(), Valid.begin(), Valid.end()); + Storage.push_back('\''); + UnquotedValue = UnquotedValue.substr(i + 2); + } + Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); + return StringRef(Storage.begin(), Storage.size()); + } + return UnquotedValue; + } + // Plain or block. + size_t trimtrail = Value.rfind(' '); + return Value.drop_back( + trimtrail == StringRef::npos ? 0 : Value.size() - trimtrail); +} + +StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue + , StringRef::size_type i + , SmallVectorImpl<char> &Storage) + const { + // Use Storage to build proper value. + Storage.clear(); + Storage.reserve(UnquotedValue.size()); + for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { + // Insert all previous chars into Storage. + StringRef Valid(UnquotedValue.begin(), i); + Storage.insert(Storage.end(), Valid.begin(), Valid.end()); + // Chop off inserted chars. + UnquotedValue = UnquotedValue.substr(i); + + assert(!UnquotedValue.empty() && "Can't be empty!"); + + // Parse escape or line break. + switch (UnquotedValue[0]) { + case '\r': + case '\n': + Storage.push_back('\n'); + if ( UnquotedValue.size() > 1 + && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) + UnquotedValue = UnquotedValue.substr(1); + UnquotedValue = UnquotedValue.substr(1); + break; + default: + if (UnquotedValue.size() == 1) + // TODO: Report error. 
+ break; + UnquotedValue = UnquotedValue.substr(1); + switch (UnquotedValue[0]) { + default: { + Token T; + T.Range = StringRef(UnquotedValue.begin(), 1); + setError("Unrecognized escape code!", T); + return ""; + } + case '\r': + case '\n': + // Remove the new line. + if ( UnquotedValue.size() > 1 + && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) + UnquotedValue = UnquotedValue.substr(1); + // If this was just a single byte newline, it will get skipped + // below. + break; + case '0': + Storage.push_back(0x00); + break; + case 'a': + Storage.push_back(0x07); + break; + case 'b': + Storage.push_back(0x08); + break; + case 't': + case 0x09: + Storage.push_back(0x09); + break; + case 'n': + Storage.push_back(0x0A); + break; + case 'v': + Storage.push_back(0x0B); + break; + case 'f': + Storage.push_back(0x0C); + break; + case 'r': + Storage.push_back(0x0D); + break; + case 'e': + Storage.push_back(0x1B); + break; + case ' ': + Storage.push_back(0x20); + break; + case '"': + Storage.push_back(0x22); + break; + case '/': + Storage.push_back(0x2F); + break; + case '\\': + Storage.push_back(0x5C); + break; + case 'N': + encodeUTF8(0x85, Storage); + break; + case '_': + encodeUTF8(0xA0, Storage); + break; + case 'L': + encodeUTF8(0x2028, Storage); + break; + case 'P': + encodeUTF8(0x2029, Storage); + break; + case 'x': { + if (UnquotedValue.size() < 3) + // TODO: Report error. + break; + unsigned int UnicodeScalarValue; + UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue); + encodeUTF8(UnicodeScalarValue, Storage); + UnquotedValue = UnquotedValue.substr(2); + break; + } + case 'u': { + if (UnquotedValue.size() < 5) + // TODO: Report error. + break; + unsigned int UnicodeScalarValue; + UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue); + encodeUTF8(UnicodeScalarValue, Storage); + UnquotedValue = UnquotedValue.substr(4); + break; + } + case 'U': { + if (UnquotedValue.size() < 9) + // TODO: Report error. + break; + unsigned int UnicodeScalarValue; + UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue); + encodeUTF8(UnicodeScalarValue, Storage); + UnquotedValue = UnquotedValue.substr(8); + break; + } + } + UnquotedValue = UnquotedValue.substr(1); + } + } + Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); + return StringRef(Storage.begin(), Storage.size()); +} + +Node *KeyValueNode::getKey() { + if (Key) + return Key; + // Handle implicit null keys. + { + Token &t = peekNext(); + if ( t.Kind == Token::TK_BlockEnd + || t.Kind == Token::TK_Value + || t.Kind == Token::TK_Error) { + return Key = new (getAllocator()) NullNode(Doc); + } + if (t.Kind == Token::TK_Key) + getNext(); // skip TK_Key. + } + + // Handle explicit null keys. + Token &t = peekNext(); + if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { + return Key = new (getAllocator()) NullNode(Doc); + } + + // We've got a normal key. + return Key = parseBlockNode(); +} + +Node *KeyValueNode::getValue() { + if (Value) + return Value; + getKey()->skip(); + if (failed()) + return Value = new (getAllocator()) NullNode(Doc); + + // Handle implicit null values. 
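+  // (e.g. an explicit "? key" entry with no ": value" part, or a flow
+  //  mapping such as "{a, b}".)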
+ { + Token &t = peekNext(); + if ( t.Kind == Token::TK_BlockEnd + || t.Kind == Token::TK_FlowMappingEnd + || t.Kind == Token::TK_Key + || t.Kind == Token::TK_FlowEntry + || t.Kind == Token::TK_Error) { + return Value = new (getAllocator()) NullNode(Doc); + } + + if (t.Kind != Token::TK_Value) { + setError("Unexpected token in Key Value.", t); + return Value = new (getAllocator()) NullNode(Doc); + } + getNext(); // skip TK_Value. + } + + // Handle explicit null values. + Token &t = peekNext(); + if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) { + return Value = new (getAllocator()) NullNode(Doc); + } + + // We got a normal value. + return Value = parseBlockNode(); +} + +void MappingNode::increment() { + if (failed()) { + IsAtEnd = true; + CurrentEntry = 0; + return; + } + if (CurrentEntry) { + CurrentEntry->skip(); + if (Type == MT_Inline) { + IsAtEnd = true; + CurrentEntry = 0; + return; + } + } + Token T = peekNext(); + if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) { + // KeyValueNode eats the TK_Key. That way it can detect null keys. + CurrentEntry = new (getAllocator()) KeyValueNode(Doc); + } else if (Type == MT_Block) { + switch (T.Kind) { + case Token::TK_BlockEnd: + getNext(); + IsAtEnd = true; + CurrentEntry = 0; + break; + default: + setError("Unexpected token. Expected Key or Block End", T); + case Token::TK_Error: + IsAtEnd = true; + CurrentEntry = 0; + } + } else { + switch (T.Kind) { + case Token::TK_FlowEntry: + // Eat the flow entry and recurse. + getNext(); + return increment(); + case Token::TK_FlowMappingEnd: + getNext(); + case Token::TK_Error: + // Set this to end iterator. + IsAtEnd = true; + CurrentEntry = 0; + break; + default: + setError( "Unexpected token. Expected Key, Flow Entry, or Flow " + "Mapping End." + , T); + IsAtEnd = true; + CurrentEntry = 0; + } + } +} + +void SequenceNode::increment() { + if (failed()) { + IsAtEnd = true; + CurrentEntry = 0; + return; + } + if (CurrentEntry) + CurrentEntry->skip(); + Token T = peekNext(); + if (SeqType == ST_Block) { + switch (T.Kind) { + case Token::TK_BlockEntry: + getNext(); + CurrentEntry = parseBlockNode(); + if (CurrentEntry == 0) { // An error occurred. + IsAtEnd = true; + CurrentEntry = 0; + } + break; + case Token::TK_BlockEnd: + getNext(); + IsAtEnd = true; + CurrentEntry = 0; + break; + default: + setError( "Unexpected token. Expected Block Entry or Block End." + , T); + case Token::TK_Error: + IsAtEnd = true; + CurrentEntry = 0; + } + } else if (SeqType == ST_Indentless) { + switch (T.Kind) { + case Token::TK_BlockEntry: + getNext(); + CurrentEntry = parseBlockNode(); + if (CurrentEntry == 0) { // An error occurred. + IsAtEnd = true; + CurrentEntry = 0; + } + break; + default: + case Token::TK_Error: + IsAtEnd = true; + CurrentEntry = 0; + } + } else if (SeqType == ST_Flow) { + switch (T.Kind) { + case Token::TK_FlowEntry: + // Eat the flow entry and recurse. + getNext(); + WasPreviousTokenFlowEntry = true; + return increment(); + case Token::TK_FlowSequenceEnd: + getNext(); + case Token::TK_Error: + // Set this to end iterator. + IsAtEnd = true; + CurrentEntry = 0; + break; + case Token::TK_StreamEnd: + case Token::TK_DocumentEnd: + case Token::TK_DocumentStart: + setError("Could not find closing ]!", T); + // Set this to end iterator. + IsAtEnd = true; + CurrentEntry = 0; + break; + default: + if (!WasPreviousTokenFlowEntry) { + setError("Expected , between entries!", T); + IsAtEnd = true; + CurrentEntry = 0; + break; + } + // Otherwise it must be a flow entry. 
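+      // Parse it as a node and clear the flag so that the next entry
+      // requires a separating ','.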
+ CurrentEntry = parseBlockNode(); + if (!CurrentEntry) { + IsAtEnd = true; + } + WasPreviousTokenFlowEntry = false; + break; + } + } +} + +Document::Document(Stream &S) : stream(S), Root(0) { + if (parseDirectives()) + expectToken(Token::TK_DocumentStart); + Token &T = peekNext(); + if (T.Kind == Token::TK_DocumentStart) + getNext(); +} + +bool Document::skip() { + if (stream.scanner->failed()) + return false; + if (!Root) + getRoot(); + Root->skip(); + Token &T = peekNext(); + if (T.Kind == Token::TK_StreamEnd) + return false; + if (T.Kind == Token::TK_DocumentEnd) { + getNext(); + return skip(); + } + return true; +} + +Token &Document::peekNext() { + return stream.scanner->peekNext(); +} + +Token Document::getNext() { + return stream.scanner->getNext(); +} + +void Document::setError(const Twine &Message, Token &Location) const { + stream.scanner->setError(Message, Location.Range.begin()); +} + +bool Document::failed() const { + return stream.scanner->failed(); +} + +Node *Document::parseBlockNode() { + Token T = peekNext(); + // Handle properties. + Token AnchorInfo; +parse_property: + switch (T.Kind) { + case Token::TK_Alias: + getNext(); + return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); + case Token::TK_Anchor: + if (AnchorInfo.Kind == Token::TK_Anchor) { + setError("Already encountered an anchor for this node!", T); + return 0; + } + AnchorInfo = getNext(); // Consume TK_Anchor. + T = peekNext(); + goto parse_property; + case Token::TK_Tag: + getNext(); // Skip TK_Tag. + T = peekNext(); + goto parse_property; + default: + break; + } + + switch (T.Kind) { + case Token::TK_BlockEntry: + // We got an unindented BlockEntry sequence. This is not terminated with + // a BlockEnd. + // Don't eat the TK_BlockEntry, SequenceNode needs it. + return new (NodeAllocator) SequenceNode( stream.CurrentDoc + , AnchorInfo.Range.substr(1) + , SequenceNode::ST_Indentless); + case Token::TK_BlockSequenceStart: + getNext(); + return new (NodeAllocator) + SequenceNode( stream.CurrentDoc + , AnchorInfo.Range.substr(1) + , SequenceNode::ST_Block); + case Token::TK_BlockMappingStart: + getNext(); + return new (NodeAllocator) + MappingNode( stream.CurrentDoc + , AnchorInfo.Range.substr(1) + , MappingNode::MT_Block); + case Token::TK_FlowSequenceStart: + getNext(); + return new (NodeAllocator) + SequenceNode( stream.CurrentDoc + , AnchorInfo.Range.substr(1) + , SequenceNode::ST_Flow); + case Token::TK_FlowMappingStart: + getNext(); + return new (NodeAllocator) + MappingNode( stream.CurrentDoc + , AnchorInfo.Range.substr(1) + , MappingNode::MT_Flow); + case Token::TK_Scalar: + getNext(); + return new (NodeAllocator) + ScalarNode( stream.CurrentDoc + , AnchorInfo.Range.substr(1) + , T.Range); + case Token::TK_Key: + // Don't eat the TK_Key, KeyValueNode expects it. + return new (NodeAllocator) + MappingNode( stream.CurrentDoc + , AnchorInfo.Range.substr(1) + , MappingNode::MT_Inline); + case Token::TK_DocumentStart: + case Token::TK_DocumentEnd: + case Token::TK_StreamEnd: + default: + // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not + // !!null null. 
+ return new (NodeAllocator) NullNode(stream.CurrentDoc); + case Token::TK_Error: + return 0; + } + llvm_unreachable("Control flow shouldn't reach here."); + return 0; +} + +bool Document::parseDirectives() { + bool isDirective = false; + while (true) { + Token T = peekNext(); + if (T.Kind == Token::TK_TagDirective) { + handleTagDirective(getNext()); + isDirective = true; + } else if (T.Kind == Token::TK_VersionDirective) { + stream.handleYAMLDirective(getNext()); + isDirective = true; + } else + break; + } + return isDirective; +} + +bool Document::expectToken(int TK) { + Token T = getNext(); + if (T.Kind != TK) { + setError("Unexpected token", T); + return false; + } + return true; +} + +OwningPtr<Document> document_iterator::NullDoc; diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp index 72d3986..86cdca1 100644 --- a/lib/Support/raw_ostream.cpp +++ b/lib/Support/raw_ostream.cpp @@ -633,6 +633,19 @@ raw_ostream &raw_fd_ostream::resetColor() { return *this; } +raw_ostream &raw_fd_ostream::reverseColor() { + if (sys::Process::ColorNeedsFlush()) + flush(); + const char *colorcode = sys::Process::OutputReverse(); + if (colorcode) { + size_t len = strlen(colorcode); + write(colorcode, len); + // don't account colors towards output characters + pos -= len; + } + return *this; +} + bool raw_fd_ostream::is_displayed() const { return sys::Process::FileDescriptorIsDisplayed(FD); } diff --git a/lib/TableGen/Error.cpp b/lib/TableGen/Error.cpp index 5071ee7..1463b68 100644 --- a/lib/TableGen/Error.cpp +++ b/lib/TableGen/Error.cpp @@ -20,6 +20,22 @@ namespace llvm { SourceMgr SrcMgr; +void PrintWarning(SMLoc WarningLoc, const Twine &Msg) { + SrcMgr.PrintMessage(WarningLoc, SourceMgr::DK_Warning, Msg); +} + +void PrintWarning(const char *Loc, const Twine &Msg) { + SrcMgr.PrintMessage(SMLoc::getFromPointer(Loc), SourceMgr::DK_Warning, Msg); +} + +void PrintWarning(const Twine &Msg) { + errs() << "warning:" << Msg << "\n"; +} + +void PrintWarning(const TGError &Warning) { + PrintWarning(Warning.getLoc(), Warning.getMessage()); +} + void PrintError(SMLoc ErrorLoc, const Twine &Msg) { SrcMgr.PrintMessage(ErrorLoc, SourceMgr::DK_Error, Msg); } diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index b05fe62..9b0cb0c 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -38,9 +38,6 @@ def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true", def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", "Enable NEON instructions", [FeatureVFP3]>; -def FeatureNEON2 : SubtargetFeature<"neon2", "HasNEON2", "true", - "Enable Advanced SIMD2 instructions", - [FeatureNEON]>; def FeatureThumb2 : SubtargetFeature<"thumb2", "HasThumb2", "true", "Enable Thumb2 instructions">; def FeatureNoARM : SubtargetFeature<"noarm", "NoARM", "true", @@ -76,8 +73,6 @@ def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding", def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP", "true", "Use NEON for single precision FP">; -// Allow more precision in FP computation -def FPContractions : Predicate<"!TM.Options.NoExcessFPPrecision">; // Disable 32-bit to 16-bit narrowing for experimentation. 
def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true", diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp index ca30716..410790a 100644 --- a/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/ARMAsmPrinter.cpp @@ -16,7 +16,6 @@ #include "ARMAsmPrinter.h" #include "ARM.h" #include "ARMBuildAttrs.h" -#include "ARMBaseRegisterInfo.h" #include "ARMConstantPoolValue.h" #include "ARMMachineFunctionInfo.h" #include "ARMTargetMachine.h" @@ -35,7 +34,6 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCObjectStreamer.h" @@ -44,8 +42,6 @@ #include "llvm/Target/Mangler.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetOptions.h" -#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -732,8 +728,9 @@ void ARMAsmPrinter::emitAttributes() { if (Subtarget->hasNEON() && emitFPU) { /* NEON is not exactly a VFP architecture, but GAS emit one of * neon/neon-vfpv4/vfpv3/vfpv2 for .fpu parameters */ - if (Subtarget->hasNEON2()) - AttrEmitter->EmitTextAttribute(ARMBuildAttrs::Advanced_SIMD_arch, "neon-vfpv4"); + if (Subtarget->hasVFP4()) + AttrEmitter->EmitTextAttribute(ARMBuildAttrs::Advanced_SIMD_arch, + "neon-vfpv4"); else AttrEmitter->EmitTextAttribute(ARMBuildAttrs::Advanced_SIMD_arch, "neon"); /* If emitted for NEON, omit from VFP below, since you can have both @@ -1270,7 +1267,6 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { } // Darwin call instructions are just normal call instructions with different // clobber semantics (they clobber R9). - case ARM::BXr9_CALL: case ARM::BX_CALL: { { MCInst TmpInst; @@ -1292,7 +1288,6 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { } return; } - case ARM::tBXr9_CALL: case ARM::tBX_CALL: { { MCInst TmpInst; @@ -1315,7 +1310,6 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { } return; } - case ARM::BMOVPCRXr9_CALL: case ARM::BMOVPCRX_CALL: { { MCInst TmpInst; @@ -1343,7 +1337,6 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { } return; } - case ARM::BMOVPCBr9_CALL: case ARM::BMOVPCB_CALL: { { MCInst TmpInst; @@ -1371,7 +1364,6 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { } return; } - case ARM::t2BMOVPCBr9_CALL: case ARM::t2BMOVPCB_CALL: { { MCInst TmpInst; @@ -1984,10 +1976,10 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { } { MCInst TmpInst; - TmpInst.setOpcode(ARM::tLDRr); + TmpInst.setOpcode(ARM::tLDRi); TmpInst.addOperand(MCOperand::CreateReg(ARM::R7)); TmpInst.addOperand(MCOperand::CreateReg(SrcReg)); - TmpInst.addOperand(MCOperand::CreateReg(0)); + TmpInst.addOperand(MCOperand::CreateImm(0)); // Predicate. TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL)); TmpInst.addOperand(MCOperand::CreateReg(0)); diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h index 4b276c5..af3f75a 100644 --- a/lib/Target/ARM/ARMAsmPrinter.h +++ b/lib/Target/ARM/ARMAsmPrinter.h @@ -107,7 +107,7 @@ public: if (!Subtarget->isTargetDarwin()) return 0; return Subtarget->isThumb() ? 
- llvm::ARM::DW_ISA_ARM_thumb : llvm::ARM::DW_ISA_ARM_arm; + ARM::DW_ISA_ARM_thumb : ARM::DW_ISA_ARM_arm; } MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol); diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 366e2fa..c6280f8 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -13,10 +13,10 @@ #include "ARMBaseInstrInfo.h" #include "ARM.h" +#include "ARMBaseRegisterInfo.h" #include "ARMConstantPoolValue.h" #include "ARMHazardRecognizer.h" #include "ARMMachineFunctionInfo.h" -#include "ARMRegisterInfo.h" #include "MCTargetDesc/ARMAddressingModes.h" #include "llvm/Constants.h" #include "llvm/Function.h" @@ -680,29 +680,51 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } - // Generate instructions for VMOVQQ and VMOVQQQQ pseudos in place. - if (ARM::QQPRRegClass.contains(DestReg, SrcReg) || - ARM::QQQQPRRegClass.contains(DestReg, SrcReg)) { + // Handle register classes that require multiple instructions. + unsigned BeginIdx = 0; + unsigned SubRegs = 0; + unsigned Spacing = 1; + + // Use VORRq when possible. + if (ARM::QQPRRegClass.contains(DestReg, SrcReg)) + Opc = ARM::VORRq, BeginIdx = ARM::qsub_0, SubRegs = 2; + else if (ARM::QQQQPRRegClass.contains(DestReg, SrcReg)) + Opc = ARM::VORRq, BeginIdx = ARM::qsub_0, SubRegs = 4; + // Fall back to VMOVD. + else if (ARM::DPairRegClass.contains(DestReg, SrcReg)) + Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 2; + else if (ARM::DTripleRegClass.contains(DestReg, SrcReg)) + Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 3; + else if (ARM::DQuadRegClass.contains(DestReg, SrcReg)) + Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 4; + + else if (ARM::DPairSpcRegClass.contains(DestReg, SrcReg)) + Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 2, Spacing = 2; + else if (ARM::DTripleSpcRegClass.contains(DestReg, SrcReg)) + Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 3, Spacing = 2; + else if (ARM::DQuadSpcRegClass.contains(DestReg, SrcReg)) + Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 4, Spacing = 2; + + if (Opc) { const TargetRegisterInfo *TRI = &getRegisterInfo(); - assert(ARM::qsub_0 + 3 == ARM::qsub_3 && "Expected contiguous enum."); - unsigned EndSubReg = ARM::QQPRRegClass.contains(DestReg, SrcReg) ? - ARM::qsub_1 : ARM::qsub_3; - for (unsigned i = ARM::qsub_0, e = EndSubReg + 1; i != e; ++i) { - unsigned Dst = TRI->getSubReg(DestReg, i); - unsigned Src = TRI->getSubReg(SrcReg, i); - MachineInstrBuilder Mov = - AddDefaultPred(BuildMI(MBB, I, I->getDebugLoc(), get(ARM::VORRq)) - .addReg(Dst, RegState::Define) - .addReg(Src, getKillRegState(KillSrc)) - .addReg(Src, getKillRegState(KillSrc))); - if (i == EndSubReg) { - Mov->addRegisterDefined(DestReg, TRI); - if (KillSrc) - Mov->addRegisterKilled(SrcReg, TRI); - } + MachineInstrBuilder Mov; + for (unsigned i = 0; i != SubRegs; ++i) { + unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i*Spacing); + unsigned Src = TRI->getSubReg(SrcReg, BeginIdx + i*Spacing); + assert(Dst && Src && "Bad sub-register"); + Mov = AddDefaultPred(BuildMI(MBB, I, I->getDebugLoc(), get(Opc), Dst) + .addReg(Src)); + // VORR takes two source operands. + if (Opc == ARM::VORRq) + Mov.addReg(Src); } + // Add implicit super-register defs and kills to the last instruction. 
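+    // Mov still refers to the final sub-register copy, so attaching the
+    // implicit operands there marks the whole DestReg as defined (and
+    // SrcReg as killed) only once the entire sequence has executed.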
+ Mov->addRegisterDefined(DestReg, TRI); + if (KillSrc) + Mov->addRegisterKilled(SrcReg, TRI); return; } + llvm_unreachable("Impossible reg-to-reg copy"); } @@ -757,7 +779,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, llvm_unreachable("Unknown reg class!"); break; case 16: - if (ARM::QPRRegClass.hasSubClassEq(RC)) { + if (ARM::DPairRegClass.hasSubClassEq(RC)) { // Use aligned spills if the stack can be realigned. if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1q64)) @@ -907,7 +929,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, llvm_unreachable("Unknown reg class!"); break; case 16: - if (ARM::QPRRegClass.hasSubClassEq(RC)) { + if (ARM::DPairRegClass.hasSubClassEq(RC)) { if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q64), DestReg) .addFrameIndex(FI).addImm(16) @@ -1478,6 +1500,29 @@ int llvm::getMatchingCondBranchOpcode(int Opc) { llvm_unreachable("Unknown unconditional branch opcode!"); } +/// commuteInstruction - Handle commutable instructions. +MachineInstr * +ARMBaseInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { + switch (MI->getOpcode()) { + case ARM::MOVCCr: + case ARM::t2MOVCCr: { + // MOVCC can be commuted by inverting the condition. + unsigned PredReg = 0; + ARMCC::CondCodes CC = getInstrPredicate(MI, PredReg); + // MOVCC AL can't be inverted. Shouldn't happen. + if (CC == ARMCC::AL || PredReg != ARM::CPSR) + return NULL; + MI = TargetInstrInfoImpl::commuteInstruction(MI, NewMI); + if (!MI) + return NULL; + // After swapping the MOVCC operands, also invert the condition. + MI->getOperand(MI->findFirstPredOperandIdx()) + .setImm(ARMCC::getOppositeCondition(CC)); + return MI; + } + } + return TargetInstrInfoImpl::commuteInstruction(MI, NewMI); +} /// Map pseudo instructions that imply an 'S' bit onto real opcodes. Whether the /// instruction is encoded with an 'S' bit is determined by the optional CPSR @@ -1916,6 +1961,25 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr *UseMI, if (!MRI->hasOneNonDBGUse(Reg)) return false; + const MCInstrDesc &DefMCID = DefMI->getDesc(); + if (DefMCID.hasOptionalDef()) { + unsigned NumOps = DefMCID.getNumOperands(); + const MachineOperand &MO = DefMI->getOperand(NumOps-1); + if (MO.getReg() == ARM::CPSR && !MO.isDead()) + // If DefMI defines CPSR and it is not dead, it's obviously not safe + // to delete DefMI. + return false; + } + + const MCInstrDesc &UseMCID = UseMI->getDesc(); + if (UseMCID.hasOptionalDef()) { + unsigned NumOps = UseMCID.getNumOperands(); + if (UseMI->getOperand(NumOps-1).getReg() == ARM::CPSR) + // If the instruction sets the flag, do not attempt this optimization + // since it may change the semantics of the code. 
+ return false; + } + unsigned UseOpc = UseMI->getOpcode(); unsigned NewUseOpc = 0; uint32_t ImmVal = (uint32_t)DefMI->getOperand(1).getImm(); diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index 314e317..2fe8507 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -139,6 +139,8 @@ public: MachineInstr *duplicate(MachineInstr *Orig, MachineFunction &MF) const; + MachineInstr *commuteInstruction(MachineInstr*, bool=false) const; + virtual bool produceSameValue(const MachineInstr *MI0, const MachineInstr *MI1, const MachineRegisterInfo *MRI) const; diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 291369f..3907f75 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -15,7 +15,6 @@ #include "ARM.h" #include "ARMBaseInstrInfo.h" #include "ARMFrameLowering.h" -#include "ARMInstrInfo.h" #include "ARMMachineFunctionInfo.h" #include "ARMSubtarget.h" #include "MCTargetDesc/ARMAddressingModes.h" diff --git a/lib/Target/ARM/ARMCallingConv.h b/lib/Target/ARM/ARMCallingConv.h index 2b9c55d..0bd1c3e 100644 --- a/lib/Target/ARM/ARMCallingConv.h +++ b/lib/Target/ARM/ARMCallingConv.h @@ -17,7 +17,6 @@ #include "ARM.h" #include "ARMBaseInstrInfo.h" -#include "ARMRegisterInfo.h" #include "ARMSubtarget.h" #include "llvm/CallingConv.h" #include "llvm/CodeGen/CallingConvLower.h" diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td index d33364b..b9a2512 100644 --- a/lib/Target/ARM/ARMCallingConv.td +++ b/lib/Target/ARM/ARMCallingConv.td @@ -9,10 +9,6 @@ // This describes the calling conventions for ARM architecture. //===----------------------------------------------------------------------===// -/// CCIfSubtarget - Match if the current subtarget has a feature F. 
-class CCIfSubtarget<string F, CCAction A>: - CCIf<!strconcat("State.getTarget().getSubtarget<ARMSubtarget>().", F), A>; - /// CCIfAlign - Match of the original alignment of the arg class CCIfAlign<string Align, CCAction A>: CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>; diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp index e48d07a..bc681be 100644 --- a/lib/Target/ARM/ARMCodeEmitter.cpp +++ b/lib/Target/ARM/ARMCodeEmitter.cpp @@ -15,7 +15,7 @@ #define DEBUG_TYPE "jit" #include "ARM.h" #include "ARMConstantPoolValue.h" -#include "ARMInstrInfo.h" +#include "ARMBaseInstrInfo.h" #include "ARMRelocations.h" #include "ARMSubtarget.h" #include "ARMTargetMachine.h" @@ -46,7 +46,7 @@ namespace { class ARMCodeEmitter : public MachineFunctionPass { ARMJITInfo *JTI; - const ARMInstrInfo *II; + const ARMBaseInstrInfo *II; const TargetData *TD; const ARMSubtarget *Subtarget; TargetMachine &TM; @@ -66,7 +66,7 @@ namespace { public: ARMCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce) : MachineFunctionPass(ID), JTI(0), - II((const ARMInstrInfo *)tm.getInstrInfo()), + II((const ARMBaseInstrInfo *)tm.getInstrInfo()), TD(tm.getTargetData()), TM(tm), MCE(mce), MCPEs(0), MJTEs(0), IsPIC(TM.getRelocationModel() == Reloc::PIC_), IsThumb(false) {} @@ -383,9 +383,9 @@ bool ARMCodeEmitter::runOnMachineFunction(MachineFunction &MF) { assert((MF.getTarget().getRelocationModel() != Reloc::Default || MF.getTarget().getRelocationModel() != Reloc::Static) && "JIT relocation model must be set to static or default!"); - JTI = ((ARMTargetMachine &)MF.getTarget()).getJITInfo(); - II = ((const ARMTargetMachine &)MF.getTarget()).getInstrInfo(); - TD = ((const ARMTargetMachine &)MF.getTarget()).getTargetData(); + JTI = ((ARMBaseTargetMachine &)MF.getTarget()).getJITInfo(); + II = (const ARMBaseInstrInfo *)MF.getTarget().getInstrInfo(); + TD = MF.getTarget().getTargetData(); Subtarget = &TM.getSubtarget<ARMSubtarget>(); MCPEs = &MF.getConstantPool()->getConstants(); MJTEs = 0; @@ -917,9 +917,7 @@ void ARMCodeEmitter::emitPseudoInstruction(const MachineInstr &MI) { emitMiscBranchInstruction(MI); break; case ARM::BX_CALL: - case ARM::BMOVPCRX_CALL: - case ARM::BXr9_CALL: - case ARM::BMOVPCRXr9_CALL: { + case ARM::BMOVPCRX_CALL: { // First emit mov lr, pc unsigned Binary = 0x01a0e00f; Binary |= II->getPredicate(&MI) << ARMII::CondShift; diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp index 2cdfd1e..fc35c7c 100644 --- a/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -16,12 +16,12 @@ #define DEBUG_TYPE "arm-cp-islands" #include "ARM.h" #include "ARMMachineFunctionInfo.h" -#include "ARMInstrInfo.h" #include "Thumb2InstrInfo.h" #include "MCTargetDesc/ARMAddressingModes.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Support/Debug.h" @@ -209,8 +209,9 @@ namespace { } /// getMaxDisp - Returns the maximum displacement supported by MI. /// Correct for unknown alignment. + /// Conservatively subtract 2 bytes to handle weird alignment effects. unsigned getMaxDisp() const { - return KnownAlignment ? MaxDisp : MaxDisp - 2; + return (KnownAlignment ? 
MaxDisp : MaxDisp - 2) - 2; } }; @@ -266,7 +267,7 @@ namespace { MachineFunction *MF; MachineConstantPool *MCP; - const ARMInstrInfo *TII; + const ARMBaseInstrInfo *TII; const ARMSubtarget *STI; ARMFunctionInfo *AFI; bool isThumb; @@ -283,51 +284,52 @@ namespace { } private: - void DoInitialPlacement(std::vector<MachineInstr*> &CPEMIs); + void doInitialPlacement(std::vector<MachineInstr*> &CPEMIs); CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI); unsigned getCPELogAlign(const MachineInstr *CPEMI); - void JumpTableFunctionScan(); - void InitialFunctionScan(const std::vector<MachineInstr*> &CPEMIs); - MachineBasicBlock *SplitBlockBeforeInstr(MachineInstr *MI); - void UpdateForInsertedWaterBlock(MachineBasicBlock *NewBB); - void AdjustBBOffsetsAfter(MachineBasicBlock *BB); - bool DecrementOldEntry(unsigned CPI, MachineInstr* CPEMI); - int LookForExistingCPEntry(CPUser& U, unsigned UserOffset); - bool LookForWater(CPUser&U, unsigned UserOffset, water_iterator &WaterIter); - void CreateNewWater(unsigned CPUserIndex, unsigned UserOffset, + void scanFunctionJumpTables(); + void initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs); + MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI); + void updateForInsertedWaterBlock(MachineBasicBlock *NewBB); + void adjustBBOffsetsAfter(MachineBasicBlock *BB); + bool decrementCPEReferenceCount(unsigned CPI, MachineInstr* CPEMI); + int findInRangeCPEntry(CPUser& U, unsigned UserOffset); + bool findAvailableWater(CPUser&U, unsigned UserOffset, + water_iterator &WaterIter); + void createNewWater(unsigned CPUserIndex, unsigned UserOffset, MachineBasicBlock *&NewMBB); - bool HandleConstantPoolUser(unsigned CPUserIndex); - void RemoveDeadCPEMI(MachineInstr *CPEMI); - bool RemoveUnusedCPEntries(); - bool CPEIsInRange(MachineInstr *MI, unsigned UserOffset, - MachineInstr *CPEMI, unsigned Disp, bool NegOk, - bool DoDump = false); - bool WaterIsInRange(unsigned UserOffset, MachineBasicBlock *Water, + bool handleConstantPoolUser(unsigned CPUserIndex); + void removeDeadCPEMI(MachineInstr *CPEMI); + bool removeUnusedCPEntries(); + bool isCPEntryInRange(MachineInstr *MI, unsigned UserOffset, + MachineInstr *CPEMI, unsigned Disp, bool NegOk, + bool DoDump = false); + bool isWaterInRange(unsigned UserOffset, MachineBasicBlock *Water, CPUser &U, unsigned &Growth); - bool BBIsInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp); - bool FixUpImmediateBr(ImmBranch &Br); - bool FixUpConditionalBr(ImmBranch &Br); - bool FixUpUnconditionalBr(ImmBranch &Br); - bool UndoLRSpillRestore(); + bool isBBInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp); + bool fixupImmediateBr(ImmBranch &Br); + bool fixupConditionalBr(ImmBranch &Br); + bool fixupUnconditionalBr(ImmBranch &Br); + bool undoLRSpillRestore(); bool mayOptimizeThumb2Instruction(const MachineInstr *MI) const; - bool OptimizeThumb2Instructions(); - bool OptimizeThumb2Branches(); - bool ReorderThumb2JumpTables(); - bool OptimizeThumb2JumpTables(); - MachineBasicBlock *AdjustJTTargetBlockForward(MachineBasicBlock *BB, + bool optimizeThumb2Instructions(); + bool optimizeThumb2Branches(); + bool reorderThumb2JumpTables(); + bool optimizeThumb2JumpTables(); + MachineBasicBlock *adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB); - void ComputeBlockSize(MachineBasicBlock *MBB); - unsigned GetOffsetOf(MachineInstr *MI) const; - unsigned GetUserOffset(CPUser&) const; + void computeBlockSize(MachineBasicBlock *MBB); + unsigned getOffsetOf(MachineInstr 
*MI) const; + unsigned getUserOffset(CPUser&) const; void dumpBBs(); void verify(); - bool OffsetIsInRange(unsigned UserOffset, unsigned TrialOffset, + bool isOffsetInRange(unsigned UserOffset, unsigned TrialOffset, unsigned Disp, bool NegativeOK, bool IsSoImm = false); - bool OffsetIsInRange(unsigned UserOffset, unsigned TrialOffset, + bool isOffsetInRange(unsigned UserOffset, unsigned TrialOffset, const CPUser &U) { - return OffsetIsInRange(UserOffset, TrialOffset, + return isOffsetInRange(UserOffset, TrialOffset, U.getMaxDisp(), U.NegOk, U.IsSoImm); } }; @@ -345,11 +347,21 @@ void ARMConstantIslands::verify() { assert(BBInfo[MBBId].Offset % (1u << Align) == 0); assert(!MBBId || BBInfo[MBBId - 1].postOffset() <= BBInfo[MBBId].Offset); } + DEBUG(dbgs() << "Verifying " << CPUsers.size() << " CP users.\n"); for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) { CPUser &U = CPUsers[i]; - unsigned UserOffset = GetUserOffset(U); - assert(CPEIsInRange(U.MI, UserOffset, U.CPEMI, U.getMaxDisp(), U.NegOk) && - "Constant pool entry out of range!"); + unsigned UserOffset = getUserOffset(U); + // Verify offset using the real max displacement without the safety + // adjustment. + if (isCPEntryInRange(U.MI, UserOffset, U.CPEMI, U.getMaxDisp()+2, U.NegOk, + /* DoDump = */ true)) { + DEBUG(dbgs() << "OK\n"); + continue; + } + DEBUG(dbgs() << "Out of range.\n"); + dumpBBs(); + DEBUG(MF->dump()); + llvm_unreachable("Constant pool entry out of range!"); } #endif } @@ -382,7 +394,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { << MCP->getConstants().size() << " CP entries, aligned to " << MCP->getConstantPoolAlignment() << " bytes *****\n"); - TII = (const ARMInstrInfo*)MF->getTarget().getInstrInfo(); + TII = (const ARMBaseInstrInfo*)MF->getTarget().getInstrInfo(); AFI = MF->getInfo<ARMFunctionInfo>(); STI = &MF->getTarget().getSubtarget<ARMSubtarget>(); @@ -392,6 +404,9 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { HasFarJump = false; + // This pass invalidates liveness information when it splits basic blocks. + MF->getRegInfo().invalidateLiveness(); + // Renumber all of the machine basic blocks in the function, guaranteeing that // the numbers agree with the position of the block in the function. MF->RenumberBlocks(); @@ -400,8 +415,8 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { // of the TB[BH] instructions. bool MadeChange = false; if (isThumb2 && AdjustJumpTableBlocks) { - JumpTableFunctionScan(); - MadeChange |= ReorderThumb2JumpTables(); + scanFunctionJumpTables(); + MadeChange |= reorderThumb2JumpTables(); // Data is out of date, so clear it. It'll be re-computed later. T2JumpTables.clear(); // Blocks may have shifted around. Keep the numbering up to date. @@ -419,7 +434,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { // we put them all at the end of the function. std::vector<MachineInstr*> CPEMIs; if (!MCP->isEmpty()) - DoInitialPlacement(CPEMIs); + doInitialPlacement(CPEMIs); /// The next UID to take is the first unused one. AFI->initPICLabelUId(CPEMIs.size()); @@ -427,13 +442,13 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { // Do the initial scan of the function, building up information about the // sizes of each block, the location of all the water, and finding all of the // constant pool users. - InitialFunctionScan(CPEMIs); + initializeFunctionInfo(CPEMIs); CPEMIs.clear(); DEBUG(dumpBBs()); /// Remove dead constant pool entries. 
- MadeChange |= RemoveUnusedCPEntries(); + MadeChange |= removeUnusedCPEntries(); // Iteratively place constant pool entries and fix up branches until there // is no change. @@ -442,7 +457,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { DEBUG(dbgs() << "Beginning CP iteration #" << NoCPIters << '\n'); bool CPChange = false; for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) - CPChange |= HandleConstantPoolUser(i); + CPChange |= handleConstantPoolUser(i); if (CPChange && ++NoCPIters > 30) report_fatal_error("Constant Island pass failed to converge!"); DEBUG(dumpBBs()); @@ -454,7 +469,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { DEBUG(dbgs() << "Beginning BR iteration #" << NoBRIters << '\n'); bool BRChange = false; for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i) - BRChange |= FixUpImmediateBr(ImmBranches[i]); + BRChange |= fixupImmediateBr(ImmBranches[i]); if (BRChange && ++NoBRIters > 30) report_fatal_error("Branch Fix Up pass failed to converge!"); DEBUG(dumpBBs()); @@ -466,7 +481,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { // Shrink 32-bit Thumb2 branch, load, and store instructions. if (isThumb2 && !STI->prefers32BitThumb()) - MadeChange |= OptimizeThumb2Instructions(); + MadeChange |= optimizeThumb2Instructions(); // After a while, this might be made debug-only, but it is not expensive. verify(); @@ -474,7 +489,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { // If LR has been forced spilled and no far jump (i.e. BL) has been issued, // undo the spill / restore of LR if possible. if (isThumb && !HasFarJump && AFI->isLRSpilledForFarJump()) - MadeChange |= UndoLRSpillRestore(); + MadeChange |= undoLRSpillRestore(); // Save the mapping between original and cloned constpool entries. for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) { @@ -497,10 +512,10 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { return MadeChange; } -/// DoInitialPlacement - Perform the initial placement of the constant pool +/// doInitialPlacement - Perform the initial placement of the constant pool /// entries. To start with, we put them all at the end of the function. void -ARMConstantIslands::DoInitialPlacement(std::vector<MachineInstr*> &CPEMIs) { +ARMConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) { // Create the basic block to hold the CPE's. MachineBasicBlock *BB = MF->CreateMachineBasicBlock(); MF->push_back(BB); @@ -610,10 +625,10 @@ unsigned ARMConstantIslands::getCPELogAlign(const MachineInstr *CPEMI) { return Log2_32(Align); } -/// JumpTableFunctionScan - Do a scan of the function, building up +/// scanFunctionJumpTables - Do a scan of the function, building up /// information about the sizes of each block and the locations of all /// the jump tables. -void ARMConstantIslands::JumpTableFunctionScan() { +void ARMConstantIslands::scanFunctionJumpTables() { for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); MBBI != E; ++MBBI) { MachineBasicBlock &MBB = *MBBI; @@ -625,11 +640,11 @@ void ARMConstantIslands::JumpTableFunctionScan() { } } -/// InitialFunctionScan - Do the initial scan of the function, building up +/// initializeFunctionInfo - Do the initial scan of the function, building up /// information about the sizes of each block, the location of all the water, /// and finding all of the constant pool users. 
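// How the pieces below fit together (an illustrative gloss; block numbers and the constant are made up): "water" is any point in the block layout where a constant-pool island can be placed, and an "island" is a basic block holding nothing but CPE data. Roughly:
//
//   BB7:   ldr r0, [pc, #imm]   ; CP user with a limited displacement
//   ...
//   BB9:   b   BB10             ; the end of BB9 is "water"
//   CPE:   .word 0x12345678     ; island dropped into that water
//   BB10:  ...
//
// WaterList tracks the candidate blocks; NewWaterList remembers water the pass created itself by splitting blocks.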
void ARMConstantIslands:: -InitialFunctionScan(const std::vector<MachineInstr*> &CPEMIs) { +initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) { BBInfo.clear(); BBInfo.resize(MF->getNumBlockIDs()); @@ -638,14 +653,14 @@ InitialFunctionScan(const std::vector<MachineInstr*> &CPEMIs) { // alignment assumptions, as we don't know for sure the size of any // instructions in the inline assembly. for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) - ComputeBlockSize(I); + computeBlockSize(I); // The known bits of the entry block offset are determined by the function // alignment. BBInfo.front().KnownBits = MF->getAlignment(); // Compute block offsets and known bits. - AdjustBBOffsetsAfter(MF->begin()); + adjustBBOffsetsAfter(MF->begin()); // Now go back through the instructions and build up our data structures. for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); @@ -790,9 +805,9 @@ InitialFunctionScan(const std::vector<MachineInstr*> &CPEMIs) { } } -/// ComputeBlockSize - Compute the size and some alignment information for MBB. +/// computeBlockSize - Compute the size and some alignment information for MBB. /// This function updates BBInfo directly. -void ARMConstantIslands::ComputeBlockSize(MachineBasicBlock *MBB) { +void ARMConstantIslands::computeBlockSize(MachineBasicBlock *MBB) { BasicBlockInfo &BBI = BBInfo[MBB->getNumber()]; BBI.Size = 0; BBI.Unalign = 0; @@ -817,10 +832,10 @@ void ARMConstantIslands::ComputeBlockSize(MachineBasicBlock *MBB) { } } -/// GetOffsetOf - Return the current offset of the specified machine instruction +/// getOffsetOf - Return the current offset of the specified machine instruction /// from the start of the function. This offset changes as stuff is moved /// around inside the function. -unsigned ARMConstantIslands::GetOffsetOf(MachineInstr *MI) const { +unsigned ARMConstantIslands::getOffsetOf(MachineInstr *MI) const { MachineBasicBlock *MBB = MI->getParent(); // The offset is composed of two things: the sum of the sizes of all MBB's @@ -843,10 +858,10 @@ static bool CompareMBBNumbers(const MachineBasicBlock *LHS, return LHS->getNumber() < RHS->getNumber(); } -/// UpdateForInsertedWaterBlock - When a block is newly inserted into the +/// updateForInsertedWaterBlock - When a block is newly inserted into the /// machine function, it upsets all of the block numbers. Renumber the blocks /// and update the arrays that parallel this numbering. -void ARMConstantIslands::UpdateForInsertedWaterBlock(MachineBasicBlock *NewBB) { +void ARMConstantIslands::updateForInsertedWaterBlock(MachineBasicBlock *NewBB) { // Renumber the MBB's to keep them consecutive. NewBB->getParent()->RenumberBlocks(NewBB); @@ -866,7 +881,7 @@ void ARMConstantIslands::UpdateForInsertedWaterBlock(MachineBasicBlock *NewBB) { /// Split the basic block containing MI into two blocks, which are joined by /// an unconditional branch. Update data structures and renumber blocks to /// account for this change and returns the newly created block. -MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) { +MachineBasicBlock *ARMConstantIslands::splitBlockBeforeInstr(MachineInstr *MI) { MachineBasicBlock *OrigBB = MI->getParent(); // Create a new MBB for the code after the OrigBB. @@ -897,7 +912,7 @@ MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) { OrigBB->addSuccessor(NewBB); // Update internal data structures to account for the newly inserted MBB. 
- // This is almost the same as UpdateForInsertedWaterBlock, except that + // This is almost the same as updateForInsertedWaterBlock, except that // the Water goes after OrigBB, not NewBB. MF->RenumberBlocks(NewBB); @@ -924,23 +939,23 @@ MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) { // the new jump we added. (It should be possible to do this without // recounting everything, but it's very confusing, and this is rarely // executed.) - ComputeBlockSize(OrigBB); + computeBlockSize(OrigBB); // Figure out how large the NewMBB is. As the second half of the original // block, it may contain a tablejump. - ComputeBlockSize(NewBB); + computeBlockSize(NewBB); // All BBOffsets following these blocks must be modified. - AdjustBBOffsetsAfter(OrigBB); + adjustBBOffsetsAfter(OrigBB); return NewBB; } -/// GetUserOffset - Compute the offset of U.MI as seen by the hardware +/// getUserOffset - Compute the offset of U.MI as seen by the hardware /// displacement computation. Update U.KnownAlignment to match its current /// basic block location. -unsigned ARMConstantIslands::GetUserOffset(CPUser &U) const { - unsigned UserOffset = GetOffsetOf(U.MI); +unsigned ARMConstantIslands::getUserOffset(CPUser &U) const { + unsigned UserOffset = getOffsetOf(U.MI); const BasicBlockInfo &BBI = BBInfo[U.MI->getParent()->getNumber()]; unsigned KnownBits = BBI.internalKnownBits(); @@ -960,13 +975,13 @@ unsigned ARMConstantIslands::GetUserOffset(CPUser &U) const { return UserOffset; } -/// OffsetIsInRange - Checks whether UserOffset (the location of a constant pool +/// isOffsetInRange - Checks whether UserOffset (the location of a constant pool /// reference) is within MaxDisp of TrialOffset (a proposed location of a /// constant pool entry). -/// UserOffset is computed by GetUserOffset above to include PC adjustments. If +/// UserOffset is computed by getUserOffset above to include PC adjustments. If /// the mod 4 alignment of UserOffset is not known, the uncertainty must be /// subtracted from MaxDisp instead. CPUser::getMaxDisp() does that. -bool ARMConstantIslands::OffsetIsInRange(unsigned UserOffset, +bool ARMConstantIslands::isOffsetInRange(unsigned UserOffset, unsigned TrialOffset, unsigned MaxDisp, bool NegativeOK, bool IsSoImm) { if (UserOffset <= TrialOffset) { @@ -982,11 +997,11 @@ bool ARMConstantIslands::OffsetIsInRange(unsigned UserOffset, return false; } -/// WaterIsInRange - Returns true if a CPE placed after the specified +/// isWaterInRange - Returns true if a CPE placed after the specified /// Water (a basic block) will be in range for the specific MI. /// /// Compute how much the function will grow by inserting a CPE after Water. -bool ARMConstantIslands::WaterIsInRange(unsigned UserOffset, +bool ARMConstantIslands::isWaterInRange(unsigned UserOffset, MachineBasicBlock* Water, CPUser &U, unsigned &Growth) { unsigned CPELogAlign = getCPELogAlign(U.CPEMI); @@ -1013,7 +1028,7 @@ bool ARMConstantIslands::WaterIsInRange(unsigned UserOffset, Growth += OffsetToAlignment(CPEEnd, 1u << NextBlockAlignment); // If the CPE is to be inserted before the instruction, that will raise - // the offset of the instruction. Also account for unknown alignment padding + // the offset of the instruction. Also account for unknown alignment padding // in blocks between CPE and the user. 
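// Illustrative arithmetic for the adjustment below (offsets made up): if the island would start at CPEOffset 0x100 and the user sits after it at UserOffset 0x180, inserting the island pushes the user forward by Growth plus any padding whose size is not known statically, so the displacement that must fit is (0x180 + Growth + padding) - 0x100, not simply 0x80.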
if (CPEOffset < UserOffset) UserOffset += Growth + UnknownPadding(MF->getAlignment(), CPELogAlign); @@ -1021,15 +1036,15 @@ bool ARMConstantIslands::WaterIsInRange(unsigned UserOffset, // CPE fits in existing padding. Growth = 0; - return OffsetIsInRange(UserOffset, CPEOffset, U); + return isOffsetInRange(UserOffset, CPEOffset, U); } -/// CPEIsInRange - Returns true if the distance between specific MI and +/// isCPEntryInRange - Returns true if the distance between specific MI and /// specific ConstPool entry instruction can fit in MI's displacement field. -bool ARMConstantIslands::CPEIsInRange(MachineInstr *MI, unsigned UserOffset, +bool ARMConstantIslands::isCPEntryInRange(MachineInstr *MI, unsigned UserOffset, MachineInstr *CPEMI, unsigned MaxDisp, bool NegOk, bool DoDump) { - unsigned CPEOffset = GetOffsetOf(CPEMI); + unsigned CPEOffset = getOffsetOf(CPEMI); assert(CPEOffset % 4 == 0 && "Misaligned CPE"); if (DoDump) { @@ -1046,7 +1061,7 @@ bool ARMConstantIslands::CPEIsInRange(MachineInstr *MI, unsigned UserOffset, }); } - return OffsetIsInRange(UserOffset, CPEOffset, MaxDisp, NegOk); + return isOffsetInRange(UserOffset, CPEOffset, MaxDisp, NegOk); } #ifndef NDEBUG @@ -1066,7 +1081,7 @@ static bool BBIsJumpedOver(MachineBasicBlock *MBB) { } #endif // NDEBUG -void ARMConstantIslands::AdjustBBOffsetsAfter(MachineBasicBlock *BB) { +void ARMConstantIslands::adjustBBOffsetsAfter(MachineBasicBlock *BB) { unsigned BBNum = BB->getNumber(); for(unsigned i = BBNum + 1, e = MF->getNumBlockIDs(); i < e; ++i) { // Get the offset and known bits at the end of the layout predecessor. @@ -1088,17 +1103,18 @@ void ARMConstantIslands::AdjustBBOffsetsAfter(MachineBasicBlock *BB) { } } -/// DecrementOldEntry - find the constant pool entry with index CPI +/// decrementCPEReferenceCount - find the constant pool entry with index CPI /// and instruction CPEMI, and decrement its refcount. If the refcount /// becomes 0 remove the entry and instruction. Returns true if we removed /// the entry, false if we didn't. -bool ARMConstantIslands::DecrementOldEntry(unsigned CPI, MachineInstr *CPEMI) { +bool ARMConstantIslands::decrementCPEReferenceCount(unsigned CPI, + MachineInstr *CPEMI) { // Find the old entry. Eliminate it if it is no longer used. CPEntry *CPE = findConstPoolEntry(CPI, CPEMI); assert(CPE && "Unexpected!"); if (--CPE->RefCount == 0) { - RemoveDeadCPEMI(CPEMI); + removeDeadCPEMI(CPEMI); CPE->CPEMI = NULL; --NumCPEs; return true; @@ -1112,13 +1128,14 @@ bool ARMConstantIslands::DecrementOldEntry(unsigned CPI, MachineInstr *CPEMI) { /// 0 = no existing entry found /// 1 = entry found, and there were no code insertions or deletions /// 2 = entry found, and there were code insertions or deletions -int ARMConstantIslands::LookForExistingCPEntry(CPUser& U, unsigned UserOffset) +int ARMConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset) { MachineInstr *UserMI = U.MI; MachineInstr *CPEMI = U.CPEMI; // Check to see if the CPE is already in-range. 
- if (CPEIsInRange(UserMI, UserOffset, CPEMI, U.getMaxDisp(), U.NegOk, true)) { + if (isCPEntryInRange(UserMI, UserOffset, CPEMI, U.getMaxDisp(), U.NegOk, + true)) { DEBUG(dbgs() << "In range\n"); return 1; } @@ -1133,7 +1150,7 @@ int ARMConstantIslands::LookForExistingCPEntry(CPUser& U, unsigned UserOffset) // Removing CPEs can leave empty entries, skip if (CPEs[i].CPEMI == NULL) continue; - if (CPEIsInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.getMaxDisp(), + if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.getMaxDisp(), U.NegOk)) { DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#" << CPEs[i].CPI << "\n"); @@ -1149,7 +1166,7 @@ int ARMConstantIslands::LookForExistingCPEntry(CPUser& U, unsigned UserOffset) CPEs[i].RefCount++; // ...and the original. If we didn't remove the old entry, none of the // addresses changed, so we don't need another pass. - return DecrementOldEntry(CPI, CPEMI) ? 2 : 1; + return decrementCPEReferenceCount(CPI, CPEMI) ? 2 : 1; } } return 0; @@ -1170,7 +1187,7 @@ static inline unsigned getUnconditionalBrDisp(int Opc) { return ((1<<23)-1)*4; } -/// LookForWater - Look for an existing entry in the WaterList in which +/// findAvailableWater - Look for an existing entry in the WaterList in which /// we can place the CPE referenced from U so it's within range of U's MI. /// Returns true if found, false if not. If it returns true, WaterIter /// is set to the WaterList entry. For Thumb, prefer water that will not @@ -1178,7 +1195,7 @@ static inline unsigned getUnconditionalBrDisp(int Opc) { /// terminates, the CPE location for a particular CPUser is only allowed to /// move to a lower address, so search backward from the end of the list and /// prefer the first water that is in range. -bool ARMConstantIslands::LookForWater(CPUser &U, unsigned UserOffset, +bool ARMConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset, water_iterator &WaterIter) { if (WaterList.empty()) return false; @@ -1196,7 +1213,7 @@ bool ARMConstantIslands::LookForWater(CPUser &U, unsigned UserOffset, // sure to take advantage of it for all the CPEs near that block, so that // we don't insert more branches than necessary. unsigned Growth; - if (WaterIsInRange(UserOffset, WaterBB, U, Growth) && + if (isWaterInRange(UserOffset, WaterBB, U, Growth) && (WaterBB->getNumber() < U.HighWaterMark->getNumber() || NewWaterList.count(WaterBB)) && Growth < BestGrowth) { // This is the least amount of required padding seen so far. @@ -1215,14 +1232,14 @@ bool ARMConstantIslands::LookForWater(CPUser &U, unsigned UserOffset, return BestGrowth != ~0u; } -/// CreateNewWater - No existing WaterList entry will work for +/// createNewWater - No existing WaterList entry will work for /// CPUsers[CPUserIndex], so create a place to put the CPE. The end of the /// block is used if in range, and the conditional branch munged so control /// flow is correct. Otherwise the block is split to create a hole with an /// unconditional branch around it. In either case NewMBB is set to a /// block following which the new island can be inserted (the WaterList /// is not adjusted). 
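// Shape produced by the fall-back split path below, sketched with made-up labels (the pass manipulates MBBs, not asm text):
//
//   UserMBB:  ... ldr r0, [pc, #imm] ...
//             b   NewMBB         ; unconditional branch over the hole
//   island:   .word <constants>  ; CPE placed in the hole
//   NewMBB:   ... remainder of the original block ...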
-void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex, +void ARMConstantIslands::createNewWater(unsigned CPUserIndex, unsigned UserOffset, MachineBasicBlock *&NewMBB) { CPUser &U = CPUsers[CPUserIndex]; @@ -1245,7 +1262,7 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex, unsigned CPEOffset = WorstCaseAlign(UserBlockEnd, CPELogAlign, UserBBI.postKnownBits()); - if (OffsetIsInRange(UserOffset, CPEOffset, U)) { + if (isOffsetInRange(UserOffset, CPEOffset, U)) { DEBUG(dbgs() << "Split at end of BB#" << UserMBB->getNumber() << format(", expected CPE offset %#x\n", CPEOffset)); NewMBB = llvm::next(MachineFunction::iterator(UserMBB)); @@ -1264,7 +1281,7 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex, ImmBranches.push_back(ImmBranch(&UserMBB->back(), MaxDisp, false, UncondBr)); BBInfo[UserMBB->getNumber()].Size += Delta; - AdjustBBOffsetsAfter(UserMBB); + adjustBBOffsetsAfter(UserMBB); return; } } @@ -1298,7 +1315,7 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex, // The 4 in the following is for the unconditional branch we'll be inserting // (allows for long branch on Thumb1). Alignment of the island is handled - // inside OffsetIsInRange. + // inside isOffsetInRange. BaseInsertOffset -= 4; DEBUG(dbgs() << format(", adjusted to %#x", BaseInsertOffset) @@ -1327,7 +1344,7 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex, MI = llvm::next(MI)) { if (CPUIndex < NumCPUsers && CPUsers[CPUIndex].MI == MI) { CPUser &U = CPUsers[CPUIndex]; - if (!OffsetIsInRange(Offset, EndInsertOffset, U)) { + if (!isOffsetInRange(Offset, EndInsertOffset, U)) { // Shift insertion point by one unit of alignment so it is within reach. BaseInsertOffset -= 1u << LogAlign; EndInsertOffset -= 1u << LogAlign; @@ -1352,29 +1369,29 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex, // Avoid splitting an IT block. if (LastIT) { unsigned PredReg = 0; - ARMCC::CondCodes CC = llvm::getITInstrPredicate(MI, PredReg); + ARMCC::CondCodes CC = getITInstrPredicate(MI, PredReg); if (CC != ARMCC::AL) MI = LastIT; } - NewMBB = SplitBlockBeforeInstr(MI); + NewMBB = splitBlockBeforeInstr(MI); } -/// HandleConstantPoolUser - Analyze the specified user, checking to see if it +/// handleConstantPoolUser - Analyze the specified user, checking to see if it /// is out-of-range. If so, pick up the constant pool value and move it some /// place in-range. Return true if we changed any addresses (thus must run /// another pass of branch lengthening), false otherwise. -bool ARMConstantIslands::HandleConstantPoolUser(unsigned CPUserIndex) { +bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) { CPUser &U = CPUsers[CPUserIndex]; MachineInstr *UserMI = U.MI; MachineInstr *CPEMI = U.CPEMI; unsigned CPI = CPEMI->getOperand(1).getIndex(); unsigned Size = CPEMI->getOperand(2).getImm(); // Compute this only once, it's expensive. - unsigned UserOffset = GetUserOffset(U); + unsigned UserOffset = getUserOffset(U); // See if the current entry is within range, or there is a clone of it // in range.
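// Reminder of findInRangeCPEntry's contract, documented at its definition above: 0 = no usable entry, so a new island must be placed below; 1 = the user is already served in range with no code motion; 2 = an in-range clone was adopted and the old entry was deleted, so offsets shifted and the caller must schedule another iteration.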
- int result = LookForExistingCPEntry(U, UserOffset); + int result = findInRangeCPEntry(U, UserOffset); if (result==1) return false; else if (result==2) return true; @@ -1386,7 +1403,7 @@ bool ARMConstantIslands::HandleConstantPoolUser(unsigned CPUserIndex) { MachineBasicBlock *NewIsland = MF->CreateMachineBasicBlock(); MachineBasicBlock *NewMBB; water_iterator IP; - if (LookForWater(U, UserOffset, IP)) { + if (findAvailableWater(U, UserOffset, IP)) { DEBUG(dbgs() << "Found water in range\n"); MachineBasicBlock *WaterBB = *IP; @@ -1403,9 +1420,9 @@ bool ARMConstantIslands::HandleConstantPoolUser(unsigned CPUserIndex) { } else { // No water found. DEBUG(dbgs() << "No water found\n"); - CreateNewWater(CPUserIndex, UserOffset, NewMBB); + createNewWater(CPUserIndex, UserOffset, NewMBB); - // SplitBlockBeforeInstr adds to WaterList, which is important when it is + // splitBlockBeforeInstr adds to WaterList, which is important when it is // called while handling branches so that the water will be seen on the // next iteration for constant pools, but in this context, we don't want // it. Check for this so it will be removed from the WaterList. @@ -1430,10 +1447,10 @@ bool ARMConstantIslands::HandleConstantPoolUser(unsigned CPUserIndex) { MF->insert(NewMBB, NewIsland); // Update internal data structures to account for the newly inserted MBB. - UpdateForInsertedWaterBlock(NewIsland); + updateForInsertedWaterBlock(NewIsland); // Decrement the old entry, and remove it if refcount becomes 0. - DecrementOldEntry(CPI, CPEMI); + decrementCPEReferenceCount(CPI, CPEMI); // Now that we have an island to add the CPE to, clone the original CPE and // add it to the island. @@ -1448,7 +1465,7 @@ bool ARMConstantIslands::HandleConstantPoolUser(unsigned CPUserIndex) { // Increase the size of the island block to account for the new entry. BBInfo[NewIsland->getNumber()].Size += Size; - AdjustBBOffsetsAfter(llvm::prior(MachineFunction::iterator(NewIsland))); + adjustBBOffsetsAfter(llvm::prior(MachineFunction::iterator(NewIsland))); // Finally, change the CPI in the instruction operand to be ID. for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i) @@ -1463,9 +1480,9 @@ bool ARMConstantIslands::HandleConstantPoolUser(unsigned CPUserIndex) { return true; } -/// RemoveDeadCPEMI - Remove a dead constant pool entry instruction. Update +/// removeDeadCPEMI - Remove a dead constant pool entry instruction. Update /// sizes and offsets of impacted basic blocks. -void ARMConstantIslands::RemoveDeadCPEMI(MachineInstr *CPEMI) { +void ARMConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) { MachineBasicBlock *CPEBB = CPEMI->getParent(); unsigned Size = CPEMI->getOperand(2).getImm(); CPEMI->eraseFromParent(); @@ -1480,7 +1497,7 @@ void ARMConstantIslands::RemoveDeadCPEMI(MachineInstr *CPEMI) { // Entries are sorted by descending alignment, so realign from the front. CPEBB->setAlignment(getCPELogAlign(CPEBB->begin())); - AdjustBBOffsetsAfter(CPEBB); + adjustBBOffsetsAfter(CPEBB); // An island has only one predecessor BB and one successor BB. Check if // this BB's predecessor jumps directly to this BB's successor. This // shouldn't happen currently. @@ -1488,15 +1505,15 @@ void ARMConstantIslands::RemoveDeadCPEMI(MachineInstr *CPEMI) { // FIXME: remove the empty blocks after all the work is done? } -/// RemoveUnusedCPEntries - Remove constant pool entries whose refcounts +/// removeUnusedCPEntries - Remove constant pool entries whose refcounts /// are zero. 
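// The refcount lifecycle, roughly (pieced together from the functions in this file): each entry carries one reference per user, findInRangeCPEntry bumps a clone's count when it redirects a user to it, and decrementCPEReferenceCount releases the original; any count that reaches zero ends up in removeDeadCPEMI, either immediately or in the sweep below.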
-bool ARMConstantIslands::RemoveUnusedCPEntries() { +bool ARMConstantIslands::removeUnusedCPEntries() { unsigned MadeChange = false; for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) { std::vector<CPEntry> &CPEs = CPEntries[i]; for (unsigned j = 0, ee = CPEs.size(); j != ee; ++j) { if (CPEs[j].RefCount == 0 && CPEs[j].CPEMI) { - RemoveDeadCPEMI(CPEs[j].CPEMI); + removeDeadCPEMI(CPEs[j].CPEMI); CPEs[j].CPEMI = NULL; MadeChange = true; } @@ -1505,18 +1522,18 @@ bool ARMConstantIslands::RemoveUnusedCPEntries() { return MadeChange; } -/// BBIsInRange - Returns true if the distance between specific MI and +/// isBBInRange - Returns true if the distance between specific MI and /// specific BB can fit in MI's displacement field. -bool ARMConstantIslands::BBIsInRange(MachineInstr *MI,MachineBasicBlock *DestBB, +bool ARMConstantIslands::isBBInRange(MachineInstr *MI,MachineBasicBlock *DestBB, unsigned MaxDisp) { unsigned PCAdj = isThumb ? 4 : 8; - unsigned BrOffset = GetOffsetOf(MI) + PCAdj; + unsigned BrOffset = getOffsetOf(MI) + PCAdj; unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset; DEBUG(dbgs() << "Branch of destination BB#" << DestBB->getNumber() << " from BB#" << MI->getParent()->getNumber() << " max delta=" << MaxDisp - << " from " << GetOffsetOf(MI) << " to " << DestOffset + << " from " << getOffsetOf(MI) << " to " << DestOffset << " offset " << int(DestOffset-BrOffset) << "\t" << *MI); if (BrOffset <= DestOffset) { @@ -1530,37 +1547,37 @@ bool ARMConstantIslands::BBIsInRange(MachineInstr *MI,MachineBasicBlock *DestBB, return false; } -/// FixUpImmediateBr - Fix up an immediate branch whose destination is too far +/// fixupImmediateBr - Fix up an immediate branch whose destination is too far /// away to fit in its displacement field. -bool ARMConstantIslands::FixUpImmediateBr(ImmBranch &Br) { +bool ARMConstantIslands::fixupImmediateBr(ImmBranch &Br) { MachineInstr *MI = Br.MI; MachineBasicBlock *DestBB = MI->getOperand(0).getMBB(); // Check to see if the DestBB is already in-range. - if (BBIsInRange(MI, DestBB, Br.MaxDisp)) + if (isBBInRange(MI, DestBB, Br.MaxDisp)) return false; if (!Br.isCond) - return FixUpUnconditionalBr(Br); - return FixUpConditionalBr(Br); + return fixupUnconditionalBr(Br); + return fixupConditionalBr(Br); } -/// FixUpUnconditionalBr - Fix up an unconditional branch whose destination is +/// fixupUnconditionalBr - Fix up an unconditional branch whose destination is /// too far away to fit in its displacement field. If the LR register has been /// spilled in the epilogue, then we can use BL to implement a far jump. /// Otherwise, add an intermediate branch instruction to a branch. bool -ARMConstantIslands::FixUpUnconditionalBr(ImmBranch &Br) { +ARMConstantIslands::fixupUnconditionalBr(ImmBranch &Br) { MachineInstr *MI = Br.MI; MachineBasicBlock *MBB = MI->getParent(); if (!isThumb1) - llvm_unreachable("FixUpUnconditionalBr is Thumb1 only!"); + llvm_unreachable("fixupUnconditionalBr is Thumb1 only!"); // Use BL to implement far jump. Br.MaxDisp = (1 << 21) * 2; MI->setDesc(TII->get(ARM::tBfar)); BBInfo[MBB->getNumber()].Size += 2; - AdjustBBOffsetsAfter(MBB); + adjustBBOffsetsAfter(MBB); HasFarJump = true; ++NumUBrFixed; @@ -1569,11 +1586,11 @@ ARMConstantIslands::FixUpUnconditionalBr(ImmBranch &Br) { return true; } -/// FixUpConditionalBr - Fix up a conditional branch whose destination is too +/// fixupConditionalBr - Fix up a conditional branch whose destination is too /// far away to fit in its displacement field. 
It is converted to an inverse /// conditional branch + an unconditional branch to the destination. bool -ARMConstantIslands::FixUpConditionalBr(ImmBranch &Br) { +ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) { MachineInstr *MI = Br.MI; MachineBasicBlock *DestBB = MI->getOperand(0).getMBB(); @@ -1607,7 +1624,7 @@ ARMConstantIslands::FixUpConditionalBr(ImmBranch &Br) { // bne L2 // b L1 MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB(); - if (BBIsInRange(MI, NewDest, Br.MaxDisp)) { + if (isBBInRange(MI, NewDest, Br.MaxDisp)) { DEBUG(dbgs() << " Invert Bcc condition and swap its destination with " << *BMI); BMI->getOperand(0).setMBB(DestBB); @@ -1619,7 +1636,7 @@ } if (NeedSplit) { - SplitBlockBeforeInstr(MI); + splitBlockBeforeInstr(MI); // No need for the branch to the next block. We're adding an unconditional // branch to the destination. int delta = TII->GetInstSizeInBytes(&MBB->back()); @@ -1651,14 +1668,14 @@ ARMConstantIslands::FixUpConditionalBr(ImmBranch &Br) { // Remove the old conditional branch. It may or may not still be in MBB. BBInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(MI); MI->eraseFromParent(); - AdjustBBOffsetsAfter(MBB); + adjustBBOffsetsAfter(MBB); return true; } -/// UndoLRSpillRestore - Remove Thumb push / pop instructions that only spill +/// undoLRSpillRestore - Remove Thumb push / pop instructions that only spill /// LR / restore LR to pc. FIXME: This is done here because it's only possible /// to do this if tBfar is not used. -bool ARMConstantIslands::UndoLRSpillRestore() { +bool ARMConstantIslands::undoLRSpillRestore() { bool MadeChange = false; for (unsigned i = 0, e = PushPopMIs.size(); i != e; ++i) { MachineInstr *MI = PushPopMIs[i]; @@ -1677,26 +1694,26 @@ bool ARMConstantIslands::UndoLRSpillRestore() { return MadeChange; } -// mayOptimizeThumb2Instruction - Returns true if OptimizeThumb2Instructions +// mayOptimizeThumb2Instruction - Returns true if optimizeThumb2Instructions // below may shrink MI. bool ARMConstantIslands::mayOptimizeThumb2Instruction(const MachineInstr *MI) const { switch(MI->getOpcode()) { - // OptimizeThumb2Instructions. + // optimizeThumb2Instructions. case ARM::t2LEApcrel: case ARM::t2LDRpci: - // OptimizeThumb2Branches. + // optimizeThumb2Branches. case ARM::t2B: case ARM::t2Bcc: case ARM::tBcc: - // OptimizeThumb2JumpTables. + // optimizeThumb2JumpTables. case ARM::t2BR_JT: return true; } return false; } -bool ARMConstantIslands::OptimizeThumb2Instructions() { +bool ARMConstantIslands::optimizeThumb2Instructions() { bool MadeChange = false; // Shrink ADR and LDR from constantpool. @@ -1727,7 +1744,7 @@ bool ARMConstantIslands::OptimizeThumb2Instructions() { if (!NewOpc) continue; - unsigned UserOffset = GetUserOffset(U); + unsigned UserOffset = getUserOffset(U); unsigned MaxOffs = ((1 << Bits) - 1) * Scale; // Be conservative with inline asm. @@ -1735,22 +1752,23 @@ bool ARMConstantIslands::OptimizeThumb2Instructions() { MaxOffs -= 2; // FIXME: Check if offset is multiple of scale if scale is not 4.
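// Concrete instance of the MaxOffs formula above (Bits/Scale values assumed from the Thumb1 encodings, not shown in this hunk): shrinking t2LDRpci to tLDRpci uses Bits = 8 and Scale = 4, so MaxOffs = ((1 << 8) - 1) * 4 = 1020 bytes, reduced to 1018 when inline asm makes the alignment uncertain.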
- if (CPEIsInRange(U.MI, UserOffset, U.CPEMI, MaxOffs, false, true)) { + if (isCPEntryInRange(U.MI, UserOffset, U.CPEMI, MaxOffs, false, true)) { + DEBUG(dbgs() << "Shrink: " << *U.MI); U.MI->setDesc(TII->get(NewOpc)); MachineBasicBlock *MBB = U.MI->getParent(); BBInfo[MBB->getNumber()].Size -= 2; - AdjustBBOffsetsAfter(MBB); + adjustBBOffsetsAfter(MBB); ++NumT2CPShrunk; MadeChange = true; } } - MadeChange |= OptimizeThumb2Branches(); - MadeChange |= OptimizeThumb2JumpTables(); + MadeChange |= optimizeThumb2Branches(); + MadeChange |= optimizeThumb2JumpTables(); return MadeChange; } -bool ARMConstantIslands::OptimizeThumb2Branches() { +bool ARMConstantIslands::optimizeThumb2Branches() { bool MadeChange = false; for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i) { @@ -1776,11 +1794,12 @@ bool ARMConstantIslands::OptimizeThumb2Branches() { if (NewOpc) { unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale; MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB(); - if (BBIsInRange(Br.MI, DestBB, MaxOffs)) { + if (isBBInRange(Br.MI, DestBB, MaxOffs)) { + DEBUG(dbgs() << "Shrink branch: " << *Br.MI); Br.MI->setDesc(TII->get(NewOpc)); MachineBasicBlock *MBB = Br.MI->getParent(); BBInfo[MBB->getNumber()].Size -= 2; - AdjustBBOffsetsAfter(MBB); + adjustBBOffsetsAfter(MBB); ++NumT2BrShrunk; MadeChange = true; } @@ -1797,7 +1816,7 @@ bool ARMConstantIslands::OptimizeThumb2Branches() { NewOpc = 0; unsigned PredReg = 0; - ARMCC::CondCodes Pred = llvm::getInstrPredicate(Br.MI, PredReg); + ARMCC::CondCodes Pred = getInstrPredicate(Br.MI, PredReg); if (Pred == ARMCC::EQ) NewOpc = ARM::tCBZ; else if (Pred == ARMCC::NE) @@ -1807,7 +1826,7 @@ bool ARMConstantIslands::OptimizeThumb2Branches() { MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB(); // Check if the distance is within 126. Subtract starting offset by 2 // because the cmp will be eliminated. - unsigned BrOffset = GetOffsetOf(Br.MI) + 4 - 2; + unsigned BrOffset = getOffsetOf(Br.MI) + 4 - 2; unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset; if (BrOffset < DestOffset && (DestOffset - BrOffset) <= 126) { MachineBasicBlock::iterator CmpMI = Br.MI; @@ -1815,11 +1834,12 @@ bool ARMConstantIslands::OptimizeThumb2Branches() { --CmpMI; if (CmpMI->getOpcode() == ARM::tCMPi8) { unsigned Reg = CmpMI->getOperand(0).getReg(); - Pred = llvm::getInstrPredicate(CmpMI, PredReg); + Pred = getInstrPredicate(CmpMI, PredReg); if (Pred == ARMCC::AL && CmpMI->getOperand(1).getImm() == 0 && isARMLowRegister(Reg)) { MachineBasicBlock *MBB = Br.MI->getParent(); + DEBUG(dbgs() << "Fold: " << *CmpMI << " and: " << *Br.MI); MachineInstr *NewBR = BuildMI(*MBB, CmpMI, Br.MI->getDebugLoc(), TII->get(NewOpc)) .addReg(Reg).addMBB(DestBB,Br.MI->getOperand(0).getTargetFlags()); @@ -1827,7 +1847,7 @@ bool ARMConstantIslands::OptimizeThumb2Branches() { Br.MI->eraseFromParent(); Br.MI = NewBR; BBInfo[MBB->getNumber()].Size -= 2; - AdjustBBOffsetsAfter(MBB); + adjustBBOffsetsAfter(MBB); ++NumCBZ; MadeChange = true; } @@ -1839,9 +1859,9 @@ bool ARMConstantIslands::OptimizeThumb2Branches() { return MadeChange; } -/// OptimizeThumb2JumpTables - Use tbb / tbh instructions to generate smaller +/// optimizeThumb2JumpTables - Use tbb / tbh instructions to generate smaller /// jumptables when it's possible. 
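// What tbb / tbh buy (encoding details assumed from the ARM ARM, not from this hunk): both load an entry from a table of byte (tbb) or halfword (tbh) offsets and branch to pc + 2*entry, so a 32-bit jump-table slot can shrink to one or two bytes, but only when every target lies forward of the table and within 2*255 or 2*65535 bytes; the ByteOk and HalfWordOk flags below track exactly that.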
-bool ARMConstantIslands::OptimizeThumb2JumpTables() { +bool ARMConstantIslands::optimizeThumb2JumpTables() { bool MadeChange = false; // FIXME: After the tables are shrunk, can we get rid some of the @@ -1861,7 +1881,7 @@ bool ARMConstantIslands::OptimizeThumb2JumpTables() { bool ByteOk = true; bool HalfWordOk = true; - unsigned JTOffset = GetOffsetOf(MI) + 4; + unsigned JTOffset = getOffsetOf(MI) + 4; const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs; for (unsigned j = 0, ee = JTBBs.size(); j != ee; ++j) { MachineBasicBlock *MBB = JTBBs[j]; @@ -1936,11 +1956,14 @@ bool ARMConstantIslands::OptimizeThumb2JumpTables() { if (!OptOk) continue; + DEBUG(dbgs() << "Shrink JT: " << *MI << " addr: " << *AddrMI + << " lea: " << *LeaMI); unsigned Opc = ByteOk ? ARM::t2TBB_JT : ARM::t2TBH_JT; MachineInstr *NewJTMI = BuildMI(MBB, MI->getDebugLoc(), TII->get(Opc)) .addReg(IdxReg, getKillRegState(IdxRegKill)) .addJumpTableIndex(JTI, JTOP.getTargetFlags()) .addImm(MI->getOperand(JTOpIdx+1).getImm()); + DEBUG(dbgs() << "BB#" << MBB->getNumber() << ": " << *NewJTMI); // FIXME: Insert an "ALIGN" instruction to ensure the next instruction // is 2-byte aligned. For now, asm printer will fix it up. unsigned NewSize = TII->GetInstSizeInBytes(NewJTMI); @@ -1954,7 +1977,7 @@ bool ARMConstantIslands::OptimizeThumb2JumpTables() { int delta = OrigSize - NewSize; BBInfo[MBB->getNumber()].Size -= delta; - AdjustBBOffsetsAfter(MBB); + adjustBBOffsetsAfter(MBB); ++NumTBs; MadeChange = true; @@ -1964,9 +1987,9 @@ bool ARMConstantIslands::OptimizeThumb2JumpTables() { return MadeChange; } -/// ReorderThumb2JumpTables - Adjust the function's block layout to ensure that +/// reorderThumb2JumpTables - Adjust the function's block layout to ensure that /// jump tables always branch forwards, since that's what tbb and tbh need. -bool ARMConstantIslands::ReorderThumb2JumpTables() { +bool ARMConstantIslands::reorderThumb2JumpTables() { bool MadeChange = false; MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); @@ -1995,7 +2018,7 @@ bool ARMConstantIslands::ReorderThumb2JumpTables() { // The destination precedes the switch. Try to move the block forward // so we have a positive offset. MachineBasicBlock *NewBB = - AdjustJTTargetBlockForward(MBB, MI->getParent()); + adjustJTTargetBlockForward(MBB, MI->getParent()); if (NewBB) MJTI->ReplaceMBBInJumpTable(JTI, JTBBs[j], NewBB); MadeChange = true; @@ -2007,8 +2030,7 @@ bool ARMConstantIslands::ReorderThumb2JumpTables() { } MachineBasicBlock *ARMConstantIslands:: -AdjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) -{ +adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) { // If the destination block is terminated by an unconditional branch, // try to move it; otherwise, create a new block following the jump // table that branches back to the actual target. 
This is a very simple diff --git a/lib/Target/ARM/ARMELFWriterInfo.h b/lib/Target/ARM/ARMELFWriterInfo.h index 1c4e532..6a84f8a 100644 --- a/lib/Target/ARM/ARMELFWriterInfo.h +++ b/lib/Target/ARM/ARMELFWriterInfo.h @@ -17,6 +17,7 @@ #include "llvm/Target/TargetELFWriterInfo.h" namespace llvm { + class TargetMachine; class ARMELFWriterInfo : public TargetELFWriterInfo { public: diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index c2b7816..5fc0360 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -19,7 +19,6 @@ #include "ARMBaseInstrInfo.h" #include "ARMBaseRegisterInfo.h" #include "ARMMachineFunctionInfo.h" -#include "ARMRegisterInfo.h" #include "MCTargetDesc/ARMAddressingModes.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -613,7 +612,7 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, MachineInstr &MI = *MBBI; unsigned Opcode = MI.getOpcode(); unsigned PredReg = 0; - ARMCC::CondCodes Pred = llvm::getInstrPredicate(&MI, PredReg); + ARMCC::CondCodes Pred = getInstrPredicate(&MI, PredReg); unsigned DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); bool isCC = Opcode == ARM::MOVCCi32imm || Opcode == ARM::t2MOVCCi32imm; @@ -794,15 +793,15 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, "base pointer without frame pointer?"); if (AFI->isThumb2Function()) { - llvm::emitT2RegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6, - FramePtr, -NumBytes, ARMCC::AL, 0, *TII); + emitT2RegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6, + FramePtr, -NumBytes, ARMCC::AL, 0, *TII); } else if (AFI->isThumbFunction()) { - llvm::emitThumbRegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6, - FramePtr, -NumBytes, *TII, RI); + emitThumbRegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6, + FramePtr, -NumBytes, *TII, RI); } else { - llvm::emitARMRegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6, - FramePtr, -NumBytes, ARMCC::AL, 0, - *TII); + emitARMRegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6, + FramePtr, -NumBytes, ARMCC::AL, 0, + *TII); } // If there's dynamic realignment, adjust for it. if (RI.needsStackRealignment(MF)) { diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index a24eab4..2e1eaca 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -16,7 +16,6 @@ #include "ARM.h" #include "ARMBaseInstrInfo.h" #include "ARMCallingConv.h" -#include "ARMRegisterInfo.h" #include "ARMTargetMachine.h" #include "ARMSubtarget.h" #include "ARMConstantPoolValue.h" @@ -2112,13 +2111,10 @@ bool ARMFastISel::SelectRet(const Instruction *I) { } unsigned ARMFastISel::ARMSelectCallOp(const GlobalValue *GV) { - - // iOS needs the r9 versions of the opcodes. - bool isiOS = Subtarget->isTargetIOS(); if (isThumb2) { - return isiOS ? ARM::tBLr9 : ARM::tBL; + return ARM::tBL; } else { - return isiOS ? ARM::BLr9 : ARM::BL; + return ARM::BL; } } @@ -2177,8 +2173,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes)) return false; - // Issue the call, BLr9 for iOS, BL otherwise. - // TODO: Turn this into the table of arm call ops. + // Issue the call. 
MachineInstrBuilder MIB; unsigned CallOpc = ARMSelectCallOp(NULL); if (isThumb2) @@ -2303,8 +2298,7 @@ bool ARMFastISel::SelectCall(const Instruction *I, if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes)) return false; - // Issue the call, BLr9 for iOS, BL otherwise. - // TODO: Turn this into the table of arm call ops. + // Issue the call. MachineInstrBuilder MIB; unsigned CallOpc = ARMSelectCallOp(GV); // Explicitly adding the predicate here. @@ -2350,7 +2344,8 @@ bool ARMFastISel::ARMIsMemCpySmall(uint64_t Len) { return Len <= 16; } -bool ARMFastISel::ARMTryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len) { +bool ARMFastISel::ARMTryEmitSmallMemCpy(Address Dest, Address Src, + uint64_t Len) { // Make sure we don't bloat code by inlining very large memcpy's. if (!ARMIsMemCpySmall(Len)) return false; @@ -2639,7 +2634,7 @@ bool ARMFastISel::TryToFoldLoad(MachineInstr *MI, unsigned OpNo, } namespace llvm { - llvm::FastISel *ARM::createFastISel(FunctionLoweringInfo &funcInfo) { + FastISel *ARM::createFastISel(FunctionLoweringInfo &funcInfo) { // Completely untested on non-iOS. const TargetMachine &TM = funcInfo.MF->getTarget(); diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index bd4b2a9..402ecb0 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -422,17 +422,16 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, if (AFI->getGPRCalleeSavedArea1Size()) MBBI++; } - if (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNdiND || - RetOpcode == ARM::TCRETURNri || RetOpcode == ARM::TCRETURNriND) { + if (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNri) { // Tail call return: adjust the stack pointer and jump to callee. MBBI = MBB.getLastNonDebugInstr(); MachineOperand &JumpTarget = MBBI->getOperand(0); // Jump to label or value in register. - if (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNdiND) { - unsigned TCOpcode = (RetOpcode == ARM::TCRETURNdi) - ? (STI.isThumb() ? ARM::tTAILJMPd : ARM::TAILJMPd) - : (STI.isThumb() ? ARM::tTAILJMPdND : ARM::TAILJMPdND); + if (RetOpcode == ARM::TCRETURNdi) { + unsigned TCOpcode = STI.isThumb() ? + (STI.isTargetIOS() ? ARM::tTAILJMPd : ARM::tTAILJMPdND) : + ARM::TAILJMPd; MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(TCOpcode)); if (JumpTarget.isGlobal()) MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), @@ -449,10 +448,6 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, BuildMI(MBB, MBBI, dl, TII.get(STI.isThumb() ? ARM::tTAILJMPr : ARM::TAILJMPr)). addReg(JumpTarget.getReg(), RegState::Kill); - } else if (RetOpcode == ARM::TCRETURNriND) { - BuildMI(MBB, MBBI, dl, - TII.get(STI.isThumb() ? ARM::tTAILJMPrND : ARM::TAILJMPrND)). 
- addReg(JumpTarget.getReg(), RegState::Kill); } MachineInstr *NewMI = prior(MBBI); @@ -648,9 +643,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, DebugLoc DL = MI->getDebugLoc(); unsigned RetOpcode = MI->getOpcode(); bool isTailCall = (RetOpcode == ARM::TCRETURNdi || - RetOpcode == ARM::TCRETURNdiND || - RetOpcode == ARM::TCRETURNri || - RetOpcode == ARM::TCRETURNriND); + RetOpcode == ARM::TCRETURNri); SmallVector<unsigned, 4> Regs; unsigned i = CSI.size(); diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index ffb9acb..1eafbbc 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -2825,7 +2825,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { case MVT::v8i8: Opc = ARM::VZIPd8; break; case MVT::v4i16: Opc = ARM::VZIPd16; break; case MVT::v2f32: - case MVT::v2i32: Opc = ARM::VZIPd32; break; + // vzip.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm. + case MVT::v2i32: Opc = ARM::VTRNd32; break; case MVT::v16i8: Opc = ARM::VZIPq8; break; case MVT::v8i16: Opc = ARM::VZIPq16; break; case MVT::v4f32: @@ -2844,7 +2845,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { case MVT::v8i8: Opc = ARM::VUZPd8; break; case MVT::v4i16: Opc = ARM::VUZPd16; break; case MVT::v2f32: - case MVT::v2i32: Opc = ARM::VUZPd32; break; + // vuzp.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm. + case MVT::v2i32: Opc = ARM::VTRNd32; break; case MVT::v16i8: Opc = ARM::VUZPq8; break; case MVT::v8i16: Opc = ARM::VUZPq16; break; case MVT::v4f32: diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index e26dd22..a103c94 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -19,7 +19,6 @@ #include "ARMConstantPoolValue.h" #include "ARMMachineFunctionInfo.h" #include "ARMPerfectShuffle.h" -#include "ARMRegisterInfo.h" #include "ARMSubtarget.h" #include "ARMTargetMachine.h" #include "ARMTargetObjectFile.h" @@ -508,7 +507,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::FRINT, MVT::v2f64, Expand); setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand); setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand); - + setOperationAction(ISD::FSQRT, MVT::v4f32, Expand); setOperationAction(ISD::FSIN, MVT::v4f32, Expand); setOperationAction(ISD::FCOS, MVT::v4f32, Expand); @@ -770,8 +769,10 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::FPOW, MVT::f64, Expand); setOperationAction(ISD::FPOW, MVT::f32, Expand); - setOperationAction(ISD::FMA, MVT::f64, Expand); - setOperationAction(ISD::FMA, MVT::f32, Expand); + if (!Subtarget->hasVFP4()) { + setOperationAction(ISD::FMA, MVT::f64, Expand); + setOperationAction(ISD::FMA, MVT::f32, Expand); + } // Various VFP goodness if (!TM.Options.UseSoftFloat && !Subtarget->isThumb1Only()) { @@ -1642,7 +1643,7 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee, /// and then confiscate the rest of the parameter registers to ensure /// this.
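// A hypothetical AAPCS case for the function below: a 16-byte byval argument whose first register is r2 arrives half in r2-r3 and half on the stack; after the first AllocateReg call shown here, the remaining GPRs are claimed as well (the loop doing so is elided from this hunk), so no unrelated argument can land in the middle of the split aggregate.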
void -llvm::ARMTargetLowering::HandleByVal(CCState *State, unsigned &size) const { +ARMTargetLowering::HandleByVal(CCState *State, unsigned &size) const { unsigned reg = State->AllocateReg(GPRArgRegs, 4); assert((State->getCallOrPrologue() == Prologue || State->getCallOrPrologue() == Call) && @@ -1672,7 +1673,7 @@ llvm::ARMTargetLowering::HandleByVal(CCState *State, unsigned &size) const { static bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, - const ARMInstrInfo *TII) { + const TargetInstrInfo *TII) { unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; int FI = INT_MAX; if (Arg.getOpcode() == ISD::CopyFromReg) { @@ -1807,8 +1808,7 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // the caller's fixed stack objects. MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); - const ARMInstrInfo *TII = - ((ARMTargetMachine&)getTargetMachine()).getInstrInfo(); + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; ++i, ++realArgIdx) { @@ -1936,63 +1936,72 @@ ARMTargetLowering::LowerReturn(SDValue Chain, return result; } -bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N) const { +bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { if (N->getNumValues() != 1) return false; if (!N->hasNUsesOfValue(1, 0)) return false; - unsigned NumCopies = 0; - SDNode* Copies[2] = { 0, 0 }; - SDNode *Use = *N->use_begin(); - if (Use->getOpcode() == ISD::CopyToReg) { - Copies[NumCopies++] = Use; - } else if (Use->getOpcode() == ARMISD::VMOVRRD) { + SDValue TCChain = Chain; + SDNode *Copy = *N->use_begin(); + if (Copy->getOpcode() == ISD::CopyToReg) { + // If the copy has a glue operand, we conservatively assume it isn't safe to + // perform a tail call. + if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) + return false; + TCChain = Copy->getOperand(0); + } else if (Copy->getOpcode() == ARMISD::VMOVRRD) { + SDNode *VMov = Copy; // f64 returned in a pair of GPRs. - for (SDNode::use_iterator UI = Use->use_begin(), UE = Use->use_end(); + SmallPtrSet<SDNode*, 2> Copies; + for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); UI != UE; ++UI) { if (UI->getOpcode() != ISD::CopyToReg) return false; - Copies[UI.getUse().getResNo()] = *UI; - ++NumCopies; + Copies.insert(*UI); } - } else if (Use->getOpcode() == ISD::BITCAST) { + if (Copies.size() > 2) + return false; + + for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); + UI != UE; ++UI) { + SDValue UseChain = UI->getOperand(0); + if (Copies.count(UseChain.getNode())) + // Second CopyToReg + Copy = *UI; + else + // First CopyToReg + TCChain = UseChain; + } + } else if (Copy->getOpcode() == ISD::BITCAST) { // f32 returned in a single GPR. 
- if (!Use->hasNUsesOfValue(1, 0)) + if (!Copy->hasOneUse()) return false; - Use = *Use->use_begin(); - if (Use->getOpcode() != ISD::CopyToReg || !Use->hasNUsesOfValue(1, 0)) + Copy = *Copy->use_begin(); + if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0)) return false; - Copies[NumCopies++] = Use; + Chain = Copy->getOperand(0); } else { return false; } - if (NumCopies != 1 && NumCopies != 2) - return false; - bool HasRet = false; - for (unsigned i = 0; i < NumCopies; ++i) { - SDNode *Copy = Copies[i]; - for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); - UI != UE; ++UI) { - if (UI->getOpcode() == ISD::CopyToReg) { - SDNode *Use = *UI; - if (Use == Copies[0] || ((NumCopies == 2) && (Use == Copies[1]))) - continue; - return false; - } - if (UI->getOpcode() != ARMISD::RET_FLAG) - return false; - HasRet = true; - } + for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); + UI != UE; ++UI) { + if (UI->getOpcode() != ARMISD::RET_FLAG) + return false; + HasRet = true; } - return HasRet; + if (!HasRet) + return false; + + Chain = TCChain; + return true; } bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { - if (!EnableARMTailCalls) + if (!EnableARMTailCalls && !Subtarget->supportsTailCall()) return false; if (!CI->isTailCall()) @@ -3674,27 +3683,6 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { return Result; } -SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, - const ARMSubtarget *ST) const { - if (!ST->useNEONForSinglePrecisionFP() || !ST->hasVFP3() || ST->hasD16()) - return SDValue(); - - ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op); - assert(Op.getValueType() == MVT::f32 && - "ConstantFP custom lowering should only occur for f32."); - - APFloat FPVal = CFP->getValueAPF(); - int ImmVal = ARM_AM::getFP32Imm(FPVal); - if (ImmVal == -1) - return SDValue(); - - DebugLoc DL = Op.getDebugLoc(); - SDValue NewVal = DAG.getTargetConstant(ImmVal, MVT::i32); - SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32, NewVal); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant, - DAG.getConstant(0, MVT::i32)); -} - /// isNEONModifiedImm - Check if the specified splat value corresponds to a /// valid vector constant for a NEON instruction with a "modified immediate" /// operand (e.g., VMOV). If so, return the encoded value. @@ -3831,6 +3819,58 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, return DAG.getTargetConstant(EncodedVal, MVT::i32); } +SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) const { + if (!ST->useNEONForSinglePrecisionFP() || !ST->hasVFP3() || ST->hasD16()) + return SDValue(); + + ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op); + assert(Op.getValueType() == MVT::f32 && + "ConstantFP custom lowering should only occur for f32."); + + // Try splatting with a VMOV.f32... 
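// What a VMOV.f32 splat can encode (range quoted from the ARM ARM's VFP modified-immediate rule, an assumption here rather than something in this hunk): an 8-bit immediate covering values of the form +/-n * 2^-r with 16 <= n <= 31 and 0 <= r <= 7, so 1.0, 0.5 and 31.0 fit while 0.1 does not; getFP32Imm returns -1 for everything else, which is what sends lowering to the VMOV.i32 / VMVN.i32 fallbacks further down.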
+ APFloat FPVal = CFP->getValueAPF(); + int ImmVal = ARM_AM::getFP32Imm(FPVal); + if (ImmVal != -1) { + DebugLoc DL = Op.getDebugLoc(); + SDValue NewVal = DAG.getTargetConstant(ImmVal, MVT::i32); + SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32, + NewVal); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant, + DAG.getConstant(0, MVT::i32)); + } + + // If that fails, try a VMOV.i32 + EVT VMovVT; + unsigned iVal = FPVal.bitcastToAPInt().getZExtValue(); + SDValue NewVal = isNEONModifiedImm(iVal, 0, 32, DAG, VMovVT, false, + VMOVModImm); + if (NewVal != SDValue()) { + DebugLoc DL = Op.getDebugLoc(); + SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT, + NewVal); + SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, + VecConstant); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, + DAG.getConstant(0, MVT::i32)); + } + + // Finally, try a VMVN.i32 + NewVal = isNEONModifiedImm(~iVal & 0xffffffff, 0, 32, DAG, VMovVT, false, + VMVNModImm); + if (NewVal != SDValue()) { + DebugLoc DL = Op.getDebugLoc(); + SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal); + SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, + VecConstant); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, + DAG.getConstant(0, MVT::i32)); + } + + return SDValue(); +} + + static bool isVEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseVEXT, unsigned &Imm) { unsigned NumElts = VT.getVectorNumElements(); @@ -5795,7 +5835,8 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2> > CallSiteNumToLPad; unsigned MaxCSNum = 0; MachineModuleInfo &MMI = MF->getMMI(); - for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; ++BB) { + for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; + ++BB) { if (!BB->isLandingPad()) continue; // FIXME: We should assert that the EH_LABEL is the first MI in the landing @@ -5871,7 +5912,7 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { BuildMI(DispatchBB, dl, TII->get(ARM::tInt_eh_sjlj_dispatchsetup)); else if (!Subtarget->hasVFP2()) BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup_nofp)); - else + else BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup)); unsigned NumLPads = LPadList.size(); @@ -7308,15 +7349,99 @@ static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) { /// ISD::STORE. static SDValue PerformSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { - // Bitcast an i64 store extracted from a vector to f64. - // Otherwise, the i64 value will be legalized to a pair of i32 values. StoreSDNode *St = cast<StoreSDNode>(N); + if (St->isVolatile()) + return SDValue(); + + // Optimize trunc store (of multiple scalars) to shuffle and store. First, + // pack all of the elements in one place. Next, store to memory in fewer + // chunks. 
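// Worked example of the rewrite below: a truncating store of v4i16 as v4i8 has FromEltSz = 16, ToEltSz = 8, hence SizeRatio = 2. The source is bitcast to v8i8, shuffled with mask <0,2,4,6,-1,-1,-1,-1> so the four live bytes land at the bottom of the register, and the result is written with a single legal i32 store instead of four separate byte stores.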
SDValue StVal = St->getValue(); - if (!ISD::isNormalStore(St) || St->isVolatile()) + EVT VT = StVal.getValueType(); + if (St->isTruncatingStore() && VT.isVector()) { + SelectionDAG &DAG = DCI.DAG; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT StVT = St->getMemoryVT(); + unsigned NumElems = VT.getVectorNumElements(); + assert(StVT != VT && "Cannot truncate to the same type"); + unsigned FromEltSz = VT.getVectorElementType().getSizeInBits(); + unsigned ToEltSz = StVT.getVectorElementType().getSizeInBits(); + + // From, To sizes and ElemCount must be pow of two + if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue(); + + // We are going to use the original vector elt for storing. + // Accumulated smaller vector elements must be a multiple of the store size. + if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue(); + + unsigned SizeRatio = FromEltSz / ToEltSz; + assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits()); + + // Create a type on which we perform the shuffle. + EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), + NumElems*SizeRatio); + assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); + + DebugLoc DL = St->getDebugLoc(); + SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); + SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i < NumElems; ++i) ShuffleVec[i] = i * SizeRatio; + + // Can't shuffle using an illegal type. + if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); + + SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec, + DAG.getUNDEF(WideVec.getValueType()), + ShuffleVec.data()); + // At this point all of the data is stored at the bottom of the + // register. We now need to save it to mem. + + // Find the largest store unit + MVT StoreType = MVT::i8; + for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; + tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { + MVT Tp = (MVT::SimpleValueType)tp; + if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz) + StoreType = Tp; + } + // Didn't find a legal store type. + if (!TLI.isTypeLegal(StoreType)) + return SDValue(); + + // Bitcast the original vector into a vector of store-size units + EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), + StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits()); + assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); + SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff); + SmallVector<SDValue, 8> Chains; + SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, + TLI.getPointerTy()); + SDValue BasePtr = St->getBasePtr(); + + // Perform one or more big stores into memory. + unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits(); + for (unsigned I = 0; I < E; I++) { + SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, + StoreType, ShuffWide, + DAG.getIntPtrConstant(I)); + SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr, + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, + Increment); + Chains.push_back(Ch); + } + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &Chains[0], + Chains.size()); + } + + if (!ISD::isNormalStore(St)) return SDValue(); + // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and + // ARM stores of arguments in the same cache line. 
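// Sketch of the split (register names illustrative only): rather than
//   vmov d0, r0, r1 ; vstr d0, [rB]
// the two GPR halves are stored directly,
//   str r0, [rB] ; str r1, [rB, #4]
// so the value never takes the NEON store path at all.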
if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR && - StVal.getNode()->hasOneUse() && !St->isVolatile()) { + StVal.getNode()->hasOneUse()) { SelectionDAG &DAG = DCI.DAG; DebugLoc DL = St->getDebugLoc(); SDValue BasePtr = St->getBasePtr(); @@ -7337,6 +7462,8 @@ static SDValue PerformSTORECombine(SDNode *N, StVal.getNode()->getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); + // Bitcast an i64 store extracted from a vector to f64. + // Otherwise, the i64 value will be legalized to a pair of i32 values. SelectionDAG &DAG = DCI.DAG; DebugLoc dl = StVal.getDebugLoc(); SDValue IntVec = StVal.getOperand(0); @@ -8259,8 +8386,7 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { if (Res.getNode()) { APInt KnownZero, KnownOne; - APInt Mask = APInt::getAllOnesValue(VT.getScalarType().getSizeInBits()); - DAG.ComputeMaskedBits(SDValue(N,0), Mask, KnownZero, KnownOne); + DAG.ComputeMaskedBits(SDValue(N,0), KnownZero, KnownOne); // Capture demanded bits information that would be otherwise lost. if (KnownZero == 0xfffffffe) Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, @@ -8586,10 +8712,12 @@ bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM, /// a register against the immediate without having to materialize the /// immediate into a register. bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const { + // Thumb2 and ARM modes can use cmn for negative immediates. if (!Subtarget->isThumb()) - return ARM_AM::getSOImmVal(Imm) != -1; + return ARM_AM::getSOImmVal(llvm::abs64(Imm)) != -1; if (Subtarget->isThumb2()) - return ARM_AM::getT2SOImmVal(Imm) != -1; + return ARM_AM::getT2SOImmVal(llvm::abs64(Imm)) != -1; + // Thumb1 doesn't have cmn, and only 8-bit immediates. return Imm >= 0 && Imm <= 255; } @@ -8776,22 +8904,20 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, } void ARMTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, - const APInt &Mask, APInt &KnownZero, APInt &KnownOne, const SelectionDAG &DAG, unsigned Depth) const { - KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); + KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); switch (Op.getOpcode()) { default: break; case ARMISD::CMOV: { // Bits are known zero/one if known on the LHS and RHS. 
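The merge performed just below intersects the known bits of the two inputs: a result bit is known only if both sides agree on it. A self-contained sketch of that rule (illustrative struct, not the APInt-based LLVM interface):

    #include <cstdint>

    struct KnownBits32 {
      uint32_t Zero; // bits known to be 0
      uint32_t One;  // bits known to be 1
    };

    // For a select/CMOV, keep only what is known identically on both inputs.
    KnownBits32 mergeSelect(KnownBits32 L, KnownBits32 R) {
      return { L.Zero & R.Zero, L.One & R.One };
    }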
- DAG.ComputeMaskedBits(Op.getOperand(0), Mask, KnownZero, KnownOne, Depth+1); + DAG.ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); if (KnownZero == 0 && KnownOne == 0) return; APInt KnownZeroRHS, KnownOneRHS; - DAG.ComputeMaskedBits(Op.getOperand(1), Mask, - KnownZeroRHS, KnownOneRHS, Depth+1); + DAG.ComputeMaskedBits(Op.getOperand(1), KnownZeroRHS, KnownOneRHS, Depth+1); KnownZero &= KnownZeroRHS; KnownOne &= KnownOneRHS; return; diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index a71b74e..352d980 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -315,7 +315,6 @@ namespace llvm { SelectionDAG &DAG) const; virtual void computeMaskedBitsForTargetNode(const SDValue Op, - const APInt &Mask, APInt &KnownZero, APInt &KnownOne, const SelectionDAG &DAG, @@ -494,7 +493,7 @@ namespace llvm { const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const; - virtual bool isUsedByReturnOnly(SDNode *N) const; + virtual bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const; virtual bool mayBeEmittedAsTailCall(CallInst *CI) const; diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td index 1d38bcf..f04926a 100644 --- a/lib/Target/ARM/ARMInstrFormats.td +++ b/lib/Target/ARM/ARMInstrFormats.td @@ -532,6 +532,7 @@ class AIswp<bit b, dag oops, dag iops, string opc, list<dag> pattern> let Inst{11-4} = 0b00001001; let Inst{3-0} = Rt2; + let Unpredictable{11-8} = 0b1111; let DecoderMethod = "DecodeSwap"; } diff --git a/lib/Target/ARM/ARMInstrInfo.h b/lib/Target/ARM/ARMInstrInfo.h index 72af535..5d3e059 100644 --- a/lib/Target/ARM/ARMInstrInfo.h +++ b/lib/Target/ARM/ARMInstrInfo.h @@ -18,7 +18,6 @@ #include "ARMBaseInstrInfo.h" #include "ARMRegisterInfo.h" #include "ARMSubtarget.h" -#include "llvm/Target/TargetInstrInfo.h" namespace llvm { class ARMSubtarget; diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 8196582..1eb561d 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -181,12 +181,8 @@ def HasVFP3 : Predicate<"Subtarget->hasVFP3()">, AssemblerPredicate<"FeatureVFP3">; def HasVFP4 : Predicate<"Subtarget->hasVFP4()">, AssemblerPredicate<"FeatureVFP4">; -def NoVFP4 : Predicate<"!Subtarget->hasVFP4()">; def HasNEON : Predicate<"Subtarget->hasNEON()">, AssemblerPredicate<"FeatureNEON">; -def HasNEON2 : Predicate<"Subtarget->hasNEON2()">, - AssemblerPredicate<"FeatureNEON2">; -def NoNEON2 : Predicate<"!Subtarget->hasNEON2()">; def HasFP16 : Predicate<"Subtarget->hasFP16()">, AssemblerPredicate<"FeatureFP16">; def HasDivide : Predicate<"Subtarget->hasDivide()">, @@ -221,6 +217,14 @@ def UseMovt : Predicate<"Subtarget->useMovt()">; def DontUseMovt : Predicate<"!Subtarget->useMovt()">; def UseFPVMLx : Predicate<"Subtarget->useFPVMLx()">; +// Prefer fused MAC for fp mul + add over fp VMLA / VMLS if they are available. +// But only select them if more precision in FP computation is allowed. +// Do not use them for Darwin platforms. +def UseFusedMAC : Predicate<"!TM.Options.NoExcessFPPrecision && " + "!Subtarget->isTargetDarwin()">; +def DontUseFusedMAC : Predicate<"!Subtarget->hasVFP4() || " + "Subtarget->isTargetDarwin()">; + //===----------------------------------------------------------------------===// // ARM Flag Definitions. 
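Both the cmn-based compare legality above and the so_imm_neg predicate in the next hunk depend on the ARM modified-immediate encoding: an 8-bit value rotated right by an even amount. A rough standalone model of that test (a sketch, not the real ARM_AM::getSOImmVal):

    #include <cstdint>

    bool isARMSOImm(uint32_t V) {
      for (unsigned Rot = 0; Rot != 32; Rot += 2) {
        // Rotating V left by Rot undoes a rotate-right-by-Rot encoding.
        uint32_t R = Rot ? ((V << Rot) | (V >> (32 - Rot))) : V;
        if (R <= 0xFFu)
          return true;
      }
      return false;
    }

    // A cmp against a negative immediate can then be emitted as cmn when the
    // negation is encodable: isARMSOImm(V) || isARMSOImm(0u - V).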
@@ -251,7 +255,8 @@ def imm16_31 : ImmLeaf<i32, [{ def so_imm_neg_asmoperand : AsmOperandClass { let Name = "ARMSOImmNeg"; } def so_imm_neg : Operand<i32>, PatLeaf<(imm), [{ - return ARM_AM::getSOImmVal(-(uint32_t)N->getZExtValue()) != -1; + int64_t Value = -(int)N->getZExtValue(); + return Value && ARM_AM::getSOImmVal(Value) != -1; }], so_imm_neg_XFORM> { let ParserMatchClass = so_imm_neg_asmoperand; } @@ -736,7 +741,7 @@ def postidx_reg : Operand<i32> { let DecoderMethod = "DecodePostIdxReg"; let PrintMethod = "printPostIdxRegOperand"; let ParserMatchClass = PostIdxRegAsmOperand; - let MIOperandInfo = (ops GPR, i32imm); + let MIOperandInfo = (ops GPRnopc, i32imm); } @@ -903,6 +908,11 @@ def p_imm : Operand<i32> { let DecoderMethod = "DecodeCoprocessor"; } +def pf_imm : Operand<i32> { + let PrintMethod = "printPImmediate"; + let ParserMatchClass = CoprocNumAsmOperand; +} + def CoprocRegAsmOperand : AsmOperandClass { let Name = "CoprocReg"; let ParserMethod = "parseCoprocRegOperand"; @@ -1182,6 +1192,8 @@ multiclass AI1_cmp_irs<bits<4> opcod, string opc, let Inst{19-16} = Rn; let Inst{15-12} = 0b0000; let Inst{11-0} = imm; + + let Unpredictable{15-12} = 0b1111; } def rr : AI1<opcod, (outs), (ins GPR:$Rn, GPR:$Rm), DPFrm, iir, opc, "\t$Rn, $Rm", @@ -1195,6 +1207,8 @@ multiclass AI1_cmp_irs<bits<4> opcod, string opc, let Inst{15-12} = 0b0000; let Inst{11-4} = 0b00000000; let Inst{3-0} = Rm; + + let Unpredictable{15-12} = 0b1111; } def rsi : AI1<opcod, (outs), (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, iis, @@ -1209,11 +1223,13 @@ multiclass AI1_cmp_irs<bits<4> opcod, string opc, let Inst{11-5} = shift{11-5}; let Inst{4} = 0; let Inst{3-0} = shift{3-0}; + + let Unpredictable{15-12} = 0b1111; } def rsr : AI1<opcod, (outs), - (ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, iis, + (ins GPRnopc:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, iis, opc, "\t$Rn, $shift", - [(opnode GPR:$Rn, so_reg_reg:$shift)]> { + [(opnode GPRnopc:$Rn, so_reg_reg:$shift)]> { bits<4> Rn; bits<12> shift; let Inst{25} = 0; @@ -1225,6 +1241,8 @@ multiclass AI1_cmp_irs<bits<4> opcod, string opc, let Inst{6-5} = shift{6-5}; let Inst{4} = 1; let Inst{3-0} = shift{3-0}; + + let Unpredictable{15-12} = 0b1111; } } @@ -1330,10 +1348,10 @@ multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, let Inst{4} = 0; let Inst{3-0} = shift{3-0}; } - def rsr : AsI1<opcod, (outs GPR:$Rd), - (ins GPR:$Rn, so_reg_reg:$shift), + def rsr : AsI1<opcod, (outs GPRnopc:$Rd), + (ins GPRnopc:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift", - [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_reg_reg:$shift, CPSR))]>, + [(set GPRnopc:$Rd, CPSR, (opnode GPRnopc:$Rn, so_reg_reg:$shift, CPSR))]>, Requires<[IsARM]> { bits<4> Rd; bits<4> Rn; @@ -1367,7 +1385,7 @@ multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, cc_out:$s)>, Requires<[IsARM]>; def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"), - (!cast<Instruction>(!strconcat(baseOpc, "rsr")) GPR:$Rdn, GPR:$Rdn, + (!cast<Instruction>(!strconcat(baseOpc, "rsr")) GPRnopc:$Rdn, GPRnopc:$Rdn, so_reg_reg:$shift, pred:$p, cc_out:$s)>, Requires<[IsARM]>; @@ -1907,7 +1925,7 @@ let isCall = 1, def BL : ABXI<0b1011, (outs), (ins bl_target:$func, variable_ops), IIC_Br, "bl\t$func", [(ARMcall tglobaladdr:$func)]>, - Requires<[IsARM, IsNotIOS]> { + Requires<[IsARM]> { let Inst{31-28} = 0b1110; bits<24> func; let Inst{23-0} = func; @@ -1917,7 +1935,7 @@ let isCall = 1, def BL_pred : ABI<0b1011, (outs), (ins bl_target:$func, variable_ops), IIC_Br, "bl", 
"\t$func", [(ARMcall_pred tglobaladdr:$func)]>, - Requires<[IsARM, IsNotIOS]> { + Requires<[IsARM]> { bits<24> func; let Inst{23-0} = func; let DecoderMethod = "DecodeBranchImmInstruction"; @@ -1927,7 +1945,7 @@ let isCall = 1, def BLX : AXI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm, IIC_Br, "blx\t$func", [(ARMcall GPR:$func)]>, - Requires<[IsARM, HasV5T, IsNotIOS]> { + Requires<[IsARM, HasV5T]> { bits<4> func; let Inst{31-4} = 0b1110000100101111111111110011; let Inst{3-0} = func; @@ -1936,7 +1954,7 @@ let isCall = 1, def BLX_pred : AI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm, IIC_Br, "blx", "\t$func", [(ARMcall_pred GPR:$func)]>, - Requires<[IsARM, HasV5T, IsNotIOS]> { + Requires<[IsARM, HasV5T]> { bits<4> func; let Inst{27-4} = 0b000100101111111111110011; let Inst{3-0} = func; @@ -1946,67 +1964,19 @@ let isCall = 1, // Note: Restrict $func to the tGPR regclass to prevent it being in LR. def BX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func, variable_ops), 8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>, - Requires<[IsARM, HasV4T, IsNotIOS]>; + Requires<[IsARM, HasV4T]>; // ARMv4 def BMOVPCRX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func, variable_ops), 8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>, - Requires<[IsARM, NoV4T, IsNotIOS]>; + Requires<[IsARM, NoV4T]>; // mov lr, pc; b if callee is marked noreturn to avoid confusing the // return stack predictor. def BMOVPCB_CALL : ARMPseudoInst<(outs), (ins bl_target:$func, variable_ops), 8, IIC_Br, [(ARMcall_nolink tglobaladdr:$func)]>, - Requires<[IsARM, IsNotIOS]>; -} - -let isCall = 1, - // On IOS R9 is call-clobbered. - // R7 is marked as a use to prevent frame-pointer assignments from being - // moved above / below calls. - Defs = [LR], Uses = [R7, SP] in { - def BLr9 : ARMPseudoExpand<(outs), (ins bl_target:$func, variable_ops), - 4, IIC_Br, - [(ARMcall tglobaladdr:$func)], (BL bl_target:$func)>, - Requires<[IsARM, IsIOS]>; - - def BLr9_pred : ARMPseudoExpand<(outs), - (ins bl_target:$func, pred:$p, variable_ops), - 4, IIC_Br, - [(ARMcall_pred tglobaladdr:$func)], - (BL_pred bl_target:$func, pred:$p)>, - Requires<[IsARM, IsIOS]>; - - // ARMv5T and above - def BLXr9 : ARMPseudoExpand<(outs), (ins GPR:$func, variable_ops), - 4, IIC_Br, - [(ARMcall GPR:$func)], - (BLX GPR:$func)>, - Requires<[IsARM, HasV5T, IsIOS]>; - - def BLXr9_pred: ARMPseudoExpand<(outs), (ins GPR:$func, pred:$p,variable_ops), - 4, IIC_Br, - [(ARMcall_pred GPR:$func)], - (BLX_pred GPR:$func, pred:$p)>, - Requires<[IsARM, HasV5T, IsIOS]>; - - // ARMv4T - // Note: Restrict $func to the tGPR regclass to prevent it being in LR. - def BXr9_CALL : ARMPseudoInst<(outs), (ins tGPR:$func, variable_ops), - 8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>, - Requires<[IsARM, HasV4T, IsIOS]>; - - // ARMv4 - def BMOVPCRXr9_CALL : ARMPseudoInst<(outs), (ins tGPR:$func, variable_ops), - 8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>, - Requires<[IsARM, NoV4T, IsIOS]>; - - // mov lr, pc; b if callee is marked noreturn to avoid confusing the - // return stack predictor. - def BMOVPCBr9_CALL : ARMPseudoInst<(outs),(ins bl_target:$func, variable_ops), - 8, IIC_Br, [(ARMcall_nolink tglobaladdr:$func)]>, - Requires<[IsARM, IsIOS]>; + Requires<[IsARM]>; } let isBranch = 1, isTerminator = 1 in { @@ -2073,45 +2043,22 @@ def BXJ : ABI<0b0001, (outs), (ins GPR:$func), NoItinerary, "bxj", "\t$func", // Tail calls. -let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in { - // IOS versions. 
- let Uses = [SP] in {
- def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst, variable_ops),
- IIC_Br, []>, Requires<[IsIOS]>;
-
- def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst, variable_ops),
- IIC_Br, []>, Requires<[IsIOS]>;
-
- def TAILJMPd : ARMPseudoExpand<(outs), (ins br_target:$dst, variable_ops),
- 4, IIC_Br, [],
- (Bcc br_target:$dst, (ops 14, zero_reg))>,
- Requires<[IsARM, IsIOS]>;
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
+ def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst, variable_ops),
+ IIC_Br, []>;
- def TAILJMPr : ARMPseudoExpand<(outs), (ins tcGPR:$dst, variable_ops),
- 4, IIC_Br, [],
- (BX GPR:$dst)>,
- Requires<[IsARM, IsIOS]>;
+ def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst, variable_ops),
+ IIC_Br, []>;
- }
-
- // Non-IOS versions (the difference is R9).
- let Uses = [SP] in {
- def TCRETURNdiND : PseudoInst<(outs), (ins i32imm:$dst, variable_ops),
- IIC_Br, []>, Requires<[IsNotIOS]>;
-
- def TCRETURNriND : PseudoInst<(outs), (ins tcGPR:$dst, variable_ops),
- IIC_Br, []>, Requires<[IsNotIOS]>;
+ def TAILJMPd : ARMPseudoExpand<(outs), (ins br_target:$dst, variable_ops),
+ 4, IIC_Br, [],
+ (Bcc br_target:$dst, (ops 14, zero_reg))>,
+ Requires<[IsARM]>;
- def TAILJMPdND : ARMPseudoExpand<(outs), (ins brtarget:$dst, variable_ops),
- 4, IIC_Br, [],
- (Bcc br_target:$dst, (ops 14, zero_reg))>,
- Requires<[IsARM, IsNotIOS]>;
-
- def TAILJMPrND : ARMPseudoExpand<(outs), (ins tcGPR:$dst, variable_ops),
- 4, IIC_Br, [],
- (BX GPR:$dst)>,
- Requires<[IsARM, IsNotIOS]>;
- }
+ def TAILJMPr : ARMPseudoExpand<(outs), (ins tcGPR:$dst, variable_ops),
+ 4, IIC_Br, [],
+ (BX GPR:$dst)>,
+ Requires<[IsARM]>;
}
// Secure Monitor Call is a system instruction.
@@ -2484,7 +2431,7 @@ multiclass AI3ldrT<bits<4> op, string opc> {
let Inst{3-0} = offset{3-0};
let AsmMatchConverter = "cvtLdExtTWriteBackImm";
}
- def r : AI3ldstidxT<op, 1, (outs GPR:$Rt, GPR:$base_wb),
+ def r : AI3ldstidxT<op, 1, (outs GPRnopc:$Rt, GPRnopc:$base_wb),
(ins addr_offset_none:$addr, postidx_reg:$Rm),
IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru,
opc, "\t$Rt, $addr, $Rm", "$addr.base = $base_wb", []> {
@@ -2492,8 +2439,10 @@ multiclass AI3ldrT<bits<4> op, string opc> {
let Inst{23} = Rm{4};
let Inst{22} = 0;
let Inst{11-8} = 0;
+ let Unpredictable{11-8} = 0b1111;
let Inst{3-0} = Rm{3-0};
let AsmMatchConverter = "cvtLdExtTWriteBackReg";
+ let DecoderMethod = "DecodeLDR";
}
}
@@ -3241,6 +3190,8 @@ class AAI<bits<8> op27_20, bits<8> op11_4, string opc,
let Inst{19-16} = Rn;
let Inst{15-12} = Rd;
let Inst{3-0} = Rm;
+
+ let Unpredictable{11-8} = 0b1111;
}
// Saturating add/subtract
@@ -3533,19 +3484,20 @@ class AsMul1I64<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
// property. Remove them when it's possible to add those properties
// on an individual MachineInstr, not just an instruction description.
let isCommutable = 1 in { -def MUL : AsMul1I32<0b0000000, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), +def MUL : AsMul1I32<0b0000000, (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm), IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm", - [(set GPR:$Rd, (mul GPR:$Rn, GPR:$Rm))]>, + [(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))]>, Requires<[IsARM, HasV6]> { let Inst{15-12} = 0b0000; + let Unpredictable{15-12} = 0b1111; } let Constraints = "@earlyclobber $Rd" in -def MULv5: ARMPseudoExpand<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, +def MULv5: ARMPseudoExpand<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s), 4, IIC_iMUL32, - [(set GPR:$Rd, (mul GPR:$Rn, GPR:$Rm))], - (MUL GPR:$Rd, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>, + [(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))], + (MUL GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s)>, Requires<[IsARM, NoV6]>; } @@ -4040,10 +3992,13 @@ def BCCZi64 : PseudoInst<(outs), // FIXME: should be able to write a pattern for ARMcmov, but can't use // a two-value operand where a dag node expects two operands. :( let neverHasSideEffects = 1 in { + +let isCommutable = 1 in def MOVCCr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$false, GPR:$Rm, pred:$p), 4, IIC_iCMOVr, [/*(set GPR:$Rd, (ARMcmov GPR:$false, GPR:$Rm, imm:$cc, CCR:$ccr))*/]>, RegConstraint<"$false = $Rd">; + def MOVCCsi : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$false, so_reg_imm:$shift, pred:$p), 4, IIC_iCMOVsr, @@ -4164,7 +4119,7 @@ def ISB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary, let Inst{3-0} = opt; } -// Pseudo isntruction that combines movs + predicated rsbmi +// Pseudo instruction that combines movs + predicated rsbmi // to implement integer ABS let usesCustomInserter = 1, Defs = [CPSR] in { def ABS : ARMPseudoInst< @@ -4325,9 +4280,9 @@ def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex", []>, // SWP/SWPB are deprecated in V6/V7. 
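The ABS pseudo above expands to movs plus a predicated rsbmi: copy the value while setting flags, then reverse-subtract from zero only when the copy was negative. The same dataflow in scalar form (illustrative; unsigned arithmetic is used so the INT32_MIN case wraps as the hardware does):

    #include <cstdint>

    int32_t absViaRsbmi(int32_t x) {
      int32_t r = x;                     // movs r, x   (sets the N flag)
      if (r < 0)                         // "mi" condition: N flag set
        r = (int32_t)(0u - (uint32_t)r); // rsbmi r, r, #0
      return r;
    }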
let mayLoad = 1, mayStore = 1 in { -def SWP : AIswp<0, (outs GPR:$Rt), (ins GPR:$Rt2, addr_offset_none:$addr), +def SWP : AIswp<0, (outs GPRnopc:$Rt), (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swp", []>; -def SWPB: AIswp<1, (outs GPR:$Rt), (ins GPR:$Rt2, addr_offset_none:$addr), +def SWPB: AIswp<1, (outs GPRnopc:$Rt), (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swpb", []>; } @@ -4356,7 +4311,7 @@ def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1, let Inst{23-20} = opc1; } -def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1, +def CDP2 : ABXI<0b1110, (outs), (ins pf_imm:$cop, imm0_15:$opc1, c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2), NoItinerary, "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2", [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn, @@ -4635,7 +4590,7 @@ def : ARMV5TPat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn, class MovRRCopro<string opc, bit direction, list<dag> pattern = []> : ABI<0b1100, (outs), (ins p_imm:$cop, imm0_15:$opc1, - GPR:$Rt, GPR:$Rt2, c_imm:$CRm), + GPRnopc:$Rt, GPRnopc:$Rt2, c_imm:$CRm), NoItinerary, opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm", pattern> { let Inst{23-21} = 0b010; let Inst{20} = direction; @@ -4654,13 +4609,13 @@ class MovRRCopro<string opc, bit direction, list<dag> pattern = []> } def MCRR : MovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */, - [(int_arm_mcrr imm:$cop, imm:$opc1, GPR:$Rt, GPR:$Rt2, + [(int_arm_mcrr imm:$cop, imm:$opc1, GPRnopc:$Rt, GPRnopc:$Rt2, imm:$CRm)]>; def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */>; class MovRRCopro2<string opc, bit direction, list<dag> pattern = []> : ABXI<0b1100, (outs), (ins p_imm:$cop, imm0_15:$opc1, - GPR:$Rt, GPR:$Rt2, c_imm:$CRm), NoItinerary, + GPRnopc:$Rt, GPRnopc:$Rt2, c_imm:$CRm), NoItinerary, !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern> { let Inst{31-28} = 0b1111; let Inst{23-21} = 0b010; @@ -4677,10 +4632,12 @@ class MovRRCopro2<string opc, bit direction, list<dag> pattern = []> let Inst{11-8} = cop; let Inst{7-4} = opc1; let Inst{3-0} = CRm; + + let DecoderMethod = "DecodeMRRC2"; } def MCRR2 : MovRRCopro2<"mcrr2", 0 /* from ARM core register to coprocessor */, - [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPR:$Rt, GPR:$Rt2, + [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPRnopc:$Rt, GPRnopc:$Rt2, imm:$CRm)]>; def MRRC2 : MovRRCopro2<"mrrc2", 1 /* from coprocessor to ARM core register */>; @@ -4689,22 +4646,32 @@ def MRRC2 : MovRRCopro2<"mrrc2", 1 /* from coprocessor to ARM core register */>; // // Move to ARM core register from Special Register -def MRS : ABI<0b0001, (outs GPR:$Rd), (ins), NoItinerary, +def MRS : ABI<0b0001, (outs GPRnopc:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, apsr", []> { bits<4> Rd; let Inst{23-16} = 0b00001111; + let Unpredictable{19-17} = 0b111; + let Inst{15-12} = Rd; - let Inst{7-4} = 0b0000; + + let Inst{11-0} = 0b000000000000; + let Unpredictable{11-0} = 0b110100001111; } -def : InstAlias<"mrs${p} $Rd, cpsr", (MRS GPR:$Rd, pred:$p)>, Requires<[IsARM]>; +def : InstAlias<"mrs${p} $Rd, cpsr", (MRS GPRnopc:$Rd, pred:$p)>, Requires<[IsARM]>; -def MRSsys : ABI<0b0001, (outs GPR:$Rd), (ins), NoItinerary, +// The MRSsys instruction is the MRS instruction from the ARM ARM, +// section B9.3.9, with the R bit set to 1. 
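The Unpredictable masks added to MRS here and to MRSsys just below mark should-be-constant encoding bits whose violation is architecturally UNPREDICTABLE rather than UNDEFINED, so a decoder can accept the word while flagging it. A hedged sketch of that classification (illustrative types, not the LLVM MC decoder interface):

    #include <cstdint>

    enum DecodeStatus { Success, SoftFail };

    DecodeStatus checkUnpredictable(uint32_t Insn, uint32_t UnpredMask,
                                    uint32_t Expected) {
      // A mismatch in the masked bits still decodes, but is flagged.
      return ((Insn ^ Expected) & UnpredMask) ? SoftFail : Success;
    }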
+def MRSsys : ABI<0b0001, (outs GPRnopc:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, spsr", []> { bits<4> Rd; let Inst{23-16} = 0b01001111; + let Unpredictable{19-16} = 0b1111; + let Inst{15-12} = Rd; - let Inst{7-4} = 0b0000; + + let Inst{11-0} = 0b000000000000; + let Unpredictable{11-0} = 0b110100001111; } // Move from ARM core register to Special Register @@ -4868,36 +4835,15 @@ def : ARMPat<(ARMWrapperJT tjumptable:$dst, imm:$id), // TODO: add,sub,and, 3-instr forms? -// Tail calls -def : ARMPat<(ARMtcret tcGPR:$dst), - (TCRETURNri tcGPR:$dst)>, Requires<[IsIOS]>; - -def : ARMPat<(ARMtcret (i32 tglobaladdr:$dst)), - (TCRETURNdi texternalsym:$dst)>, Requires<[IsIOS]>; - -def : ARMPat<(ARMtcret (i32 texternalsym:$dst)), - (TCRETURNdi texternalsym:$dst)>, Requires<[IsIOS]>; - -def : ARMPat<(ARMtcret tcGPR:$dst), - (TCRETURNriND tcGPR:$dst)>, Requires<[IsNotIOS]>; - -def : ARMPat<(ARMtcret (i32 tglobaladdr:$dst)), - (TCRETURNdiND texternalsym:$dst)>, Requires<[IsNotIOS]>; - -def : ARMPat<(ARMtcret (i32 texternalsym:$dst)), - (TCRETURNdiND texternalsym:$dst)>, Requires<[IsNotIOS]>; +// Tail calls. These patterns also apply to Thumb mode. +def : Pat<(ARMtcret tcGPR:$dst), (TCRETURNri tcGPR:$dst)>; +def : Pat<(ARMtcret (i32 tglobaladdr:$dst)), (TCRETURNdi texternalsym:$dst)>; +def : Pat<(ARMtcret (i32 texternalsym:$dst)), (TCRETURNdi texternalsym:$dst)>; // Direct calls -def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>, - Requires<[IsARM, IsNotIOS]>; -def : ARMPat<(ARMcall texternalsym:$func), (BLr9 texternalsym:$func)>, - Requires<[IsARM, IsIOS]>; -def : ARMPat<(ARMcall_nolink texternalsym:$func), - (BMOVPCB_CALL texternalsym:$func)>, - Requires<[IsARM, IsNotIOS]>; +def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>; def : ARMPat<(ARMcall_nolink texternalsym:$func), - (BMOVPCBr9_CALL texternalsym:$func)>, - Requires<[IsARM, IsIOS]>; + (BMOVPCB_CALL texternalsym:$func)>; // zextload i1 -> zextload i8 def : ARMPat<(zextloadi1 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>; diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index f61eb2b..fd8ac0b 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -530,16 +530,16 @@ def NEONimmAllOnesV: PatLeaf<(NEONvmovImm (i32 timm)), [{ // Use VLDM to load a Q register as a D register pair. // This is a pseudo instruction that is expanded to VLDMD after reg alloc. def VLDMQIA - : PseudoVFPLdStM<(outs QPR:$dst), (ins GPR:$Rn), + : PseudoVFPLdStM<(outs DPair:$dst), (ins GPR:$Rn), IIC_fpLoad_m, "", - [(set QPR:$dst, (v2f64 (load GPR:$Rn)))]>; + [(set DPair:$dst, (v2f64 (load GPR:$Rn)))]>; // Use VSTM to store a Q register as a D register pair. // This is a pseudo instruction that is expanded to VSTMD after reg alloc. def VSTMQIA - : PseudoVFPLdStM<(outs), (ins QPR:$src, GPR:$Rn), + : PseudoVFPLdStM<(outs), (ins DPair:$src, GPR:$Rn), IIC_fpStore_m, "", - [(store (v2f64 QPR:$src), GPR:$Rn)]>; + [(store (v2f64 DPair:$src), GPR:$Rn)]>; // Classes for VLD* pseudo-instructions with multi-register operands. // These are expanded to real instructions after register allocation. 
@@ -1938,20 +1938,11 @@ class VSTQQQQLNWBPseudo<InstrItinClass itin> // VST1LN : Vector Store (single element from one lane) class VST1LN<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty, - PatFrag StoreOp, SDNode ExtractOp> + PatFrag StoreOp, SDNode ExtractOp, Operand AddrMode> : NLdStLn<1, 0b00, op11_8, op7_4, (outs), - (ins addrmode6:$Rn, DPR:$Vd, nohash_imm:$lane), + (ins AddrMode:$Rn, DPR:$Vd, nohash_imm:$lane), IIC_VST1ln, "vst1", Dt, "\\{$Vd[$lane]\\}, $Rn", "", - [(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), addrmode6:$Rn)]> { - let Rm = 0b1111; - let DecoderMethod = "DecodeVST1LN"; -} -class VST1LN32<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty, - PatFrag StoreOp, SDNode ExtractOp> - : NLdStLn<1, 0b00, op11_8, op7_4, (outs), - (ins addrmode6oneL32:$Rn, DPR:$Vd, nohash_imm:$lane), - IIC_VST1ln, "vst1", Dt, "\\{$Vd[$lane]\\}, $Rn", "", - [(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), addrmode6oneL32:$Rn)]>{ + [(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), AddrMode:$Rn)]> { let Rm = 0b1111; let DecoderMethod = "DecodeVST1LN"; } @@ -1962,16 +1953,17 @@ class VST1QLNPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp> } def VST1LNd8 : VST1LN<0b0000, {?,?,?,0}, "8", v8i8, truncstorei8, - NEONvgetlaneu> { + NEONvgetlaneu, addrmode6> { let Inst{7-5} = lane{2-0}; } def VST1LNd16 : VST1LN<0b0100, {?,?,0,?}, "16", v4i16, truncstorei16, - NEONvgetlaneu> { + NEONvgetlaneu, addrmode6> { let Inst{7-6} = lane{1-0}; let Inst{4} = Rn{5}; } -def VST1LNd32 : VST1LN32<0b1000, {?,0,?,?}, "32", v2i32, store, extractelt> { +def VST1LNd32 : VST1LN<0b1000, {?,0,?,?}, "32", v2i32, store, extractelt, + addrmode6oneL32> { let Inst{7} = lane{0}; let Inst{5-4} = Rn{5-4}; } @@ -1987,14 +1979,14 @@ def : Pat<(store (extractelt (v4f32 QPR:$src), imm:$lane), addrmode6:$addr), // ...with address register writeback: class VST1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty, - PatFrag StoreOp, SDNode ExtractOp> + PatFrag StoreOp, SDNode ExtractOp, Operand AdrMode> : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb), - (ins addrmode6:$Rn, am6offset:$Rm, + (ins AdrMode:$Rn, am6offset:$Rm, DPR:$Vd, nohash_imm:$lane), IIC_VST1lnu, "vst1", Dt, "\\{$Vd[$lane]\\}, $Rn$Rm", "$Rn.addr = $wb", [(set GPR:$wb, (StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), - addrmode6:$Rn, am6offset:$Rm))]> { + AdrMode:$Rn, am6offset:$Rm))]> { let DecoderMethod = "DecodeVST1LN"; } class VST1QLNWBPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp> @@ -2004,16 +1996,16 @@ class VST1QLNWBPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp> } def VST1LNd8_UPD : VST1LNWB<0b0000, {?,?,?,0}, "8", v8i8, post_truncsti8, - NEONvgetlaneu> { + NEONvgetlaneu, addrmode6> { let Inst{7-5} = lane{2-0}; } def VST1LNd16_UPD : VST1LNWB<0b0100, {?,?,0,?}, "16", v4i16, post_truncsti16, - NEONvgetlaneu> { + NEONvgetlaneu, addrmode6> { let Inst{7-6} = lane{1-0}; let Inst{4} = Rn{5}; } def VST1LNd32_UPD : VST1LNWB<0b1000, {?,0,?,?}, "32", v2i32, post_store, - extractelt> { + extractelt, addrmode6oneL32> { let Inst{7} = lane{0}; let Inst{5-4} = Rn{5-4}; } @@ -3642,7 +3634,7 @@ multiclass N2VShL_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, } multiclass N2VShR_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - SDNode OpNode> { + string baseOpc, SDNode OpNode> { // 64-bit vector types. 
def v8i8 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm8, OpcodeStr, !strconcat(Dt, "8"), v8i8, OpNode> { @@ -3676,6 +3668,33 @@ multiclass N2VShR_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, N2RegVShRFrm, itin, shr_imm64, OpcodeStr, !strconcat(Dt, "64"), v2i64, OpNode>; // imm6 = xxxxxx + + // Aliases for two-operand forms (source and dest regs the same). + def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "8 $Vdn, $imm"), + (!cast<Instruction>(!strconcat(baseOpc, "v8i8")) + DPR:$Vdn, DPR:$Vdn, shr_imm8:$imm, pred:$p)>; + def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "16 $Vdn, $imm"), + (!cast<Instruction>(!strconcat(baseOpc, "v4i16")) + DPR:$Vdn, DPR:$Vdn, shr_imm16:$imm, pred:$p)>; + def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "32 $Vdn, $imm"), + (!cast<Instruction>(!strconcat(baseOpc, "v2i32")) + DPR:$Vdn, DPR:$Vdn, shr_imm32:$imm, pred:$p)>; + def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "64 $Vdn, $imm"), + (!cast<Instruction>(!strconcat(baseOpc, "v1i64")) + DPR:$Vdn, DPR:$Vdn, shr_imm64:$imm, pred:$p)>; + + def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "8 $Vdn, $imm"), + (!cast<Instruction>(!strconcat(baseOpc, "v16i8")) + QPR:$Vdn, QPR:$Vdn, shr_imm8:$imm, pred:$p)>; + def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "16 $Vdn, $imm"), + (!cast<Instruction>(!strconcat(baseOpc, "v8i16")) + QPR:$Vdn, QPR:$Vdn, shr_imm16:$imm, pred:$p)>; + def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "32 $Vdn, $imm"), + (!cast<Instruction>(!strconcat(baseOpc, "v4i32")) + QPR:$Vdn, QPR:$Vdn, shr_imm32:$imm, pred:$p)>; + def : NEONInstAlias<!strconcat(OpcodeStr, "${p}.", Dt, "64 $Vdn, $imm"), + (!cast<Instruction>(!strconcat(baseOpc, "v2i64")) + QPR:$Vdn, QPR:$Vdn, shr_imm64:$imm, pred:$p)>; } // Neon Shift-Accumulate vector operations, @@ -3986,10 +4005,10 @@ defm VMLA : N3VMulOp_QHS<0, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>; def VMLAfd : N3VDMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla", "f32", v2f32, fmul_su, fadd_mlx>, - Requires<[HasNEON, UseFPVMLx, NoNEON2]>; + Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>; def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACQ, "vmla", "f32", v4f32, fmul_su, fadd_mlx>, - Requires<[HasNEON, UseFPVMLx, NoNEON2]>; + Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>; defm VMLAsl : N3VMulOpSL_HS<0b0000, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>; def VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32", @@ -4044,10 +4063,10 @@ defm VMLS : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>; def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls", "f32", v2f32, fmul_su, fsub_mlx>, - Requires<[HasNEON, UseFPVMLx, NoNEON2]>; + Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>; def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACQ, "vmls", "f32", v4f32, fmul_su, fsub_mlx>, - Requires<[HasNEON, UseFPVMLx, NoNEON2]>; + Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>; defm VMLSsl : N3VMulOpSL_HS<0b0100, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>; def VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls", "f32", @@ -4096,23 +4115,36 @@ defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D, "vqdmlsl", "s", int_arm_neon_vqdmlsl>; defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b111, "vqdmlsl", "s", int_arm_neon_vqdmlsl>; - // Fused Vector Multiply-Accumulate and Fused 
Multiply-Subtract Operations. def VFMAfd : N3VDMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACD, "vfma", "f32", v2f32, fmul_su, fadd_mlx>, - Requires<[HasNEON2,FPContractions]>; + Requires<[HasVFP4,UseFusedMAC]>; def VFMAfq : N3VQMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACQ, "vfma", "f32", v4f32, fmul_su, fadd_mlx>, - Requires<[HasNEON2,FPContractions]>; + Requires<[HasVFP4,UseFusedMAC]>; // Fused Vector Multiply Subtract (floating-point) def VFMSfd : N3VDMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACD, "vfms", "f32", v2f32, fmul_su, fsub_mlx>, - Requires<[HasNEON2,FPContractions]>; + Requires<[HasVFP4,UseFusedMAC]>; def VFMSfq : N3VQMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACQ, "vfms", "f32", v4f32, fmul_su, fsub_mlx>, - Requires<[HasNEON2,FPContractions]>; + Requires<[HasVFP4,UseFusedMAC]>; + +// Match @llvm.fma.* intrinsics +def : Pat<(v2f32 (fma DPR:$src1, DPR:$Vn, DPR:$Vm)), + (VFMAfd DPR:$src1, DPR:$Vn, DPR:$Vm)>, + Requires<[HasVFP4]>; +def : Pat<(v4f32 (fma QPR:$src1, QPR:$Vn, QPR:$Vm)), + (VFMAfq QPR:$src1, QPR:$Vn, QPR:$Vm)>, + Requires<[HasVFP4]>; +def : Pat<(v2f32 (fma (fneg DPR:$src1), DPR:$Vn, DPR:$Vm)), + (VFMSfd DPR:$src1, DPR:$Vn, DPR:$Vm)>, + Requires<[HasVFP4]>; +def : Pat<(v4f32 (fma (fneg QPR:$src1), QPR:$Vn, QPR:$Vm)), + (VFMSfq QPR:$src1, QPR:$Vn, QPR:$Vm)>, + Requires<[HasVFP4]>; // Vector Subtract Operations. @@ -4614,8 +4646,10 @@ defm VSHLu : N3VInt_QHSDSh<1, 0, 0b0100, 0, N3RegVShFrm, defm VSHLi : N2VShL_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", NEONvshl>; // VSHR : Vector Shift Right (Immediate) -defm VSHRs : N2VShR_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "s",NEONvshrs>; -defm VSHRu : N2VShR_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "u",NEONvshru>; +defm VSHRs : N2VShR_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "s", "VSHRs", + NEONvshrs>; +defm VSHRu : N2VShR_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "u", "VSHRu", + NEONvshru>; // VSHLL : Vector Shift Left Long defm VSHLLs : N2VLSh_QHS<0, 1, 0b1010, 0, 0, 1, "vshll", "s", NEONvshlls>; @@ -4649,8 +4683,10 @@ defm VRSHLu : N3VInt_QHSDSh<1, 0, 0b0101, 0, N3RegVShFrm, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q, "vrshl", "u", int_arm_neon_vrshiftu>; // VRSHR : Vector Rounding Shift Right -defm VRSHRs : N2VShR_QHSD<0,1,0b0010,1, IIC_VSHLi4D, "vrshr", "s",NEONvrshrs>; -defm VRSHRu : N2VShR_QHSD<1,1,0b0010,1, IIC_VSHLi4D, "vrshr", "u",NEONvrshru>; +defm VRSHRs : N2VShR_QHSD<0,1,0b0010,1, IIC_VSHLi4D, "vrshr", "s", "VRSHRs", + NEONvrshrs>; +defm VRSHRu : N2VShR_QHSD<1,1,0b0010,1, IIC_VSHLi4D, "vrshr", "u", "VRSHRu", + NEONvrshru>; // VRSHRN : Vector Rounding Shift Right and Narrow defm VRSHRN : N2VNSh_HSD<0, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vrshrn", "i", @@ -4795,12 +4831,12 @@ def VCNTq : N2VQInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, // Vector Swap def VSWPd : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 0, 0, - (outs DPR:$Vd, DPR:$Vd1), (ins DPR:$Vm, DPR:$Vm1), - NoItinerary, "vswp", "$Vd, $Vd1", "$Vm = $Vd, $Vm1 = $Vd1", + (outs DPR:$Vd, DPR:$Vm), (ins DPR:$in1, DPR:$in2), + NoItinerary, "vswp", "$Vd, $Vm", "$in1 = $Vd, $in2 = $Vm", []>; def VSWPq : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 1, 0, - (outs QPR:$Vd, QPR:$Vd1), (ins QPR:$Vm, QPR:$Vm1), - NoItinerary, "vswp", "$Vd, $Vd1", "$Vm = $Vd, $Vm1 = $Vd1", + (outs QPR:$Vd, QPR:$Vm), (ins QPR:$in1, QPR:$in2), + NoItinerary, "vswp", "$Vd, $Vm", "$in1 = $Vd, $in2 = $Vm", []>; // Vector Move Operations. 
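The @llvm.fma.* patterns above select VFMA/VFMS, which differ from VMLA/VMLS in rounding once instead of twice. In portable C++ the same distinction is std::fmaf versus a separate multiply and add (sketch):

    #include <cmath>

    float unfusedMac(float acc, float a, float b) {
      return acc + a * b;           // vmla-style: two roundings
    }
    float fusedMac(float acc, float a, float b) {
      return std::fmaf(a, b, acc);  // vfma-style: one rounding
    }
    float fusedMsc(float acc, float a, float b) {
      return std::fmaf(-a, b, acc); // vfms-style: acc - a*b, one rounding
    }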
@@ -5342,7 +5378,9 @@ def VTRNq32 : N2VQShuffle<0b10, 0b00001, IIC_VPERMQ, "vtrn", "32">; def VUZPd8 : N2VDShuffle<0b00, 0b00010, "vuzp", "8">; def VUZPd16 : N2VDShuffle<0b01, 0b00010, "vuzp", "16">; -def VUZPd32 : N2VDShuffle<0b10, 0b00010, "vuzp", "32">; +// vuzp.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm. +def : NEONInstAlias<"vuzp${p}.32 $Dd, $Dm", + (VTRNd32 DPR:$Dd, DPR:$Dm, pred:$p)>; def VUZPq8 : N2VQShuffle<0b00, 0b00010, IIC_VPERMQ3, "vuzp", "8">; def VUZPq16 : N2VQShuffle<0b01, 0b00010, IIC_VPERMQ3, "vuzp", "16">; @@ -5352,7 +5390,9 @@ def VUZPq32 : N2VQShuffle<0b10, 0b00010, IIC_VPERMQ3, "vuzp", "32">; def VZIPd8 : N2VDShuffle<0b00, 0b00011, "vzip", "8">; def VZIPd16 : N2VDShuffle<0b01, 0b00011, "vzip", "16">; -def VZIPd32 : N2VDShuffle<0b10, 0b00011, "vzip", "32">; +// vzip.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm. +def : NEONInstAlias<"vzip${p}.32 $Dd, $Dm", + (VTRNd32 DPR:$Dd, DPR:$Dm, pred:$p)>; def VZIPq8 : N2VQShuffle<0b00, 0b00011, IIC_VPERMQ3, "vzip", "8">; def VZIPq16 : N2VQShuffle<0b01, 0b00011, IIC_VPERMQ3, "vzip", "16">; @@ -5462,13 +5502,13 @@ def : N3VSPat<fadd, VADDfd>; def : N3VSPat<fsub, VSUBfd>; def : N3VSPat<fmul, VMULfd>; def : N3VSMulOpPat<fmul, fadd, VMLAfd>, - Requires<[HasNEON, UseNEONForFP, UseFPVMLx, NoNEON2]>; + Requires<[HasNEON, UseNEONForFP, UseFPVMLx, DontUseFusedMAC]>; def : N3VSMulOpPat<fmul, fsub, VMLSfd>, - Requires<[HasNEON, UseNEONForFP, UseFPVMLx, NoNEON2]>; + Requires<[HasNEON, UseNEONForFP, UseFPVMLx, DontUseFusedMAC]>; def : N3VSMulOpPat<fmul, fadd, VFMAfd>, - Requires<[HasNEON2, UseNEONForFP,FPContractions]>; + Requires<[HasVFP4, UseNEONForFP, UseFusedMAC]>; def : N3VSMulOpPat<fmul, fsub, VFMSfd>, - Requires<[HasNEON2, UseNEONForFP,FPContractions]>; + Requires<[HasVFP4, UseNEONForFP, UseFusedMAC]>; def : N2VSPat<fabs, VABSfd>; def : N2VSPat<fneg, VNEGfd>; def : N3VSPat<NEONfmax, VMAXfd>; @@ -5594,6 +5634,7 @@ multiclass Lengthen_HalfSingle<string DestLanes, string DestTy, string SrcTy, // extload, zextload and sextload for a lengthening load followed by another // lengthening load, to quadruple the initial length. 
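Scalar picture of the double-lengthening patterns that follow: each element is widened twice, mirroring two VMOVL steps after a 32-bit VLDRS load. Zero-extending flavor shown; the function is illustrative only:

    #include <cstdint>

    void lengthenV4I8ToV4I32(const uint8_t src[4], uint32_t dst[4]) {
      for (int i = 0; i != 4; ++i) {
        uint16_t mid = src[i]; // first step:  vmovl.u8  (i8  -> i16)
        dst[i] = mid;          // second step: vmovl.u16 (i16 -> i32)
      }
    }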
+//
// Lengthen_Double<"4", "i32", "i8", "8", "i16", "4", "i32", qsub_0> =
// Pat<(v4i32 (extloadvi8 addrmode5:$addr))
// (EXTRACT_SUBREG (VMOVLuv4i32
// qsub_0)>;
multiclass Lengthen_Double<string DestLanes, string DestTy, string SrcTy,
string Insn1Lanes, string Insn1Ty, string Insn2Lanes,
- string Insn2Ty, SubRegIndex RegType> {
+ string Insn2Ty> {
+ def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("extloadv" # SrcTy) addrmode5:$addr)),
+ (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+ (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr),
+ ssub_0)), dsub_0))>;
+ def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("zextloadv" # SrcTy) addrmode5:$addr)),
+ (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+ (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr),
+ ssub_0)), dsub_0))>;
+ def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+ (!cast<PatFrag>("sextloadv" # SrcTy) addrmode5:$addr)),
+ (!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty)
+ (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty)
+ (INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr),
+ ssub_0)), dsub_0))>;
+}
+
+// extload, zextload and sextload for a lengthening load followed by another
+// lengthening load, to quadruple the initial length, but which ends up only
+// requiring half the available lanes (a 64-bit outcome instead of a 128-bit).
+//
+// Lengthen_HalfDouble<"2", "i32", "i8", "8", "i16", "4", "i32"> =
+// Pat<(v2i32 (extloadvi8 addrmode5:$addr))
+// (EXTRACT_SUBREG (VMOVLuv4i32
+// (EXTRACT_SUBREG (VMOVLuv8i16 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+// (VLDRS addrmode5:$addr),
+// ssub_0)),
+// dsub_0)),
+// dsub_0)>;
+multiclass Lengthen_HalfDouble<string DestLanes, string DestTy, string SrcTy,
+ string Insn1Lanes, string Insn1Ty, string Insn2Lanes,
+ string Insn2Ty> {
def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
(!cast<PatFrag>("extloadv" # SrcTy) addrmode5:$addr)),
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
(INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr),
ssub_0)), dsub_0)),
- RegType)>;
+ dsub_0)>;
def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
(!cast<PatFrag>("zextloadv" # SrcTy) addrmode5:$addr)),
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
(INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr),
ssub_0)), dsub_0)),
- RegType)>;
+ dsub_0)>;
def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
(!cast<PatFrag>("sextloadv" # SrcTy) addrmode5:$addr)),
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty)
(EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty)
(INSERT_SUBREG (f64 (IMPLICIT_DEF)), (VLDRS addrmode5:$addr),
ssub_0)), dsub_0)),
- RegType)>;
+ dsub_0)>;
}
defm : Lengthen_Single<"8", "i16", "i8">; // v8i8 -> v8i16
@@ -5636,12 +5712,12 @@ defm : Lengthen_HalfSingle<"4", "i16", "i8", "8", "i16">; // v4i8 -> v4i16
defm : Lengthen_HalfSingle<"2", "i16", "i8", "8", "i16">; // v2i8 -> v2i16
defm : Lengthen_HalfSingle<"2", "i32", "i16", "4", "i32">; // v2i16 -> v2i32
-// Double lengthening
- v4i8 -> v4i16 -> v4i32 -defm : Lengthen_Double<"4", "i32", "i8", "8", "i16", "4", "i32", qsub_0>; +// Double lengthening - v4i8 -> v4i16 -> v4i32 +defm : Lengthen_Double<"4", "i32", "i8", "8", "i16", "4", "i32">; // v2i8 -> v2i16 -> v2i32 -defm : Lengthen_Double<"2", "i32", "i8", "8", "i16", "4", "i32", dsub_0>; +defm : Lengthen_HalfDouble<"2", "i32", "i8", "8", "i16", "4", "i32">; // v2i16 -> v2i32 -> v2i64 -defm : Lengthen_Double<"2", "i64", "i16", "4", "i32", "2", "i64", qsub_0>; +defm : Lengthen_Double<"2", "i64", "i16", "4", "i32", "2", "i64">; // Triple lengthening - v2i8 -> v2i16 -> v2i32 -> v2i64 def : Pat<(v2i64 (extloadvi8 addrmode5:$addr)), @@ -5911,7 +5987,7 @@ def : NEONInstAlias<"vshl${p}.u32 $Vdn, $Vm", def : NEONInstAlias<"vshl${p}.u64 $Vdn, $Vm", (VSHLuv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; -// VSHL (immediate) two-operand aliases. +// VSHR (immediate) two-operand aliases. def : NEONInstAlias<"vshr${p}.s8 $Vdn, $imm", (VSHRsv8i8 DPR:$Vdn, DPR:$Vdn, shr_imm8:$imm, pred:$p)>; def : NEONInstAlias<"vshr${p}.s16 $Vdn, $imm", @@ -5948,6 +6024,41 @@ def : NEONInstAlias<"vshr${p}.u32 $Vdn, $imm", def : NEONInstAlias<"vshr${p}.u64 $Vdn, $imm", (VSHRuv2i64 QPR:$Vdn, QPR:$Vdn, shr_imm64:$imm, pred:$p)>; +// VRSHL two-operand aliases. +def : NEONInstAlias<"vrshl${p}.s8 $Vdn, $Vm", + (VRSHLsv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vrshl${p}.s16 $Vdn, $Vm", + (VRSHLsv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vrshl${p}.s32 $Vdn, $Vm", + (VRSHLsv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vrshl${p}.s64 $Vdn, $Vm", + (VRSHLsv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vrshl${p}.u8 $Vdn, $Vm", + (VRSHLuv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vrshl${p}.u16 $Vdn, $Vm", + (VRSHLuv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vrshl${p}.u32 $Vdn, $Vm", + (VRSHLuv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vrshl${p}.u64 $Vdn, $Vm", + (VRSHLuv1i64 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; + +def : NEONInstAlias<"vrshl${p}.s8 $Vdn, $Vm", + (VRSHLsv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vrshl${p}.s16 $Vdn, $Vm", + (VRSHLsv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vrshl${p}.s32 $Vdn, $Vm", + (VRSHLsv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vrshl${p}.s64 $Vdn, $Vm", + (VRSHLsv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vrshl${p}.u8 $Vdn, $Vm", + (VRSHLuv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vrshl${p}.u16 $Vdn, $Vm", + (VRSHLuv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vrshl${p}.u32 $Vdn, $Vm", + (VRSHLuv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vrshl${p}.u64 $Vdn, $Vm", + (VRSHLuv2i64 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; + // VLD1 single-lane pseudo-instructions. These need special handling for // the lane index that an InstAlias can't handle, so we use these instead. def VLD1LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld1${p}", ".8", "$list, $addr", @@ -6911,6 +7022,100 @@ def : NEONInstAlias<"vsli${p}.32 $Vdm, $imm", def : NEONInstAlias<"vsli${p}.64 $Vdm, $imm", (VSLIv2i64 QPR:$Vdm, QPR:$Vdm, shr_imm64:$imm, pred:$p)>; +// Two-operand variants for VHSUB. + // Signed. 
+def : NEONInstAlias<"vhsub${p}.s8 $Vdn, $Vm", + (VHSUBsv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vhsub${p}.s16 $Vdn, $Vm", + (VHSUBsv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vhsub${p}.s32 $Vdn, $Vm", + (VHSUBsv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; + +def : NEONInstAlias<"vhsub${p}.s8 $Vdn, $Vm", + (VHSUBsv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vhsub${p}.s16 $Vdn, $Vm", + (VHSUBsv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vhsub${p}.s32 $Vdn, $Vm", + (VHSUBsv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; + + // Unsigned. +def : NEONInstAlias<"vhsub${p}.u8 $Vdn, $Vm", + (VHSUBuv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vhsub${p}.u16 $Vdn, $Vm", + (VHSUBuv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vhsub${p}.u32 $Vdn, $Vm", + (VHSUBuv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; + +def : NEONInstAlias<"vhsub${p}.u8 $Vdn, $Vm", + (VHSUBuv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vhsub${p}.u16 $Vdn, $Vm", + (VHSUBuv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vhsub${p}.u32 $Vdn, $Vm", + (VHSUBuv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; + + +// Two-operand variants for VHADD. + // Signed. +def : NEONInstAlias<"vhadd${p}.s8 $Vdn, $Vm", + (VHADDsv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vhadd${p}.s16 $Vdn, $Vm", + (VHADDsv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vhadd${p}.s32 $Vdn, $Vm", + (VHADDsv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; + +def : NEONInstAlias<"vhadd${p}.s8 $Vdn, $Vm", + (VHADDsv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vhadd${p}.s16 $Vdn, $Vm", + (VHADDsv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vhadd${p}.s32 $Vdn, $Vm", + (VHADDsv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; + + // Unsigned. +def : NEONInstAlias<"vhadd${p}.u8 $Vdn, $Vm", + (VHADDuv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vhadd${p}.u16 $Vdn, $Vm", + (VHADDuv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vhadd${p}.u32 $Vdn, $Vm", + (VHADDuv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>; + +def : NEONInstAlias<"vhadd${p}.u8 $Vdn, $Vm", + (VHADDuv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vhadd${p}.u16 $Vdn, $Vm", + (VHADDuv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; +def : NEONInstAlias<"vhadd${p}.u32 $Vdn, $Vm", + (VHADDuv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>; + +// Two-operand variants for VRHADD. + // Signed. +def : NEONInstAlias<"vrhadd${p}.s8 $Vdn, $Rm", + (VRHADDsv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Rm, pred:$p)>; +def : NEONInstAlias<"vrhadd${p}.s16 $Vdn, $Rm", + (VRHADDsv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Rm, pred:$p)>; +def : NEONInstAlias<"vrhadd${p}.s32 $Vdn, $Rm", + (VRHADDsv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Rm, pred:$p)>; + +def : NEONInstAlias<"vrhadd${p}.s8 $Vdn, $Rm", + (VRHADDsv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Rm, pred:$p)>; +def : NEONInstAlias<"vrhadd${p}.s16 $Vdn, $Rm", + (VRHADDsv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Rm, pred:$p)>; +def : NEONInstAlias<"vrhadd${p}.s32 $Vdn, $Rm", + (VRHADDsv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Rm, pred:$p)>; + + // Unsigned. 
+def : NEONInstAlias<"vrhadd${p}.u8 $Vdn, $Rm", + (VRHADDuv8i8 DPR:$Vdn, DPR:$Vdn, DPR:$Rm, pred:$p)>; +def : NEONInstAlias<"vrhadd${p}.u16 $Vdn, $Rm", + (VRHADDuv4i16 DPR:$Vdn, DPR:$Vdn, DPR:$Rm, pred:$p)>; +def : NEONInstAlias<"vrhadd${p}.u32 $Vdn, $Rm", + (VRHADDuv2i32 DPR:$Vdn, DPR:$Vdn, DPR:$Rm, pred:$p)>; + +def : NEONInstAlias<"vrhadd${p}.u8 $Vdn, $Rm", + (VRHADDuv16i8 QPR:$Vdn, QPR:$Vdn, QPR:$Rm, pred:$p)>; +def : NEONInstAlias<"vrhadd${p}.u16 $Vdn, $Rm", + (VRHADDuv8i16 QPR:$Vdn, QPR:$Vdn, QPR:$Rm, pred:$p)>; +def : NEONInstAlias<"vrhadd${p}.u32 $Vdn, $Rm", + (VRHADDuv4i32 QPR:$Vdn, QPR:$Vdn, QPR:$Rm, pred:$p)>; + // VSWP allows, but does not require, a type suffix. defm : NEONDTAnyInstAlias<"vswp${p}", "$Vd, $Vm", (VSWPd DPR:$Vd, DPR:$Vm, pred:$p)>; diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index ba1791b..6335229 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -91,6 +91,12 @@ def t_imm0_508s4 : Operand<i32> { let ParserMatchClass = t_imm0_508s4_asmoperand; let OperandType = "OPERAND_IMMEDIATE"; } +// Alias use only, so no printer is necessary. +def t_imm0_508s4_neg_asmoperand: AsmOperandClass { let Name = "Imm0_508s4Neg"; } +def t_imm0_508s4_neg : Operand<i32> { + let ParserMatchClass = t_imm0_508s4_neg_asmoperand; + let OperandType = "OPERAND_IMMEDIATE"; +} // Define Thumb specific addressing modes. @@ -345,6 +351,11 @@ def tSUBspi : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, t_imm0_508s4:$imm), let DecoderMethod = "DecodeThumbAddSPImm"; } +def : tInstAlias<"add${p} sp, $imm", + (tSUBspi SP, t_imm0_508s4_neg:$imm, pred:$p)>; +def : tInstAlias<"add${p} sp, sp, $imm", + (tSUBspi SP, t_imm0_508s4_neg:$imm, pred:$p)>; + // Can optionally specify SP as a three operand instruction. def : tInstAlias<"add${p} sp, sp, $imm", (tADDspi SP, t_imm0_508s4:$imm, pred:$p)>; @@ -405,14 +416,13 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1 in { // prevent stack-pointer assignments that appear immediately before calls from // potentially appearing dead. let isCall = 1, - // On non-IOS platforms R9 is callee-saved. Defs = [LR], Uses = [SP] in { // Also used for Thumb2 def tBL : TIx2<0b11110, 0b11, 1, (outs), (ins pred:$p, t_bltarget:$func, variable_ops), IIC_Br, "bl${p}\t$func", [(ARMtcall tglobaladdr:$func)]>, - Requires<[IsThumb, IsNotIOS]> { + Requires<[IsThumb]> { bits<22> func; let Inst{26} = func{21}; let Inst{25-16} = func{20-11}; @@ -426,7 +436,7 @@ let isCall = 1, (outs), (ins pred:$p, t_blxtarget:$func, variable_ops), IIC_Br, "blx${p}\t$func", [(ARMcall tglobaladdr:$func)]>, - Requires<[IsThumb, HasV5T, IsNotIOS]> { + Requires<[IsThumb, HasV5T]> { bits<21> func; let Inst{25-16} = func{20-11}; let Inst{13} = 1; @@ -439,7 +449,7 @@ let isCall = 1, def tBLXr : TI<(outs), (ins pred:$p, GPR:$func, variable_ops), IIC_Br, "blx${p}\t$func", [(ARMtcall GPR:$func)]>, - Requires<[IsThumb, HasV5T, IsNotIOS]>, + Requires<[IsThumb, HasV5T]>, T1Special<{1,1,1,?}> { // A6.2.3 & A8.6.24; bits<4> func; let Inst{6-3} = func; @@ -450,37 +460,7 @@ let isCall = 1, def tBX_CALL : tPseudoInst<(outs), (ins tGPR:$func, variable_ops), 4, IIC_Br, [(ARMcall_nolink tGPR:$func)]>, - Requires<[IsThumb, IsThumb1Only, IsNotIOS]>; -} - -let isCall = 1, - // On IOS R9 is call-clobbered. - // R7 is marked as a use to prevent frame-pointer assignments from being - // moved above / below calls. 
- Defs = [LR], Uses = [R7, SP] in { - // Also used for Thumb2 - def tBLr9 : tPseudoExpand<(outs), (ins pred:$p, t_bltarget:$func, variable_ops), - 4, IIC_Br, [(ARMtcall tglobaladdr:$func)], - (tBL pred:$p, t_bltarget:$func)>, - Requires<[IsThumb, IsIOS]>; - - // ARMv5T and above, also used for Thumb2 - def tBLXi_r9 : tPseudoExpand<(outs), (ins pred:$p, t_blxtarget:$func, variable_ops), - 4, IIC_Br, [(ARMcall tglobaladdr:$func)], - (tBLXi pred:$p, t_blxtarget:$func)>, - Requires<[IsThumb, HasV5T, IsIOS]>; - - // Also used for Thumb2 - def tBLXr_r9 : tPseudoExpand<(outs), (ins pred:$p, GPR:$func, variable_ops), - 2, IIC_Br, [(ARMtcall GPR:$func)], - (tBLXr pred:$p, GPR:$func)>, - Requires<[IsThumb, HasV5T, IsIOS]>; - - // ARMv4T - def tBXr9_CALL : tPseudoInst<(outs), (ins tGPR:$func, variable_ops), - 4, IIC_Br, - [(ARMcall_nolink tGPR:$func)]>, - Requires<[IsThumb, IsThumb1Only, IsIOS]>; + Requires<[IsThumb, IsThumb1Only]>; } let isBranch = 1, isTerminator = 1, isBarrier = 1 in { @@ -524,24 +504,20 @@ let isBranch = 1, isTerminator = 1 in let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in { // IOS versions. let Uses = [SP] in { - // tTAILJMPd: IOS version uses a Thumb2 branch (no Thumb1 tail calls - // on IOS), so it's in ARMInstrThumb2.td. def tTAILJMPr : tPseudoExpand<(outs), (ins tcGPR:$dst, variable_ops), 4, IIC_Br, [], (tBX GPR:$dst, (ops 14, zero_reg))>, - Requires<[IsThumb, IsIOS]>; + Requires<[IsThumb]>; } - // Non-IOS versions (the difference is R9). + // tTAILJMPd: IOS version uses a Thumb2 branch (no Thumb1 tail calls + // on IOS), so it's in ARMInstrThumb2.td. + // Non-IOS version: let Uses = [SP] in { def tTAILJMPdND : tPseudoExpand<(outs), (ins t_brtarget:$dst, pred:$p, variable_ops), 4, IIC_Br, [], (tB t_brtarget:$dst, pred:$p)>, Requires<[IsThumb, IsNotIOS]>; - def tTAILJMPrND : tPseudoExpand<(outs), (ins tcGPR:$dst, variable_ops), - 4, IIC_Br, [], - (tBX GPR:$dst, (ops 14, zero_reg))>, - Requires<[IsThumb, IsNotIOS]>; } } @@ -1307,20 +1283,14 @@ def : T1Pat<(ARMWrapperJT tjumptable:$dst, imm:$id), // Direct calls def : T1Pat<(ARMtcall texternalsym:$func), (tBL texternalsym:$func)>, - Requires<[IsThumb, IsNotIOS]>; -def : T1Pat<(ARMtcall texternalsym:$func), (tBLr9 texternalsym:$func)>, - Requires<[IsThumb, IsIOS]>; + Requires<[IsThumb]>; def : Tv5Pat<(ARMcall texternalsym:$func), (tBLXi texternalsym:$func)>, - Requires<[IsThumb, HasV5T, IsNotIOS]>; -def : Tv5Pat<(ARMcall texternalsym:$func), (tBLXi_r9 texternalsym:$func)>, - Requires<[IsThumb, HasV5T, IsIOS]>; + Requires<[IsThumb, HasV5T]>; // Indirect calls to ARM routines def : Tv5Pat<(ARMcall GPR:$dst), (tBLXr GPR:$dst)>, - Requires<[IsThumb, HasV5T, IsNotIOS]>; -def : Tv5Pat<(ARMcall GPR:$dst), (tBLXr_r9 GPR:$dst)>, - Requires<[IsThumb, HasV5T, IsIOS]>; + Requires<[IsThumb, HasV5T]>; // zextload i1 -> zextload i8 def : T1Pat<(zextloadi1 t_addrmode_rrs1:$addr), @@ -1437,3 +1407,11 @@ def : tInstAlias<"cps$imod", (tCPS imod_op:$imod, 0)>; def : tInstAlias<"neg${s}${p} $Rd, $Rm", (tRSB tGPR:$Rd, s_cc_out:$s, tGPR:$Rm, pred:$p)>; + +// Implied destination operand forms for shifts. 
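The aliases just below let the two-operand spelling stand for the three-operand encoding, with the destination register doubling as the shifted source. Numerically (sketch; imm is assumed to be within the operand's legal range):

    #include <cstdint>

    uint32_t lslImplied(uint32_t rdm, unsigned imm) {
      return rdm << imm; // lsls rdm, rdm, #imm  (imm in [0,31])
    }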
+def : tInstAlias<"lsl${s}${p} $Rdm, $imm", + (tLSLri tGPR:$Rdm, cc_out:$s, tGPR:$Rdm, imm0_31:$imm, pred:$p)>; +def : tInstAlias<"lsr${s}${p} $Rdm, $imm", + (tLSRri tGPR:$Rdm, cc_out:$s, tGPR:$Rdm, imm_sr:$imm, pred:$p)>; +def : tInstAlias<"asr${s}${p} $Rdm, $imm", + (tASRri tGPR:$Rdm, cc_out:$s, tGPR:$Rdm, imm_sr:$imm, pred:$p)>; diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index 1f7edc1..e6fb9d5 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -89,20 +89,26 @@ def t2_so_imm_not : Operand<i32>, PatLeaf<(imm), [{ // t2_so_imm_neg - Match an immediate that is a negation of a t2_so_imm. def t2_so_imm_neg_asmoperand : AsmOperandClass { let Name = "T2SOImmNeg"; } def t2_so_imm_neg : Operand<i32>, PatLeaf<(imm), [{ - return ARM_AM::getT2SOImmVal(-((uint32_t)N->getZExtValue())) != -1; + int64_t Value = -(int)N->getZExtValue(); + return Value && ARM_AM::getT2SOImmVal(Value) != -1; }], t2_so_imm_neg_XFORM> { let ParserMatchClass = t2_so_imm_neg_asmoperand; } /// imm0_4095 predicate - True if the 32-bit immediate is in the range [0.4095]. -def imm0_4095 : Operand<i32>, - ImmLeaf<i32, [{ +def imm0_4095_asmoperand: ImmAsmOperand { let Name = "Imm0_4095"; } +def imm0_4095 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 4096; -}]>; +}]> { + let ParserMatchClass = imm0_4095_asmoperand; +} -def imm0_4095_neg : PatLeaf<(i32 imm), [{ +def imm0_4095_neg_asmoperand: AsmOperandClass { let Name = "Imm0_4095Neg"; } +def imm0_4095_neg : Operand<i32>, PatLeaf<(i32 imm), [{ return (uint32_t)(-N->getZExtValue()) < 4096; -}], imm_neg_XFORM>; +}], imm_neg_XFORM> { + let ParserMatchClass = imm0_4095_neg_asmoperand; +} def imm0_255_neg : PatLeaf<(i32 imm), [{ return (uint32_t)(-N->getZExtValue()) < 255; @@ -2871,6 +2877,8 @@ defm t2TEQ : T2I_cmp_irs<0b0100, "teq", // FIXME: should be able to write a pattern for ARMcmov, but can't use // a two-value operand where a dag node expects two operands. :( let neverHasSideEffects = 1 in { + +let isCommutable = 1 in def t2MOVCCr : t2PseudoInst<(outs rGPR:$Rd), (ins rGPR:$false, rGPR:$Rm, pred:$p), 4, IIC_iCMOVr, @@ -3189,6 +3197,7 @@ def t2B : T2I<(outs), (ins uncondbrtarget:$target), IIC_Br, let Inst{13} = target{17}; let Inst{21-16} = target{16-11}; let Inst{10-0} = target{10-0}; + let DecoderMethod = "DecodeT2BInstruction"; } let isNotDuplicable = 1, isIndirectBranch = 1 in { @@ -3268,37 +3277,19 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in { Requires<[IsThumb2, IsIOS]>; } -let isCall = 1, - // On non-IOS platforms R9 is callee-saved. - Defs = [LR], Uses = [SP] in { +let isCall = 1, Defs = [LR], Uses = [SP] in { // mov lr, pc; b if callee is marked noreturn to avoid confusing the // return stack predictor. def t2BMOVPCB_CALL : tPseudoInst<(outs), (ins t_bltarget:$func, variable_ops), 6, IIC_Br, [(ARMcall_nolink tglobaladdr:$func)]>, - Requires<[IsThumb, IsNotIOS]>; -} - -let isCall = 1, - // On IOS R9 is call-clobbered. - // R7 is marked as a use to prevent frame-pointer assignments from being - // moved above / below calls. - Defs = [LR], Uses = [R7, SP] in { - // mov lr, pc; b if callee is marked noreturn to avoid confusing the - // return stack predictor. 
- def t2BMOVPCBr9_CALL : tPseudoInst<(outs), - (ins t_bltarget:$func, variable_ops), - 6, IIC_Br, [(ARMcall_nolink tglobaladdr:$func)]>, - Requires<[IsThumb, IsIOS]>; + Requires<[IsThumb]>; } // Direct calls def : T2Pat<(ARMcall_nolink texternalsym:$func), (t2BMOVPCB_CALL texternalsym:$func)>, - Requires<[IsThumb, IsNotIOS]>; -def : T2Pat<(ARMcall_nolink texternalsym:$func), - (t2BMOVPCBr9_CALL texternalsym:$func)>, - Requires<[IsThumb, IsIOS]>; + Requires<[IsThumb]>; // IT block let Defs = [ITSTATE] in @@ -3966,6 +3957,19 @@ def : t2InstAlias<"add${s}${p} $Rdn, $ShiftedRm", (t2ADDrs GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_reg:$ShiftedRm, pred:$p, cc_out:$s)>; +// add w/ negative immediates is just a sub. +def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm", + (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, pred:$p, + cc_out:$s)>; +def : t2InstAlias<"add${p} $Rd, $Rn, $imm", + (t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>; +def : t2InstAlias<"add${s}${p} $Rdn, $imm", + (t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm_neg:$imm, pred:$p, + cc_out:$s)>; +def : t2InstAlias<"add${p} $Rdn, $imm", + (t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095_neg:$imm, pred:$p)>; + + // Aliases for SUB without the ".w" optional width specifier. def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $imm", (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>; @@ -3981,13 +3985,14 @@ def : t2InstAlias<"sub${s}${p} $Rdn, $imm", (t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>; def : t2InstAlias<"sub${p} $Rdn, $imm", (t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095:$imm, pred:$p)>; +def : t2InstAlias<"sub${s}${p}.w $Rdn, $Rm", + (t2SUBrr GPRnopc:$Rdn, GPRnopc:$Rdn, rGPR:$Rm, pred:$p, cc_out:$s)>; def : t2InstAlias<"sub${s}${p} $Rdn, $Rm", (t2SUBrr GPRnopc:$Rdn, GPRnopc:$Rdn, rGPR:$Rm, pred:$p, cc_out:$s)>; def : t2InstAlias<"sub${s}${p} $Rdn, $ShiftedRm", (t2SUBrs GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_reg:$ShiftedRm, pred:$p, cc_out:$s)>; - // Alias for compares without the ".w" optional width specifier. def : t2InstAlias<"cmn${p} $Rn, $Rm", (t2CMNzrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>; diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td index e9d5720..3600b88 100644 --- a/lib/Target/ARM/ARMInstrVFP.td +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -950,7 +950,7 @@ def VMLAD : ADbI<0b11100, 0b00, 0, 0, [(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm), (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP2,UseFPVMLx,NoVFP4]>; + Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>; def VMLAS : ASbIn<0b11100, 0b00, 0, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), @@ -958,7 +958,7 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0, [(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,NoVFP4]> { + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. 
let D = VFPNeonA8Domain; @@ -966,10 +966,10 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0, def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), (VMLAD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP2,UseFPVMLx,NoVFP4]>; + Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>; def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), (VMLAS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx,NoVFP4]>; + Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>; def VMLSD : ADbI<0b11100, 0b00, 1, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), @@ -977,7 +977,7 @@ def VMLSD : ADbI<0b11100, 0b00, 1, 0, [(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)), (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP2,UseFPVMLx,NoVFP4]>; + Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>; def VMLSS : ASbIn<0b11100, 0b00, 1, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), @@ -985,7 +985,7 @@ def VMLSS : ASbIn<0b11100, 0b00, 1, 0, [(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,NoVFP4]> { + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. let D = VFPNeonA8Domain; @@ -993,10 +993,10 @@ def VMLSS : ASbIn<0b11100, 0b00, 1, 0, def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), (VMLSD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP2,UseFPVMLx,NoVFP4]>; + Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>; def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), (VMLSS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,NoVFP4]>; + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; def VNMLAD : ADbI<0b11100, 0b01, 1, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), @@ -1004,7 +1004,7 @@ def VNMLAD : ADbI<0b11100, 0b01, 1, 0, [(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)), (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP2,UseFPVMLx,NoVFP4]>; + Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>; def VNMLAS : ASbI<0b11100, 0b01, 1, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), @@ -1012,7 +1012,7 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0, [(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,NoVFP4]> { + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. 
let D = VFPNeonA8Domain; @@ -1020,10 +1020,10 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0, def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin), (VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP2,UseFPVMLx,NoVFP4]>; + Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>; def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin), (VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,NoVFP4]>; + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; def VNMLSD : ADbI<0b11100, 0b01, 0, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), @@ -1031,14 +1031,14 @@ def VNMLSD : ADbI<0b11100, 0b01, 0, 0, [(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm), (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP2,UseFPVMLx,NoVFP4]>; + Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>; def VNMLSS : ASbI<0b11100, 0b01, 0, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), IIC_fpMAC32, "vnmls", ".f32\t$Sd, $Sn, $Sm", [(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,NoVFP4]> { + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines on A8. let D = VFPNeonA8Domain; @@ -1046,10 +1046,10 @@ def VNMLSS : ASbI<0b11100, 0b01, 0, 0, def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin), (VNMLSD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP2,UseFPVMLx,NoVFP4]>; + Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>; def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin), (VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,NoVFP4]>; + Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>; //===----------------------------------------------------------------------===// // Fused FP Multiply-Accumulate Operations. @@ -1060,7 +1060,7 @@ def VFMAD : ADbI<0b11101, 0b10, 0, 0, [(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm), (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP4,FPContractions]>; + Requires<[HasVFP4,UseFusedMAC]>; def VFMAS : ASbIn<0b11101, 0b10, 0, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), @@ -1068,17 +1068,25 @@ def VFMAS : ASbIn<0b11101, 0b10, 0, 0, [(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP4,DontUseNEONForFP,FPContractions]> { + Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines. 
} def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), (VFMAD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP4,FPContractions]>; + Requires<[HasVFP4,UseFusedMAC]>; def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), (VFMAS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP4,DontUseNEONForFP,FPContractions]>; + Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>; + +// Match @llvm.fma.* intrinsics +def : Pat<(f64 (fma DPR:$Ddin, DPR:$Dn, DPR:$Dm)), + (VFMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, + Requires<[HasVFP4]>; +def : Pat<(f32 (fma SPR:$Sdin, SPR:$Sn, SPR:$Sm)), + (VFMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, + Requires<[HasVFP4]>; def VFMSD : ADbI<0b11101, 0b10, 1, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), @@ -1086,7 +1094,7 @@ def VFMSD : ADbI<0b11101, 0b10, 1, 0, [(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)), (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP4,FPContractions]>; + Requires<[HasVFP4,UseFusedMAC]>; def VFMSS : ASbIn<0b11101, 0b10, 1, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), @@ -1094,17 +1102,33 @@ def VFMSS : ASbIn<0b11101, 0b10, 1, 0, [(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP4,DontUseNEONForFP,FPContractions]> { + Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines. } def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), (VFMSD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP4,FPContractions]>; + Requires<[HasVFP4,UseFusedMAC]>; def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), (VFMSS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP4,DontUseNEONForFP,FPContractions]>; + Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>; + +// Match @llvm.fma.* intrinsics +// (fma (fneg x), y, z) -> (vfms x, y, z) +def : Pat<(f64 (fma (fneg DPR:$Ddin), DPR:$Dn, DPR:$Dm)), + (VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, + Requires<[HasVFP4]>; +def : Pat<(f32 (fma (fneg SPR:$Sdin), SPR:$Sn, SPR:$Sm)), + (VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, + Requires<[HasVFP4]>; +// (fneg (fma x, (fneg y), z) -> (vfms x, y, z) +def : Pat<(fneg (f64 (fma DPR:$Ddin, (fneg DPR:$Dn), DPR:$Dm))), + (VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, + Requires<[HasVFP4]>; +def : Pat<(fneg (f32 (fma SPR:$Sdin, (fneg SPR:$Sn), SPR:$Sm))), + (VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, + Requires<[HasVFP4]>; def VFNMAD : ADbI<0b11101, 0b01, 1, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), @@ -1112,7 +1136,7 @@ def VFNMAD : ADbI<0b11101, 0b01, 1, 0, [(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)), (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP4,FPContractions]>; + Requires<[HasVFP4,UseFusedMAC]>; def VFNMAS : ASbI<0b11101, 0b01, 1, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), @@ -1120,17 +1144,33 @@ def VFNMAS : ASbI<0b11101, 0b01, 1, 0, [(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP4,DontUseNEONForFP,FPContractions]> { + Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines. 
} def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin), (VFNMAD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP4,FPContractions]>; + Requires<[HasVFP4,UseFusedMAC]>; def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin), (VFNMAS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP4,DontUseNEONForFP,FPContractions]>; + Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>; + +// Match @llvm.fma.* intrinsics +// (fneg (fma x, y, z)) -> (vfnma x, y, z) +def : Pat<(fneg (fma (f64 DPR:$Ddin), (f64 DPR:$Dn), (f64 DPR:$Dm))), + (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, + Requires<[HasVFP4]>; +def : Pat<(fneg (fma (f32 SPR:$Sdin), (f32 SPR:$Sn), (f32 SPR:$Sm))), + (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, + Requires<[HasVFP4]>; +// (fma (fneg x), y, (fneg z)) -> (vfnma x, y, z) +def : Pat<(f64 (fma (fneg DPR:$Ddin), DPR:$Dn, (fneg DPR:$Dm))), + (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, + Requires<[HasVFP4]>; +def : Pat<(f32 (fma (fneg SPR:$Sdin), SPR:$Sn, (fneg SPR:$Sm))), + (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, + Requires<[HasVFP4]>; def VFNMSD : ADbI<0b11101, 0b01, 0, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), @@ -1138,24 +1178,40 @@ def VFNMSD : ADbI<0b11101, 0b01, 0, 0, [(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm), (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP4,FPContractions]>; + Requires<[HasVFP4,UseFusedMAC]>; def VFNMSS : ASbI<0b11101, 0b01, 0, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), IIC_fpFMAC32, "vfnms", ".f32\t$Sd, $Sn, $Sm", [(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP4,DontUseNEONForFP,FPContractions]> { + Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines. } def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin), (VFNMSD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP4,FPContractions]>; + Requires<[HasVFP4,UseFusedMAC]>; def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin), (VFNMSS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP4,DontUseNEONForFP,FPContractions]>; + Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>; + +// Match @llvm.fma.* intrinsics +// (fneg (fma (fneg x), y, z)) -> (vnfms x, y, z) +def : Pat<(fneg (f64 (fma (fneg DPR:$Ddin), DPR:$Dn, DPR:$Dm))), + (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, + Requires<[HasVFP4]>; +def : Pat<(fneg (f32 (fma (fneg SPR:$Sdin), SPR:$Sn, SPR:$Sm))), + (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, + Requires<[HasVFP4]>; +// (fma x, (fneg y), z) -> (vnfms x, y, z) +def : Pat<(f64 (fma DPR:$Ddin, (fneg DPR:$Dn), DPR:$Dm)), + (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>, + Requires<[HasVFP4]>; +def : Pat<(f32 (fma SPR:$Sdin, (fneg SPR:$Sn), SPR:$Sm)), + (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, + Requires<[HasVFP4]>; //===----------------------------------------------------------------------===// // FP Conditional moves. 
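The blocks above do two separable things: the vmla/vmls-family patterns move from the ad-hoc NoVFP4 guard to the explicit DontUseFusedMAC predicate (and the vfma family to UseFusedMAC), and new patterns lower the @llvm.fma.* intrinsic directly, guarded only by HasVFP4 since the intrinsic itself requests single-rounding semantics. The fneg-folding comments rely on sign symmetries of fma(x, y, z) = x*y + z that hold exactly in IEEE arithmetic. A minimal standalone C++ sketch of that algebra, using std::fma with illustrative values (this demonstrates the identities only, not the exact TableGen operand mapping):

#include <cassert>
#include <cmath>

int main() {
  double x = 1.5, y = -2.25, z = 0.625;
  // A negation can move between the two multiplicands: both sides are z - x*y.
  assert(std::fma(-x, y, z) == std::fma(x, -y, z));
  // Negating the whole fma equals negating product and addend: -(x*y + z).
  assert(-std::fma(x, y, z) == std::fma(-x, y, -z));
  // Likewise x*y - z can be written as a negated fma of the negated operand.
  assert(-std::fma(-x, y, z) == std::fma(x, y, -z));
  return 0;
}

The identities are exact because fneg only flips the sign bit and IEEE round-to-nearest commutes with negation; that is what lets the selector fold an fneg into the vfms/vfnma/vfnms forms instead of emitting a separate vneg.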
diff --git a/lib/Target/ARM/ARMJITInfo.cpp b/lib/Target/ARM/ARMJITInfo.cpp index 753e578..c5db211 100644 --- a/lib/Target/ARM/ARMJITInfo.cpp +++ b/lib/Target/ARM/ARMJITInfo.cpp @@ -13,7 +13,7 @@ #define DEBUG_TYPE "jit" #include "ARMJITInfo.h" -#include "ARMInstrInfo.h" +#include "ARM.h" #include "ARMConstantPoolValue.h" #include "ARMRelocations.h" #include "ARMSubtarget.h" diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 0f6dc04..9ef2ace 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -15,8 +15,8 @@ #define DEBUG_TYPE "arm-ldst-opt" #include "ARM.h" #include "ARMBaseInstrInfo.h" +#include "ARMBaseRegisterInfo.h" #include "ARMMachineFunctionInfo.h" -#include "ARMRegisterInfo.h" #include "MCTargetDesc/ARMAddressingModes.h" #include "llvm/DerivedTypes.h" #include "llvm/Function.h" @@ -93,7 +93,9 @@ namespace { bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, int Offset, unsigned Base, bool BaseKill, int Opcode, ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch, - DebugLoc dl, SmallVector<std::pair<unsigned, bool>, 8> &Regs); + DebugLoc dl, + ArrayRef<std::pair<unsigned, bool> > Regs, + ArrayRef<unsigned> ImpDefs); void MergeOpsUpdate(MachineBasicBlock &MBB, MemOpQueue &MemOps, unsigned memOpsBegin, @@ -282,7 +284,8 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, int Offset, unsigned Base, bool BaseKill, int Opcode, ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch, DebugLoc dl, - SmallVector<std::pair<unsigned, bool>, 8> &Regs) { + ArrayRef<std::pair<unsigned, bool> > Regs, + ArrayRef<unsigned> ImpDefs) { // Only a single register to load / store. Don't bother. unsigned NumRegs = Regs.size(); if (NumRegs <= 1) @@ -350,6 +353,10 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, MIB = MIB.addReg(Regs[i].first, getDefRegState(isDef) | getKillRegState(Regs[i].second)); + // Add implicit defs for super-registers. + for (unsigned i = 0, e = ImpDefs.size(); i != e; ++i) + MIB.addReg(ImpDefs[i], RegState::ImplicitDefine); + return true; } @@ -384,19 +391,29 @@ void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB, } SmallVector<std::pair<unsigned, bool>, 8> Regs; + SmallVector<unsigned, 8> ImpDefs; for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) { unsigned Reg = memOps[i].Reg; // If we are inserting the merged operation after an operation that // uses the same register, make sure to transfer any kill flag. bool isKill = memOps[i].isKill || KilledRegs.count(Reg); Regs.push_back(std::make_pair(Reg, isKill)); + + // Collect any implicit defs of super-registers. They must be preserved. + for (MIOperands MO(memOps[i].MBBI); MO.isValid(); ++MO) { + if (!MO->isReg() || !MO->isDef() || !MO->isImplicit() || MO->isDead()) + continue; + unsigned DefReg = MO->getReg(); + if (std::find(ImpDefs.begin(), ImpDefs.end(), DefReg) == ImpDefs.end()) + ImpDefs.push_back(DefReg); + } } // Try to do the merge. MachineBasicBlock::iterator Loc = memOps[insertAfter].MBBI; ++Loc; if (!MergeOps(MBB, Loc, Offset, Base, BaseKill, Opcode, - Pred, PredReg, Scratch, dl, Regs)) + Pred, PredReg, Scratch, dl, Regs, ImpDefs)) return; // Merge succeeded, update records. 
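The ARMLoadStoreOptimizer hunks above change MergeOps to take its register list, plus a new list of implicit super-register defs, as ArrayRef parameters, which accept any contiguous container without hard-coding a SmallVector small size; MergeOpsUpdate now gathers those implicit defs with a std::find-based dedup so the merged LDM/STM can re-declare them via RegState::ImplicitDefine. A self-contained C++ sketch of that collect-and-deduplicate step, with plain std:: containers and a made-up MemOp type standing in for the machine-IR types:

#include <algorithm>
#include <cstdio>
#include <vector>

// Hypothetical stand-in for one load/store in the merge candidate queue;
// only its implicit super-register defs matter for this sketch.
struct MemOp {
  std::vector<unsigned> ImplicitDefs;
};

// Mirrors the collection loop: record each implicit def exactly once so the
// merged instruction can preserve every super-register definition.
static std::vector<unsigned> collectImpDefs(const std::vector<MemOp> &MemOps) {
  std::vector<unsigned> ImpDefs;
  for (unsigned i = 0, e = MemOps.size(); i != e; ++i) {
    const std::vector<unsigned> &Defs = MemOps[i].ImplicitDefs;
    for (unsigned j = 0, f = Defs.size(); j != f; ++j)
      if (std::find(ImpDefs.begin(), ImpDefs.end(), Defs[j]) == ImpDefs.end())
        ImpDefs.push_back(Defs[j]);
  }
  return ImpDefs;
}

int main() {
  // Two candidates implicitly define the same super-register (think two
  // D-register loads that together cover one Q register); a third defines
  // another. Each register must appear exactly once in the result.
  MemOp A, B, C;
  A.ImplicitDefs.push_back(300);
  B.ImplicitDefs.push_back(300);
  C.ImplicitDefs.push_back(301);
  std::vector<MemOp> Ops;
  Ops.push_back(A);
  Ops.push_back(B);
  Ops.push_back(C);
  std::vector<unsigned> ImpDefs = collectImpDefs(Ops);
  for (unsigned i = 0; i != ImpDefs.size(); ++i)
    std::printf("implicit-def %u\n", ImpDefs[i]); // prints 300, then 301
  return 0;
}

Dropping a super-register implicit def during the merge would make the wider register look dead to later passes, so this is a correctness fix rather than tidying; the linear std::find is fine because a merge batch only ever holds a handful of registers.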
@@ -537,7 +554,7 @@ static bool isMatchingDecrement(MachineInstr *MI, unsigned Base, if (!(MI->getOperand(0).getReg() == Base && MI->getOperand(1).getReg() == Base && (MI->getOperand(2).getImm()*Scale) == Bytes && - llvm::getInstrPredicate(MI, MyPredReg) == Pred && + getInstrPredicate(MI, MyPredReg) == Pred && MyPredReg == PredReg)) return false; @@ -570,7 +587,7 @@ static bool isMatchingIncrement(MachineInstr *MI, unsigned Base, if (!(MI->getOperand(0).getReg() == Base && MI->getOperand(1).getReg() == Base && (MI->getOperand(2).getImm()*Scale) == Bytes && - llvm::getInstrPredicate(MI, MyPredReg) == Pred && + getInstrPredicate(MI, MyPredReg) == Pred && MyPredReg == PredReg)) return false; @@ -701,7 +718,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, bool BaseKill = MI->getOperand(0).isKill(); unsigned Bytes = getLSMultipleTransferSize(MI); unsigned PredReg = 0; - ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg); + ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); int Opcode = MI->getOpcode(); DebugLoc dl = MI->getDebugLoc(); @@ -854,7 +871,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, return false; unsigned PredReg = 0; - ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg); + ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); bool DoMerge = false; ARM_AM::AddrOpc AddSub = ARM_AM::add; unsigned NewOpc = 0; @@ -1112,7 +1129,7 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef(); int OffImm = getMemoryOpOffset(MI); unsigned PredReg = 0; - ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg); + ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); if (OddRegNum > EvenRegNum && OffImm == 0) { // Ascending register numbers and no offset. It's safe to change it to a @@ -1143,6 +1160,11 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, unsigned NewOpc = (isLd) ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12) : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12); + // Be extra careful for thumb2. t2LDRi8 can't reference a zero offset, + // so adjust and use t2LDRi12 here for that. + unsigned NewOpc2 = (isLd) + ? (isT2 ? (OffImm+4 < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12) + : (isT2 ? (OffImm+4 < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12); DebugLoc dl = MBBI->getDebugLoc(); // If this is a load and base register is killed, it may have been // re-defed by the load, make sure the first load does not clobber it. @@ -1150,11 +1172,13 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, (BaseKill || OffKill) && (TRI->regsOverlap(EvenReg, BaseReg))) { assert(!TRI->regsOverlap(OddReg, BaseReg)); - InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc, + InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2, OddReg, OddDeadKill, false, BaseReg, false, BaseUndef, false, OffUndef, Pred, PredReg, TII, isT2); NewBBI = llvm::prior(MBBI); + if (isT2 && NewOpc == ARM::t2LDRi8 && OffImm+4 >= 0) + NewOpc = ARM::t2LDRi12; InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc, EvenReg, EvenDeadKill, false, BaseReg, BaseKill, BaseUndef, OffKill, OffUndef, @@ -1167,12 +1191,16 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, EvenDeadKill = false; OddDeadKill = true; } + // Never kill the base register in the first instruction. 
+ // <rdar://problem/11101911> + if (EvenReg == BaseReg) + EvenDeadKill = false; InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc, EvenReg, EvenDeadKill, EvenUndef, BaseReg, false, BaseUndef, false, OffUndef, Pred, PredReg, TII, isT2); NewBBI = llvm::prior(MBBI); - InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc, + InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2, OddReg, OddDeadKill, OddUndef, BaseReg, BaseKill, BaseUndef, OffKill, OffUndef, Pred, PredReg, TII, isT2); @@ -1223,7 +1251,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { bool isKill = MO.isDef() ? false : MO.isKill(); unsigned Base = MBBI->getOperand(1).getReg(); unsigned PredReg = 0; - ARMCC::CondCodes Pred = llvm::getInstrPredicate(MBBI, PredReg); + ARMCC::CondCodes Pred = getInstrPredicate(MBBI, PredReg); int Offset = getMemoryOpOffset(MBBI); // Watch out for: // r4 := ldr [r5] @@ -1599,7 +1627,7 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, if (EvenReg == OddReg) return false; BaseReg = Op0->getOperand(1).getReg(); - Pred = llvm::getInstrPredicate(Op0, PredReg); + Pred = getInstrPredicate(Op0, PredReg); dl = Op0->getDebugLoc(); return true; } @@ -1796,7 +1824,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { if (!isMemoryOp(MI)) continue; unsigned PredReg = 0; - if (llvm::getInstrPredicate(MI, PredReg) != ARMCC::AL) + if (getInstrPredicate(MI, PredReg) != ARMCC::AL) continue; int Opc = MI->getOpcode(); diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td index 1327fb8..1466e98 100644 --- a/lib/Target/ARM/ARMRegisterInfo.td +++ b/lib/Target/ARM/ARMRegisterInfo.td @@ -314,7 +314,8 @@ def TuplesOE2D : RegisterTuples<[dsub_0, dsub_1], def DPair : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128, (interleave QPR, TuplesOE2D)> { // Allocate starting at non-VFP2 registers D16-D31 first. - let AltOrders = [(rotl DPair, 16)]; + // Prefer even-odd pairs as they are easier to copy. 
+ let AltOrders = [(add (rotl QPR, 8), (rotl DPair, 16))]; let AltOrderSelect = [{ return 1; }]; } diff --git a/lib/Target/ARM/ARMScheduleA8.td b/lib/Target/ARM/ARMScheduleA8.td index 8d86c01..8b1fb93 100644 --- a/lib/Target/ARM/ARMScheduleA8.td +++ b/lib/Target/ARM/ARMScheduleA8.td @@ -324,6 +324,15 @@ def CortexA8Itineraries : ProcessorItineraries< InstrStage<19, [A8_NPipe], 0>, InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>, // + // Single-precision Fused FP MAC + InstrItinData<IIC_fpFMAC32, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [7, 2, 1, 1]>, + // + // Double-precision Fused FP MAC + InstrItinData<IIC_fpFMAC64, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<19, [A8_NPipe], 0>, + InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>, + // // Single-precision FP DIV InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<20, [A8_NPipe], 0>, @@ -860,6 +869,16 @@ def CortexA8Itineraries : ProcessorItineraries< InstrItinData<IIC_VMACQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>, // + // Double-register Fused FP Multiply-Accumulate + InstrItinData<IIC_VFMACD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<1, [A8_NPipe]>], [9, 3, 2, 2]>, + // + // Quad-register Fused FP Multiply-Accumulate + // Result written in N9, but that is relative to the last cycle of a multicycle operation, + // so we use 10 for those cases + InstrItinData<IIC_VFMACQ, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, + InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>, + // // Double-register Reciprocal Step InstrItinData<IIC_VRECSD, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, InstrStage<1, [A8_NPipe]>], [9, 2, 2]>, diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td index 49fedf6..0d710cc 100644 --- a/lib/Target/ARM/ARMScheduleA9.td +++ b/lib/Target/ARM/ARMScheduleA9.td @@ -604,6 +604,22 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<2, [A9_NPipe]>], [9, 1, 1, 1]>, // + // Single-precision Fused FP MAC + InstrItinData<IIC_fpFMAC32, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<9, [A9_DRegsN], 0, Reserved>, + InstrStage<1, [A9_NPipe]>], + [8, 1, 1, 1]>, + // + // Double-precision Fused FP MAC + InstrItinData<IIC_fpFMAC64, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsVFP], 0, Required>, + InstrStage<10, [A9_DRegsN], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [9, 1, 1, 1]>, + // // Single-precision FP DIV InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, @@ -1697,6 +1713,26 @@ def CortexA9Itineraries : ProcessorItineraries< InstrStage<4, [A9_NPipe]>], [8, 4, 2, 1]>, // + // Double-register Fused FP Multiply-Accumulate + InstrItinData<IIC_VFMACD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 7 cycles + InstrStage<8, [A9_DRegsVFP], 0, Reserved>, + InstrStage<2, [A9_NPipe]>], + [6, 3, 2, 1]>, + // + // Quad-register Fused FP Multiply-Accumulate + // Result written in N9, but that is relative to the last cycle of a multicycle operation, + // so we use 10 for those cases + InstrItinData<IIC_VFMACQ, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, + InstrStage<1, [A9_MUX0], 0>, + InstrStage<1, [A9_DRegsN], 0, Required>, + // Extra latency cycles since wbck is 9 cycles + InstrStage<10, [A9_DRegsVFP], 0, Reserved>, + InstrStage<4, 
[A9_NPipe]>], + [8, 4, 2, 1]>, + // // Double-register Reciprocal Step InstrItinData<IIC_VRECSD, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, InstrStage<1, [A9_MUX0], 0>, diff --git a/lib/Target/ARM/ARMScheduleV6.td b/lib/Target/ARM/ARMScheduleV6.td index 4d959f5..0ace9bc 100644 --- a/lib/Target/ARM/ARMScheduleV6.td +++ b/lib/Target/ARM/ARMScheduleV6.td @@ -243,6 +243,12 @@ def ARMV6Itineraries : ProcessorItineraries< // Double-precision FP MAC InstrItinData<IIC_fpMAC64 , [InstrStage<2, [V6_Pipe]>], [9, 2, 2, 2]>, // + // Single-precision Fused FP MAC + InstrItinData<IIC_fpFMAC32, [InstrStage<1, [V6_Pipe]>], [9, 2, 2, 2]>, + // + // Double-precision Fused FP MAC + InstrItinData<IIC_fpFMAC64, [InstrStage<2, [V6_Pipe]>], [9, 2, 2, 2]>, + // // Single-precision FP DIV InstrItinData<IIC_fpDIV32 , [InstrStage<15, [V6_Pipe]>], [20, 2, 2]>, // diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index 1e8cda5..ca172ed 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -16,7 +16,6 @@ #include "llvm/GlobalValue.h" #include "llvm/Target/TargetSubtargetInfo.h" #include "llvm/Support/CommandLine.h" -#include "llvm/ADT/SmallVector.h" #define GET_SUBTARGETINFO_TARGET_DESC #define GET_SUBTARGETINFO_CTOR @@ -49,7 +48,6 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU, , HasVFPv3(false) , HasVFPv4(false) , HasNEON(false) - , HasNEON2(false) , UseNEONForSinglePrecisionFP(false) , SlowFPVMLx(false) , HasVMLxForwarding(false) diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index 3d9c03d..e72b06f 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -45,13 +45,12 @@ protected: bool HasV6T2Ops; bool HasV7Ops; - /// HasVFPv2, HasVFPv3, HasVFPv4, HasNEON, HasNEONVFPv4 - Specify what + /// HasVFPv2, HasVFPv3, HasVFPv4, HasNEON - Specify what /// floating point ISAs are supported. bool HasVFPv2; bool HasVFPv3; bool HasVFPv4; bool HasNEON; - bool HasNEON2; /// UseNEONForSinglePrecisionFP - if the NEONFP attribute has been /// specified. Use the method useNEONForSinglePrecisionFP() to @@ -205,7 +204,6 @@ protected: bool hasVFP3() const { return HasVFPv3; } bool hasVFP4() const { return HasVFPv4; } bool hasNEON() const { return HasNEON; } - bool hasNEON2() const { return HasNEON2 || (HasNEON && HasVFPv4); } bool useNEONForSinglePrecisionFP() const { return hasNEON() && UseNEONForSinglePrecisionFP; } diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index 44229ad..047efc2 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -158,8 +158,10 @@ bool ARMPassConfig::addPreRegAlloc() { bool ARMPassConfig::addPreSched2() { // FIXME: temporarily disabling load / store optimization pass for Thumb1. if (getOptLevel() != CodeGenOpt::None) { - if (!getARMSubtarget().isThumb1Only()) + if (!getARMSubtarget().isThumb1Only()) { PM.add(createARMLoadStoreOptimizationPass()); + printAndVerify("After ARM load / store optimizer"); + } if (getARMSubtarget().hasNEON()) PM.add(createExecutionDependencyFixPass(&ARM::DPRRegClass)); } @@ -192,7 +194,8 @@ bool ARMPassConfig::addPreEmitPass() { return true; } -bool ARMBaseTargetMachine::addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE) { +bool ARMBaseTargetMachine::addCodeEmitter(PassManagerBase &PM, + JITCodeEmitter &JCE) { // Machine code emitter pass for ARM.
PM.add(createARMJITCodeEmitterPass(*this, JCE)); return false; diff --git a/lib/Target/ARM/AsmParser/ARMAsmLexer.cpp b/lib/Target/ARM/AsmParser/ARMAsmLexer.cpp index eb8aaf2..fda8536 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmLexer.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmLexer.cpp @@ -17,8 +17,6 @@ #include "llvm/Support/TargetRegistry.h" -#include "llvm/ADT/OwningPtr.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringSwitch.h" #include <string> diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 911eb13..2c53e3f 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -82,8 +82,14 @@ class ARMAsmParser : public MCTargetAsmParser { MCAsmParser &getParser() const { return Parser; } MCAsmLexer &getLexer() const { return Parser.getLexer(); } - void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); } - bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); } + bool Warning(SMLoc L, const Twine &Msg, + ArrayRef<SMRange> Ranges = ArrayRef<SMRange>()) { + return Parser.Warning(L, Msg, Ranges); + } + bool Error(SMLoc L, const Twine &Msg, + ArrayRef<SMRange> Ranges = ArrayRef<SMRange>()) { + return Parser.Error(L, Msg, Ranges); + } int tryParseRegister(); bool tryParseRegisterWithWriteBack(SmallVectorImpl<MCParsedAsmOperand*> &); @@ -478,6 +484,8 @@ public: /// getEndLoc - Get the location of the last token of this operand. SMLoc getEndLoc() const { return EndLoc; } + SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); } + ARMCC::CondCodes getCondCode() const { assert(Kind == k_CondCode && "Invalid access!"); return CC.Val; @@ -579,6 +587,14 @@ public: int64_t Value = CE->getValue(); return ((Value & 3) == 0) && Value >= 0 && Value <= 508; } + bool isImm0_508s4Neg() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = -CE->getValue(); + // explicitly exclude zero. we want that to use the normal 0_508 version. + return ((Value & 3) == 0) && Value > 0 && Value <= 508; + } bool isImm0_255() const { if (!isImm()) return false; const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); @@ -586,6 +602,20 @@ public: int64_t Value = CE->getValue(); return Value >= 0 && Value < 256; } + bool isImm0_4095() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = CE->getValue(); + return Value >= 0 && Value < 4096; + } + bool isImm0_4095Neg() const { + if (!isImm()) return false; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + int64_t Value = -CE->getValue(); + return Value > 0 && Value < 4096; + } bool isImm0_1() const { if (!isImm()) return false; const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); @@ -782,7 +812,9 @@ public: const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); if (!CE) return false; int64_t Value = CE->getValue(); - return ARM_AM::getSOImmVal(-Value) != -1; + // Only use this when not representable as a plain so_imm. 
+ return ARM_AM::getSOImmVal(Value) == -1 && + ARM_AM::getSOImmVal(-Value) != -1; } bool isT2SOImm() const { if (!isImm()) return false; @@ -803,7 +835,9 @@ public: const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); if (!CE) return false; int64_t Value = CE->getValue(); - return ARM_AM::getT2SOImmVal(-Value) != -1; + // Only use this when not representable as a plain so_imm. + return ARM_AM::getT2SOImmVal(Value) == -1 && + ARM_AM::getT2SOImmVal(-Value) != -1; } bool isSetEndImm() const { if (!isImm()) return false; @@ -1495,6 +1529,14 @@ public: Inst.addOperand(MCOperand::CreateImm(CE->getValue() / 4)); } + void addImm0_508s4NegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // The immediate is scaled by four in the encoding and is stored + // in the MCInst as such. Lop off the low two bits here. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::CreateImm(-(CE->getValue() / 4))); + } + void addImm0_508s4Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); // The immediate is scaled by four in the encoding and is stored @@ -1553,6 +1595,14 @@ public: Inst.addOperand(MCOperand::CreateImm(-CE->getValue())); } + void addImm0_4095NegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // The operand is actually an imm0_4095, but we have its + // negation in the assembly source, so twiddle it here. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::CreateImm(-CE->getValue())); + } + void addARMSOImmNotOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); // The operand is actually a so_imm, but we have its bitwise @@ -3324,7 +3374,8 @@ parseMSRMaskOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { FlagsVal = 8; // No flag } } else if (SpecReg == "cpsr" || SpecReg == "spsr") { - if (Flags == "all") // cpsr_all is an alias for cpsr_fc + // cpsr_all is an alias for cpsr_fc, as is plain cpsr. + if (Flags == "all" || Flags == "") Flags = "fc"; for (int i = 0, e = Flags.size(); i != e; ++i) { unsigned Flag = StringSwitch<unsigned>(Flags.substr(i, 1)) @@ -4475,22 +4526,26 @@ bool ARMAsmParser::parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands, case AsmToken::Dollar: case AsmToken::Hash: { // #42 -> immediate. 
- // TODO: ":lower16:" and ":upper16:" modifiers after # before immediate S = Parser.getTok().getLoc(); Parser.Lex(); - bool isNegative = Parser.getTok().is(AsmToken::Minus); - const MCExpr *ImmVal; - if (getParser().ParseExpression(ImmVal)) - return true; - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ImmVal); - if (CE) { - int32_t Val = CE->getValue(); - if (isNegative && Val == 0) - ImmVal = MCConstantExpr::Create(INT32_MIN, getContext()); + + if (Parser.getTok().isNot(AsmToken::Colon)) { + bool isNegative = Parser.getTok().is(AsmToken::Minus); + const MCExpr *ImmVal; + if (getParser().ParseExpression(ImmVal)) + return true; + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ImmVal); + if (CE) { + int32_t Val = CE->getValue(); + if (isNegative && Val == 0) + ImmVal = MCConstantExpr::Create(INT32_MIN, getContext()); + } + E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); + Operands.push_back(ARMOperand::CreateImm(ImmVal, S, E)); + return false; } - E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); - Operands.push_back(ARMOperand::CreateImm(ImmVal, S, E)); - return false; + // w/ a ':' after the '#', it's just like a plain ':'. + // FALLTHROUGH } case AsmToken::Colon: { // ":lower16:" and ":upper16:" expression prefixes @@ -4616,6 +4671,7 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic, Mnemonic == "fmrs" || Mnemonic == "fsqrts" || Mnemonic == "fsubs" || Mnemonic == "fsts" || Mnemonic == "fcpys" || Mnemonic == "fdivs" || Mnemonic == "fmuls" || Mnemonic == "fcmps" || Mnemonic == "fcmpzs" || + Mnemonic == "vfms" || Mnemonic == "vfnms" || (Mnemonic == "movs" && isThumb()))) { Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 1); CarrySetting = true; @@ -4659,6 +4715,7 @@ getMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet, Mnemonic == "orr" || Mnemonic == "mvn" || Mnemonic == "rsb" || Mnemonic == "rsc" || Mnemonic == "orn" || Mnemonic == "sbc" || Mnemonic == "eor" || Mnemonic == "neg" || + Mnemonic == "vfm" || Mnemonic == "vfnm" || (!isThumb() && (Mnemonic == "smull" || Mnemonic == "mov" || Mnemonic == "mla" || Mnemonic == "smlal" || Mnemonic == "umlal" || Mnemonic == "umull"))) { @@ -4727,7 +4784,7 @@ bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic, static_cast<ARMOperand*>(Operands[4])->isReg() && static_cast<ARMOperand*>(Operands[4])->getReg() == ARM::SP && static_cast<ARMOperand*>(Operands[1])->getReg() == 0 && - (static_cast<ARMOperand*>(Operands[5])->isReg() || + ((Mnemonic == "add" &&static_cast<ARMOperand*>(Operands[5])->isReg()) || static_cast<ARMOperand*>(Operands[5])->isImm0_1020s4())) return true; // For Thumb2, add/sub immediate does not have a cc_out operand for the @@ -4811,7 +4868,10 @@ bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic, (Operands.size() == 5 || Operands.size() == 6) && static_cast<ARMOperand*>(Operands[3])->isReg() && static_cast<ARMOperand*>(Operands[3])->getReg() == ARM::SP && - static_cast<ARMOperand*>(Operands[1])->getReg() == 0) + static_cast<ARMOperand*>(Operands[1])->getReg() == 0 && + (static_cast<ARMOperand*>(Operands[4])->isImm() || + (Operands.size() == 6 && + static_cast<ARMOperand*>(Operands[5])->isImm()))) return true; return false; @@ -6602,6 +6662,37 @@ processInstruction(MCInst &Inst, return true; } + // Handle encoding choice for the shift-immediate instructions. 
+ case ARM::t2LSLri: + case ARM::t2LSRri: + case ARM::t2ASRri: { + if (isARMLowRegister(Inst.getOperand(0).getReg()) && + Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() && + Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) && + !(static_cast<ARMOperand*>(Operands[3])->isToken() && + static_cast<ARMOperand*>(Operands[3])->getToken() == ".w")) { + unsigned NewOpc; + switch (Inst.getOpcode()) { + default: llvm_unreachable("unexpected opcode"); + case ARM::t2LSLri: NewOpc = ARM::tLSLri; break; + case ARM::t2LSRri: NewOpc = ARM::tLSRri; break; + case ARM::t2ASRri: NewOpc = ARM::tASRri; break; + } + // The Thumb1 operands aren't in the same order. Awesome, eh? + MCInst TmpInst; + TmpInst.setOpcode(NewOpc); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(5)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(Inst.getOperand(2)); + TmpInst.addOperand(Inst.getOperand(3)); + TmpInst.addOperand(Inst.getOperand(4)); + Inst = TmpInst; + return true; + } + return false; + } + // Handle the Thumb2 mode MOV complex aliases. case ARM::t2MOVsr: case ARM::t2MOVSsr: { @@ -6833,7 +6924,7 @@ processInstruction(MCInst &Inst, // explicitly specified. From the ARM ARM: "Encoding T1 is preferred // to encoding T2 if <Rd> is specified and encoding T2 is preferred // to encoding T1 if <Rd> is omitted." - if (Inst.getOperand(3).getImm() < 8 && Operands.size() == 6) { + if ((unsigned)Inst.getOperand(3).getImm() < 8 && Operands.size() == 6) { Inst.setOpcode(ARM::tADDi3); return true; } @@ -6843,11 +6934,37 @@ processInstruction(MCInst &Inst, // explicitly specified. From the ARM ARM: "Encoding T1 is preferred // to encoding T2 if <Rd> is specified and encoding T2 is preferred // to encoding T1 if <Rd> is omitted." - if (Inst.getOperand(3).getImm() < 8 && Operands.size() == 6) { + if ((unsigned)Inst.getOperand(3).getImm() < 8 && Operands.size() == 6) { Inst.setOpcode(ARM::tSUBi3); return true; } break; + case ARM::t2ADDri: + case ARM::t2SUBri: { + // If the destination and first source operand are the same, and + // the flags are compatible with the current IT status, use encoding T2 + // instead of T3. For compatibility with the system 'as'. Make sure the + // wide encoding wasn't explicit. + if (Inst.getOperand(0).getReg() != Inst.getOperand(1).getReg() || + !isARMLowRegister(Inst.getOperand(0).getReg()) || + (unsigned)Inst.getOperand(2).getImm() > 255 || + ((!inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR) || + (inITBlock() && Inst.getOperand(5).getReg() != 0)) || + (static_cast<ARMOperand*>(Operands[3])->isToken() && + static_cast<ARMOperand*>(Operands[3])->getToken() == ".w")) + break; + MCInst TmpInst; + TmpInst.setOpcode(Inst.getOpcode() == ARM::t2ADDri ? + ARM::tADDi8 : ARM::tSUBi8); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(5)); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(2)); + TmpInst.addOperand(Inst.getOperand(3)); + TmpInst.addOperand(Inst.getOperand(4)); + Inst = TmpInst; + return true; + } case ARM::t2ADDrr: { // If the destination and first source operand are the same, and // there's no setting of the flags, use encoding T2 instead of T3. @@ -6964,7 +7081,7 @@ processInstruction(MCInst &Inst, // If we can use the 16-bit encoding and the user didn't explicitly // request the 32-bit variant, transform it here. 
if (isARMLowRegister(Inst.getOperand(0).getReg()) && - Inst.getOperand(1).getImm() <= 255 && + (unsigned)Inst.getOperand(1).getImm() <= 255 && ((!inITBlock() && Inst.getOperand(2).getImm() == ARMCC::AL && Inst.getOperand(4).getReg() == ARM::CPSR) || (inITBlock() && Inst.getOperand(4).getReg() == 0)) && @@ -7216,7 +7333,8 @@ MatchAndEmitInstruction(SMLoc IDLoc, return Error(ErrorLoc, "invalid operand for instruction"); } case Match_MnemonicFail: - return Error(IDLoc, "invalid instruction"); + return Error(IDLoc, "invalid instruction", + ((ARMOperand*)Operands[0])->getLocRange()); case Match_ConversionFail: // The converter function will have already emited a diagnostic. return true; diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index ce4587b..912935d 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -9,8 +9,6 @@ #define DEBUG_TYPE "arm-disassembler" -#include "ARM.h" -#include "ARMSubtarget.h" #include "MCTargetDesc/ARMAddressingModes.h" #include "MCTargetDesc/ARMMCExpr.h" #include "MCTargetDesc/ARMBaseInfo.h" @@ -20,6 +18,7 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MemoryObject.h" #include "llvm/Support/ErrorHandling.h" @@ -103,228 +102,232 @@ static bool Check(DecodeStatus &Out, DecodeStatus In) { // Forward declare these because the autogenerated code will reference them. // Definitions are further down. -static DecodeStatus DecodeGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeGPRnopcRegisterClass(llvm::MCInst &Inst, +static DecodeStatus DecodeGPRnopcRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodetGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodetGPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodetcGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecoderGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeSPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeDPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeDPR_8RegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeDPR_8RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeDPR_VFP2RegisterClass(llvm::MCInst &Inst, +static DecodeStatus DecodeDPR_VFP2RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeQPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeQPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); 
-static DecodeStatus DecodeDPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeDPairRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeDPairSpacedRegisterClass(llvm::MCInst &Inst, +static DecodeStatus DecodeDPairSpacedRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); -static DecodeStatus DecodePredicateOperand(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodePredicateOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeCCOutOperand(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeCCOutOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeSOImmOperand(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeSOImmOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeRegListOperand(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeSPRRegListOperand(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeSPRRegListOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeDPRRegListOperand(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeBitfieldMaskOperand(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeBitfieldMaskOperand(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeCopMemInstruction(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeCopMemInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeAddrMode2IdxInstruction(llvm::MCInst &Inst, +static DecodeStatus DecodeAddrMode2IdxInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeSORegMemOperand(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeSORegMemOperand(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeAddrMode3Instruction(llvm::MCInst &Inst,unsigned Insn, +static DecodeStatus DecodeAddrMode3Instruction(MCInst &Inst,unsigned Insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeSORegImmOperand(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeSORegImmOperand(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeSORegRegOperand(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeSORegRegOperand(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeMemMultipleWritebackInstruction(llvm::MCInst & Inst, +static DecodeStatus DecodeMemMultipleWritebackInstruction(MCInst & Inst, unsigned Insn, uint64_t Adddress, const void *Decoder); -static DecodeStatus DecodeT2MOVTWInstruction(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeT2MOVTWInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeArmMOVTWInstruction(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeArmMOVTWInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeSMLAInstruction(llvm::MCInst &Inst, unsigned Insn, 
+static DecodeStatus DecodeSMLAInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeCPSInstruction(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeCPSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeT2CPSInstruction(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeT2CPSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeAddrModeImm12Operand(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeAddrModeImm12Operand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeAddrMode5Operand(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeAddrMode5Operand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeAddrMode7Operand(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeAddrMode7Operand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeBranchImmInstruction(llvm::MCInst &Inst,unsigned Insn, +static DecodeStatus DecodeT2BInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeAddrMode6Operand(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeBranchImmInstruction(MCInst &Inst,unsigned Insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeVLDInstruction(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeAddrMode6Operand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeVSTInstruction(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeVLDInstruction(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeVLD1DupInstruction(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeVSTInstruction(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeVLD2DupInstruction(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeVLD1DupInstruction(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeVLD3DupInstruction(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeVLD2DupInstruction(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeVLD4DupInstruction(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeVLD3DupInstruction(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeNEONModImmInstruction(llvm::MCInst &Inst,unsigned Val, +static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeVSHLMaxInstruction(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeNEONModImmInstruction(MCInst &Inst,unsigned Val, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeShiftRight8Imm(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeVSHLMaxInstruction(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeShiftRight16Imm(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeShiftRight8Imm(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeShiftRight32Imm(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeShiftRight16Imm(MCInst &Inst, unsigned Val, uint64_t Address, const void 
*Decoder);
-static DecodeStatus DecodeShiftRight64Imm(llvm::MCInst &Inst, unsigned Val,
+static DecodeStatus DecodeShiftRight32Imm(MCInst &Inst, unsigned Val,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeTBLInstruction(llvm::MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodeShiftRight64Imm(MCInst &Inst, unsigned Val,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodePostIdxReg(llvm::MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodeTBLInstruction(MCInst &Inst, unsigned Insn,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeCoprocessor(llvm::MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodePostIdxReg(MCInst &Inst, unsigned Insn,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeMemBarrierOption(llvm::MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodeCoprocessor(MCInst &Inst, unsigned Insn,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeMSRMask(llvm::MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Insn,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeDoubleRegLoad(llvm::MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Insn,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeDoubleRegStore(llvm::MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodeDoubleRegLoad(MCInst &Inst, unsigned Insn,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeLDRPreImm(llvm::MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodeDoubleRegStore(MCInst &Inst, unsigned Insn,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeLDRPreReg(llvm::MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodeLDRPreImm(MCInst &Inst, unsigned Insn,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeSTRPreImm(llvm::MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodeLDRPreReg(MCInst &Inst, unsigned Insn,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeSTRPreReg(llvm::MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodeSTRPreImm(MCInst &Inst, unsigned Insn,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeVLD1LN(llvm::MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodeSTRPreReg(MCInst &Inst, unsigned Insn,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeVLD2LN(llvm::MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodeVLD1LN(MCInst &Inst, unsigned Insn,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeVLD3LN(llvm::MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodeVLD2LN(MCInst &Inst, unsigned Insn,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeVLD4LN(llvm::MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodeVLD3LN(MCInst &Inst, unsigned Insn,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeVST1LN(llvm::MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodeVLD4LN(MCInst &Inst, unsigned Insn,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeVST2LN(llvm::MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodeVST1LN(MCInst &Inst, unsigned Insn,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeVST3LN(llvm::MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodeVST2LN(MCInst &Inst, unsigned Insn,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeVST4LN(llvm::MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodeVST3LN(MCInst &Inst, unsigned Insn,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeVMOVSRR(llvm::MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodeVST4LN(MCInst &Inst, unsigned Insn,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeVMOVRRS(llvm::MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodeVMOVSRR(MCInst &Inst, unsigned Insn,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeSwap(llvm::MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodeVMOVRRS(MCInst &Inst, unsigned Insn,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeVCVTD(llvm::MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodeSwap(MCInst &Inst, unsigned Insn,
+                                  uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeVCVTQ(llvm::MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeThumbAddSpecialReg(llvm::MCInst &Inst, uint16_t Insn,
+static DecodeStatus DecodeThumbAddSpecialReg(MCInst &Inst, uint16_t Insn,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeThumbBROperand(llvm::MCInst &Inst, unsigned Val,
+static DecodeStatus DecodeThumbBROperand(MCInst &Inst, unsigned Val,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeT2BROperand(llvm::MCInst &Inst, unsigned Val,
+static DecodeStatus DecodeT2BROperand(MCInst &Inst, unsigned Val,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeThumbCmpBROperand(llvm::MCInst &Inst, unsigned Val,
+static DecodeStatus DecodeThumbCmpBROperand(MCInst &Inst, unsigned Val,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeThumbAddrModeRR(llvm::MCInst &Inst, unsigned Val,
+static DecodeStatus DecodeThumbAddrModeRR(MCInst &Inst, unsigned Val,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeThumbAddrModeIS(llvm::MCInst &Inst, unsigned Val,
+static DecodeStatus DecodeThumbAddrModeIS(MCInst &Inst, unsigned Val,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeThumbAddrModePC(llvm::MCInst &Inst, unsigned Val,
+static DecodeStatus DecodeThumbAddrModePC(MCInst &Inst, unsigned Val,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeThumbAddrModeSP(llvm::MCInst &Inst, unsigned Val,
+static DecodeStatus DecodeThumbAddrModeSP(MCInst &Inst, unsigned Val,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeT2AddrModeSOReg(llvm::MCInst &Inst, unsigned Val,
+static DecodeStatus DecodeT2AddrModeSOReg(MCInst &Inst, unsigned Val,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeT2LoadShift(llvm::MCInst &Inst, unsigned Val,
+static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Val,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeT2Imm8S4(llvm::MCInst &Inst, unsigned Val,
+static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeT2AddrModeImm8s4(llvm::MCInst &Inst, unsigned Val,
+static DecodeStatus DecodeT2AddrModeImm8s4(MCInst &Inst, unsigned Val,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeT2AddrModeImm0_1020s4(llvm::MCInst &Inst,unsigned Val,
+static DecodeStatus DecodeT2AddrModeImm0_1020s4(MCInst &Inst,unsigned Val,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeT2Imm8(llvm::MCInst &Inst, unsigned Val,
+static DecodeStatus DecodeT2Imm8(MCInst &Inst, unsigned Val,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeT2AddrModeImm8(llvm::MCInst &Inst, unsigned Val,
+static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeThumbAddSPImm(llvm::MCInst &Inst, uint16_t Val,
+static DecodeStatus DecodeThumbAddSPImm(MCInst &Inst, uint16_t Val,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeThumbAddSPReg(llvm::MCInst &Inst, uint16_t Insn,
+static DecodeStatus DecodeThumbAddSPReg(MCInst &Inst, uint16_t Insn,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeThumbCPS(llvm::MCInst &Inst, uint16_t Insn,
+static DecodeStatus DecodeThumbCPS(MCInst &Inst, uint16_t Insn,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeThumbBLXOffset(llvm::MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodeThumbBLXOffset(MCInst &Inst, unsigned Insn,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeT2AddrModeImm12(llvm::MCInst &Inst, unsigned Val,
+static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeThumbTableBranch(llvm::MCInst &Inst, unsigned Val,
+static DecodeStatus DecodeThumbTableBranch(MCInst &Inst, unsigned Val,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeThumb2BCCInstruction(llvm::MCInst &Inst, unsigned Val,
+static DecodeStatus DecodeThumb2BCCInstruction(MCInst &Inst, unsigned Val,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeT2SOImm(llvm::MCInst &Inst, unsigned Val,
+static DecodeStatus DecodeT2SOImm(MCInst &Inst, unsigned Val,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeThumbBCCTargetOperand(llvm::MCInst &Inst,unsigned Val,
+static DecodeStatus DecodeThumbBCCTargetOperand(MCInst &Inst,unsigned Val,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeThumbBLTargetOperand(llvm::MCInst &Inst, unsigned Val,
+static DecodeStatus DecodeThumbBLTargetOperand(MCInst &Inst, unsigned Val,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeIT(llvm::MCInst &Inst, unsigned Val,
+static DecodeStatus DecodeIT(MCInst &Inst, unsigned Val,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeT2LDRDPreInstruction(llvm::MCInst &Inst,unsigned Insn,
+static DecodeStatus DecodeT2LDRDPreInstruction(MCInst &Inst,unsigned Insn,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeT2STRDPreInstruction(llvm::MCInst &Inst,unsigned Insn,
+static DecodeStatus DecodeT2STRDPreInstruction(MCInst &Inst,unsigned Insn,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeT2Adr(llvm::MCInst &Inst, unsigned Val,
+static DecodeStatus DecodeT2Adr(MCInst &Inst, unsigned Val,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeT2LdStPre(llvm::MCInst &Inst, unsigned Val,
+static DecodeStatus DecodeT2LdStPre(MCInst &Inst, unsigned Val,
                                   uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeT2ShifterImmOperand(llvm::MCInst &Inst, unsigned Val,
+static DecodeStatus DecodeT2ShifterImmOperand(MCInst &Inst, unsigned Val,
                                   uint64_t Address, const void *Decoder);
-
-
+static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMRRC2(llvm::MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder);
#include "ARMGenDisassemblerTables.inc"
#include "ARMGenInstrInfo.inc"
#include "ARMGenEDInfo.inc"
@@ -856,7 +859,7 @@ static const
uint16_t GPRDecoderTable[] = { ARM::R12, ARM::SP, ARM::LR, ARM::PC }; -static DecodeStatus DecodeGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { if (RegNo > 15) return MCDisassembler::Fail; @@ -867,7 +870,7 @@ static DecodeStatus DecodeGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, } static DecodeStatus -DecodeGPRnopcRegisterClass(llvm::MCInst &Inst, unsigned RegNo, +DecodeGPRnopcRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -879,14 +882,14 @@ DecodeGPRnopcRegisterClass(llvm::MCInst &Inst, unsigned RegNo, return S; } -static DecodeStatus DecodetGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodetGPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { if (RegNo > 7) return MCDisassembler::Fail; return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder); } -static DecodeStatus DecodetcGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { unsigned Register = 0; switch (RegNo) { @@ -916,7 +919,7 @@ static DecodeStatus DecodetcGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, return MCDisassembler::Success; } -static DecodeStatus DecoderGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { if (RegNo == 13 || RegNo == 15) return MCDisassembler::Fail; return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder); @@ -933,7 +936,7 @@ static const uint16_t SPRDecoderTable[] = { ARM::S28, ARM::S29, ARM::S30, ARM::S31 }; -static DecodeStatus DecodeSPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; @@ -954,7 +957,7 @@ static const uint16_t DPRDecoderTable[] = { ARM::D28, ARM::D29, ARM::D30, ARM::D31 }; -static DecodeStatus DecodeDPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; @@ -964,7 +967,7 @@ static DecodeStatus DecodeDPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, return MCDisassembler::Success; } -static DecodeStatus DecodeDPR_8RegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeDPR_8RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { if (RegNo > 7) return MCDisassembler::Fail; @@ -972,7 +975,7 @@ static DecodeStatus DecodeDPR_8RegisterClass(llvm::MCInst &Inst, unsigned RegNo, } static DecodeStatus -DecodeDPR_VFP2RegisterClass(llvm::MCInst &Inst, unsigned RegNo, +DecodeDPR_VFP2RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { if (RegNo > 15) return MCDisassembler::Fail; @@ -987,7 +990,7 @@ static const uint16_t QPRDecoderTable[] = { }; -static DecodeStatus DecodeQPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeQPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { if (RegNo > 31) return MCDisassembler::Fail; @@ -1007,7 +1010,7 @@ static const uint16_t DPairDecoderTable[] = { ARM::Q15 }; -static DecodeStatus 
DecodeDPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo, +static DecodeStatus DecodeDPairRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { if (RegNo > 30) return MCDisassembler::Fail; @@ -1028,7 +1031,7 @@ static const uint16_t DPairSpacedDecoderTable[] = { ARM::D28_D30, ARM::D29_D31 }; -static DecodeStatus DecodeDPairSpacedRegisterClass(llvm::MCInst &Inst, +static DecodeStatus DecodeDPairSpacedRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { @@ -1040,7 +1043,7 @@ static DecodeStatus DecodeDPairSpacedRegisterClass(llvm::MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodePredicateOperand(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodePredicateOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { if (Val == 0xF) return MCDisassembler::Fail; // AL predicate is not allowed on Thumb1 branches. @@ -1054,7 +1057,7 @@ static DecodeStatus DecodePredicateOperand(llvm::MCInst &Inst, unsigned Val, return MCDisassembler::Success; } -static DecodeStatus DecodeCCOutOperand(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeCCOutOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { if (Val) Inst.addOperand(MCOperand::CreateReg(ARM::CPSR)); @@ -1063,7 +1066,7 @@ static DecodeStatus DecodeCCOutOperand(llvm::MCInst &Inst, unsigned Val, return MCDisassembler::Success; } -static DecodeStatus DecodeSOImmOperand(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeSOImmOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { uint32_t imm = Val & 0xFF; uint32_t rot = (Val & 0xF00) >> 7; @@ -1072,7 +1075,7 @@ static DecodeStatus DecodeSOImmOperand(llvm::MCInst &Inst, unsigned Val, return MCDisassembler::Success; } -static DecodeStatus DecodeSORegImmOperand(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeSORegImmOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -1109,7 +1112,7 @@ static DecodeStatus DecodeSORegImmOperand(llvm::MCInst &Inst, unsigned Val, return S; } -static DecodeStatus DecodeSORegRegOperand(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeSORegRegOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -1144,7 +1147,7 @@ static DecodeStatus DecodeSORegRegOperand(llvm::MCInst &Inst, unsigned Val, return S; } -static DecodeStatus DecodeRegListOperand(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -1179,7 +1182,7 @@ static DecodeStatus DecodeRegListOperand(llvm::MCInst &Inst, unsigned Val, return S; } -static DecodeStatus DecodeSPRRegListOperand(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeSPRRegListOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -1196,7 +1199,7 @@ static DecodeStatus DecodeSPRRegListOperand(llvm::MCInst &Inst, unsigned Val, return S; } -static DecodeStatus DecodeDPRRegListOperand(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -1213,7 +1216,7 @@ static DecodeStatus DecodeDPRRegListOperand(llvm::MCInst &Inst, unsigned Val, return S; } -static DecodeStatus 
DecodeBitfieldMaskOperand(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeBitfieldMaskOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { // This operand encodes a mask of contiguous zeros between a specified MSB // and LSB. To decode it, we create the mask of all bits MSB-and-lower, @@ -1234,7 +1237,7 @@ static DecodeStatus DecodeBitfieldMaskOperand(llvm::MCInst &Inst, unsigned Val, return S; } -static DecodeStatus DecodeCopMemInstruction(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeCopMemInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -1379,7 +1382,7 @@ static DecodeStatus DecodeCopMemInstruction(llvm::MCInst &Inst, unsigned Insn, } static DecodeStatus -DecodeAddrMode2IdxInstruction(llvm::MCInst &Inst, unsigned Insn, +DecodeAddrMode2IdxInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -1482,7 +1485,7 @@ DecodeAddrMode2IdxInstruction(llvm::MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeSORegMemOperand(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeSORegMemOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -1523,7 +1526,7 @@ static DecodeStatus DecodeSORegMemOperand(llvm::MCInst &Inst, unsigned Val, } static DecodeStatus -DecodeAddrMode3Instruction(llvm::MCInst &Inst, unsigned Insn, +DecodeAddrMode3Instruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -1536,6 +1539,7 @@ DecodeAddrMode3Instruction(llvm::MCInst &Inst, unsigned Insn, unsigned pred = fieldFromInstruction32(Insn, 28, 4); unsigned W = fieldFromInstruction32(Insn, 21, 1); unsigned P = fieldFromInstruction32(Insn, 24, 1); + unsigned Rt2 = Rt + 1; bool writeback = (W == 1) | (P == 0); @@ -1547,7 +1551,86 @@ DecodeAddrMode3Instruction(llvm::MCInst &Inst, unsigned Insn, case ARM::LDRD: case ARM::LDRD_PRE: case ARM::LDRD_POST: - if (Rt & 0x1) return MCDisassembler::Fail; + if (Rt & 0x1) S = MCDisassembler::SoftFail; + break; + default: + break; + } + switch (Inst.getOpcode()) { + case ARM::STRD: + case ARM::STRD_PRE: + case ARM::STRD_POST: + if (P == 0 && W == 1) + S = MCDisassembler::SoftFail; + + if (writeback && (Rn == 15 || Rn == Rt || Rn == Rt2)) + S = MCDisassembler::SoftFail; + if (type && Rm == 15) + S = MCDisassembler::SoftFail; + if (Rt2 == 15) + S = MCDisassembler::SoftFail; + if (!type && fieldFromInstruction32(Insn, 8, 4)) + S = MCDisassembler::SoftFail; + break; + case ARM::STRH: + case ARM::STRH_PRE: + case ARM::STRH_POST: + if (Rt == 15) + S = MCDisassembler::SoftFail; + if (writeback && (Rn == 15 || Rn == Rt)) + S = MCDisassembler::SoftFail; + if (!type && Rm == 15) + S = MCDisassembler::SoftFail; + break; + case ARM::LDRD: + case ARM::LDRD_PRE: + case ARM::LDRD_POST: + if (type && Rn == 15){ + if (Rt2 == 15) + S = MCDisassembler::SoftFail; + break; + } + if (P == 0 && W == 1) + S = MCDisassembler::SoftFail; + if (!type && (Rt2 == 15 || Rm == 15 || Rm == Rt || Rm == Rt2)) + S = MCDisassembler::SoftFail; + if (!type && writeback && Rn == 15) + S = MCDisassembler::SoftFail; + if (writeback && (Rn == Rt || Rn == Rt2)) + S = MCDisassembler::SoftFail; + break; + case ARM::LDRH: + case ARM::LDRH_PRE: + case ARM::LDRH_POST: + if (type && Rn == 15){ + if (Rt == 15) + S = MCDisassembler::SoftFail; + break; + } + if (Rt == 15) + S = 
MCDisassembler::SoftFail; + if (!type && Rm == 15) + S = MCDisassembler::SoftFail; + if (!type && writeback && (Rn == 15 || Rn == Rt)) + S = MCDisassembler::SoftFail; + break; + case ARM::LDRSH: + case ARM::LDRSH_PRE: + case ARM::LDRSH_POST: + case ARM::LDRSB: + case ARM::LDRSB_PRE: + case ARM::LDRSB_POST: + if (type && Rn == 15){ + if (Rt == 15) + S = MCDisassembler::SoftFail; + break; + } + if (type && (Rt == 15 || (writeback && Rn == Rt))) + S = MCDisassembler::SoftFail; + if (!type && (Rt == 15 || Rm == 15)) + S = MCDisassembler::SoftFail; + if (!type && writeback && (Rn == 15 || Rn == Rt)) + S = MCDisassembler::SoftFail; break; default: break; @@ -1634,7 +1717,7 @@ DecodeAddrMode3Instruction(llvm::MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeRFEInstruction(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeRFEInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -1663,7 +1746,7 @@ static DecodeStatus DecodeRFEInstruction(llvm::MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeMemMultipleWritebackInstruction(llvm::MCInst &Inst, +static DecodeStatus DecodeMemMultipleWritebackInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -1748,7 +1831,7 @@ static DecodeStatus DecodeMemMultipleWritebackInstruction(llvm::MCInst &Inst, return S; } -static DecodeStatus DecodeCPSInstruction(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeCPSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { unsigned imod = fieldFromInstruction32(Insn, 18, 2); unsigned M = fieldFromInstruction32(Insn, 17, 1); @@ -1788,7 +1871,7 @@ static DecodeStatus DecodeCPSInstruction(llvm::MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeT2CPSInstruction(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeT2CPSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { unsigned imod = fieldFromInstruction32(Insn, 9, 2); unsigned M = fieldFromInstruction32(Insn, 8, 1); @@ -1828,7 +1911,7 @@ static DecodeStatus DecodeT2CPSInstruction(llvm::MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeT2MOVTWInstruction(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeT2MOVTWInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -1852,7 +1935,7 @@ static DecodeStatus DecodeT2MOVTWInstruction(llvm::MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeArmMOVTWInstruction(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeArmMOVTWInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -1878,7 +1961,7 @@ static DecodeStatus DecodeArmMOVTWInstruction(llvm::MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeSMLAInstruction(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeSMLAInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -1906,7 +1989,7 @@ static DecodeStatus DecodeSMLAInstruction(llvm::MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeAddrModeImm12Operand(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeAddrModeImm12Operand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { DecodeStatus S = 
MCDisassembler::Success; @@ -1926,7 +2009,7 @@ static DecodeStatus DecodeAddrModeImm12Operand(llvm::MCInst &Inst, unsigned Val, return S; } -static DecodeStatus DecodeAddrMode5Operand(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeAddrMode5Operand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -1945,13 +2028,28 @@ static DecodeStatus DecodeAddrMode5Operand(llvm::MCInst &Inst, unsigned Val, return S; } -static DecodeStatus DecodeAddrMode7Operand(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeAddrMode7Operand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { return DecodeGPRRegisterClass(Inst, Val, Address, Decoder); } static DecodeStatus -DecodeBranchImmInstruction(llvm::MCInst &Inst, unsigned Insn, +DecodeT2BInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + unsigned imm = (fieldFromInstruction32(Insn, 0, 11) << 0) | + (fieldFromInstruction32(Insn, 11, 1) << 18) | + (fieldFromInstruction32(Insn, 13, 1) << 17) | + (fieldFromInstruction32(Insn, 16, 6) << 11) | + (fieldFromInstruction32(Insn, 26, 1) << 19); + if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<20>(imm<<1) + 4, + true, 4, Inst, Decoder)) + Inst.addOperand(MCOperand::CreateImm(SignExtend32<20>(imm << 1))); + return S; +} + +static DecodeStatus +DecodeBranchImmInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -1977,7 +2075,7 @@ DecodeBranchImmInstruction(llvm::MCInst &Inst, unsigned Insn, } -static DecodeStatus DecodeAddrMode6Operand(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeAddrMode6Operand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -1994,7 +2092,7 @@ static DecodeStatus DecodeAddrMode6Operand(llvm::MCInst &Inst, unsigned Val, return S; } -static DecodeStatus DecodeVLDInstruction(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeVLDInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -2183,6 +2281,8 @@ static DecodeStatus DecodeVLDInstruction(llvm::MCInst &Inst, unsigned Insn, case ARM::VLD2b8wb_register: case ARM::VLD2b16wb_register: case ARM::VLD2b32wb_register: + Inst.addOperand(MCOperand::CreateImm(0)); + break; case ARM::VLD3d8_UPD: case ARM::VLD3d16_UPD: case ARM::VLD3d32_UPD: @@ -2251,12 +2351,22 @@ static DecodeStatus DecodeVLDInstruction(llvm::MCInst &Inst, unsigned Insn, !Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) return MCDisassembler::Fail; break; + case ARM::VLD2d8wb_fixed: + case ARM::VLD2d16wb_fixed: + case ARM::VLD2d32wb_fixed: + case ARM::VLD2b8wb_fixed: + case ARM::VLD2b16wb_fixed: + case ARM::VLD2b32wb_fixed: + case ARM::VLD2q8wb_fixed: + case ARM::VLD2q16wb_fixed: + case ARM::VLD2q32wb_fixed: + break; } return S; } -static DecodeStatus DecodeVSTInstruction(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeVSTInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -2319,6 +2429,8 @@ static DecodeStatus DecodeVSTInstruction(llvm::MCInst &Inst, unsigned Insn, case ARM::VST2b8wb_register: case ARM::VST2b16wb_register: case ARM::VST2b32wb_register: + if (Rm == 0xF) + return MCDisassembler::Fail; Inst.addOperand(MCOperand::CreateImm(0)); break; case ARM::VST3d8_UPD: 
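The new DecodeT2BInstruction above reassembles the Thumb2 branch immediate from its scattered encoding fields before halfword-scaling and sign-extending it. Below is a standalone sketch of the same reassembly; fieldFromInsn and signExtend are local stand-ins for LLVM's fieldFromInstruction32 and SignExtend32, and the encoding value fed to main is made up:

#include <cstdint>
#include <cstdio>

// Local stand-in for fieldFromInstruction32(): extract Size bits at Start.
static uint32_t fieldFromInsn(uint32_t Insn, unsigned Start, unsigned Size) {
  return (Insn >> Start) & ((1u << Size) - 1);
}

// Local stand-in for SignExtend32<N>().
template <unsigned N> static int32_t signExtend(uint32_t X) {
  return int32_t(X << (32 - N)) >> (32 - N);
}

// Mirrors the hunk above: instruction bits [10:0], [11], [13], [21:16] and
// [26] land at offsets 0, 18, 17, 11 and 19 of a 20-bit immediate, which is
// then shifted left one bit (halfword units) and sign-extended.
static int32_t t2BranchOffset(uint32_t Insn) {
  uint32_t imm = (fieldFromInsn(Insn,  0, 11) <<  0) |
                 (fieldFromInsn(Insn, 11,  1) << 18) |
                 (fieldFromInsn(Insn, 13,  1) << 17) |
                 (fieldFromInsn(Insn, 16,  6) << 11) |
                 (fieldFromInsn(Insn, 26,  1) << 19);
  return signExtend<20>(imm << 1);
}

int main() {
  uint64_t Address = 0x8000; // hypothetical PC of the branch
  printf("target = 0x%llx\n",
         (unsigned long long)(Address + t2BranchOffset(0xF43F2FFEu) + 4));
  return 0;
}

The decoder itself adds the same "+ 4" pipeline adjustment when it forms the symbolic target address.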
@@ -2525,7 +2637,7 @@ static DecodeStatus DecodeVSTInstruction(llvm::MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeVLD1DupInstruction(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeVLD1DupInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -2570,7 +2682,7 @@ static DecodeStatus DecodeVLD1DupInstruction(llvm::MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeVLD2DupInstruction(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeVLD2DupInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -2580,7 +2692,6 @@ static DecodeStatus DecodeVLD2DupInstruction(llvm::MCInst &Inst, unsigned Insn, unsigned Rm = fieldFromInstruction32(Insn, 0, 4); unsigned align = fieldFromInstruction32(Insn, 4, 1); unsigned size = 1 << fieldFromInstruction32(Insn, 6, 2); - unsigned pred = fieldFromInstruction32(Insn, 22, 4); align *= 2*size; switch (Inst.getOpcode()) { @@ -2611,20 +2722,15 @@ static DecodeStatus DecodeVLD2DupInstruction(llvm::MCInst &Inst, unsigned Insn, return MCDisassembler::Fail; Inst.addOperand(MCOperand::CreateImm(align)); - if (Rm == 0xD) - Inst.addOperand(MCOperand::CreateReg(0)); - else if (Rm != 0xF) { + if (Rm != 0xD && Rm != 0xF) { if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) return MCDisassembler::Fail; } - if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder))) - return MCDisassembler::Fail; - return S; } -static DecodeStatus DecodeVLD3DupInstruction(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeVLD3DupInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -2659,7 +2765,7 @@ static DecodeStatus DecodeVLD3DupInstruction(llvm::MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeVLD4DupInstruction(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -2712,7 +2818,7 @@ static DecodeStatus DecodeVLD4DupInstruction(llvm::MCInst &Inst, unsigned Insn, } static DecodeStatus -DecodeNEONModImmInstruction(llvm::MCInst &Inst, unsigned Insn, +DecodeNEONModImmInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -2757,7 +2863,7 @@ DecodeNEONModImmInstruction(llvm::MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeVSHLMaxInstruction(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeVSHLMaxInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -2776,31 +2882,31 @@ static DecodeStatus DecodeVSHLMaxInstruction(llvm::MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeShiftRight8Imm(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeShiftRight8Imm(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { Inst.addOperand(MCOperand::CreateImm(8 - Val)); return MCDisassembler::Success; } -static DecodeStatus DecodeShiftRight16Imm(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeShiftRight16Imm(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { Inst.addOperand(MCOperand::CreateImm(16 - Val)); return MCDisassembler::Success; } -static DecodeStatus DecodeShiftRight32Imm(llvm::MCInst &Inst, 
unsigned Val, +static DecodeStatus DecodeShiftRight32Imm(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { Inst.addOperand(MCOperand::CreateImm(32 - Val)); return MCDisassembler::Success; } -static DecodeStatus DecodeShiftRight64Imm(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeShiftRight64Imm(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { Inst.addOperand(MCOperand::CreateImm(64 - Val)); return MCDisassembler::Success; } -static DecodeStatus DecodeTBLInstruction(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeTBLInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -2836,7 +2942,7 @@ static DecodeStatus DecodeTBLInstruction(llvm::MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeThumbAddSpecialReg(llvm::MCInst &Inst, uint16_t Insn, +static DecodeStatus DecodeThumbAddSpecialReg(MCInst &Inst, uint16_t Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -2860,25 +2966,31 @@ static DecodeStatus DecodeThumbAddSpecialReg(llvm::MCInst &Inst, uint16_t Insn, return S; } -static DecodeStatus DecodeThumbBROperand(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeThumbBROperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { - Inst.addOperand(MCOperand::CreateImm(SignExtend32<12>(Val << 1))); + if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<12>(Val<<1) + 4, + true, 2, Inst, Decoder)) + Inst.addOperand(MCOperand::CreateImm(SignExtend32<12>(Val << 1))); return MCDisassembler::Success; } -static DecodeStatus DecodeT2BROperand(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeT2BROperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { - Inst.addOperand(MCOperand::CreateImm(SignExtend32<21>(Val))); + if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<22>(Val<<1) + 4, + true, 4, Inst, Decoder)) + Inst.addOperand(MCOperand::CreateImm(SignExtend32<21>(Val))); return MCDisassembler::Success; } -static DecodeStatus DecodeThumbCmpBROperand(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeThumbCmpBROperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { - Inst.addOperand(MCOperand::CreateImm(SignExtend32<7>(Val << 1))); + if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<7>(Val<<1) + 4, + true, 2, Inst, Decoder)) + Inst.addOperand(MCOperand::CreateImm(SignExtend32<7>(Val << 1))); return MCDisassembler::Success; } -static DecodeStatus DecodeThumbAddrModeRR(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeThumbAddrModeRR(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -2893,7 +3005,7 @@ static DecodeStatus DecodeThumbAddrModeRR(llvm::MCInst &Inst, unsigned Val, return S; } -static DecodeStatus DecodeThumbAddrModeIS(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeThumbAddrModeIS(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -2907,7 +3019,7 @@ static DecodeStatus DecodeThumbAddrModeIS(llvm::MCInst &Inst, unsigned Val, return S; } -static DecodeStatus DecodeThumbAddrModePC(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeThumbAddrModePC(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { unsigned imm = Val << 2; @@ -2917,7 +3029,7 @@ static DecodeStatus DecodeThumbAddrModePC(llvm::MCInst &Inst, unsigned 
Val, return MCDisassembler::Success; } -static DecodeStatus DecodeThumbAddrModeSP(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeThumbAddrModeSP(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { Inst.addOperand(MCOperand::CreateReg(ARM::SP)); Inst.addOperand(MCOperand::CreateImm(Val)); @@ -2925,7 +3037,7 @@ static DecodeStatus DecodeThumbAddrModeSP(llvm::MCInst &Inst, unsigned Val, return MCDisassembler::Success; } -static DecodeStatus DecodeT2AddrModeSOReg(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeT2AddrModeSOReg(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -2942,7 +3054,7 @@ static DecodeStatus DecodeT2AddrModeSOReg(llvm::MCInst &Inst, unsigned Val, return S; } -static DecodeStatus DecodeT2LoadShift(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -2997,7 +3109,7 @@ static DecodeStatus DecodeT2LoadShift(llvm::MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeT2Imm8S4(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { int imm = Val & 0xFF; if (!(Val & 0x100)) imm *= -1; @@ -3006,7 +3118,7 @@ static DecodeStatus DecodeT2Imm8S4(llvm::MCInst &Inst, unsigned Val, return MCDisassembler::Success; } -static DecodeStatus DecodeT2AddrModeImm8s4(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeT2AddrModeImm8s4(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -3021,7 +3133,7 @@ static DecodeStatus DecodeT2AddrModeImm8s4(llvm::MCInst &Inst, unsigned Val, return S; } -static DecodeStatus DecodeT2AddrModeImm0_1020s4(llvm::MCInst &Inst,unsigned Val, +static DecodeStatus DecodeT2AddrModeImm0_1020s4(MCInst &Inst,unsigned Val, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -3036,7 +3148,7 @@ static DecodeStatus DecodeT2AddrModeImm0_1020s4(llvm::MCInst &Inst,unsigned Val, return S; } -static DecodeStatus DecodeT2Imm8(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeT2Imm8(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { int imm = Val & 0xFF; if (Val == 0) @@ -3049,7 +3161,7 @@ static DecodeStatus DecodeT2Imm8(llvm::MCInst &Inst, unsigned Val, } -static DecodeStatus DecodeT2AddrModeImm8(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -3080,7 +3192,7 @@ static DecodeStatus DecodeT2AddrModeImm8(llvm::MCInst &Inst, unsigned Val, return S; } -static DecodeStatus DecodeT2LdStPre(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeT2LdStPre(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -3110,7 +3222,7 @@ static DecodeStatus DecodeT2LdStPre(llvm::MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeT2AddrModeImm12(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -3125,7 +3237,7 @@ static DecodeStatus DecodeT2AddrModeImm12(llvm::MCInst &Inst, unsigned Val, } -static DecodeStatus DecodeThumbAddSPImm(llvm::MCInst &Inst, uint16_t Insn, +static 
DecodeStatus DecodeThumbAddSPImm(MCInst &Inst, uint16_t Insn, uint64_t Address, const void *Decoder) { unsigned imm = fieldFromInstruction16(Insn, 0, 7); @@ -3136,7 +3248,7 @@ static DecodeStatus DecodeThumbAddSPImm(llvm::MCInst &Inst, uint16_t Insn, return MCDisassembler::Success; } -static DecodeStatus DecodeThumbAddSPReg(llvm::MCInst &Inst, uint16_t Insn, +static DecodeStatus DecodeThumbAddSPReg(MCInst &Inst, uint16_t Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -3161,7 +3273,7 @@ static DecodeStatus DecodeThumbAddSPReg(llvm::MCInst &Inst, uint16_t Insn, return S; } -static DecodeStatus DecodeThumbCPS(llvm::MCInst &Inst, uint16_t Insn, +static DecodeStatus DecodeThumbCPS(MCInst &Inst, uint16_t Insn, uint64_t Address, const void *Decoder) { unsigned imod = fieldFromInstruction16(Insn, 4, 1) | 0x2; unsigned flags = fieldFromInstruction16(Insn, 0, 3); @@ -3172,20 +3284,20 @@ static DecodeStatus DecodeThumbCPS(llvm::MCInst &Inst, uint16_t Insn, return MCDisassembler::Success; } -static DecodeStatus DecodePostIdxReg(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodePostIdxReg(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rm = fieldFromInstruction32(Insn, 0, 4); unsigned add = fieldFromInstruction32(Insn, 4, 1); - if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder))) return MCDisassembler::Fail; Inst.addOperand(MCOperand::CreateImm(add)); return S; } -static DecodeStatus DecodeThumbBLXOffset(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeThumbBLXOffset(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { if (!tryAddingSymbolicOperand(Address, (Address & ~2u) + SignExtend32<22>(Val << 1) + 4, @@ -3194,7 +3306,7 @@ static DecodeStatus DecodeThumbBLXOffset(llvm::MCInst &Inst, unsigned Val, return MCDisassembler::Success; } -static DecodeStatus DecodeCoprocessor(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeCoprocessor(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { if (Val == 0xA || Val == 0xB) return MCDisassembler::Fail; @@ -3204,7 +3316,7 @@ static DecodeStatus DecodeCoprocessor(llvm::MCInst &Inst, unsigned Val, } static DecodeStatus -DecodeThumbTableBranch(llvm::MCInst &Inst, unsigned Insn, +DecodeThumbTableBranch(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -3220,7 +3332,7 @@ DecodeThumbTableBranch(llvm::MCInst &Inst, unsigned Insn, } static DecodeStatus -DecodeThumb2BCCInstruction(llvm::MCInst &Inst, unsigned Insn, +DecodeThumb2BCCInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -3262,7 +3374,7 @@ DecodeThumb2BCCInstruction(llvm::MCInst &Inst, unsigned Insn, // Decode a shifted immediate operand. These basically consist // of an 8-bit value, and a 4-bit directive that specifies either // a splat operation or a rotation. 
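The comment above is the whole trick behind DecodeT2SOImm, whose body follows in the next hunk: the 12-bit field either splats a byte into selected lanes or rotates an 8-bit value with an implied high bit. As a concrete reference, here is a standalone sketch of that expansion following the ARM ARM's ThumbExpandImm pseudocode; the function name and interface are local assumptions, not LLVM's:

#include <cstdint>
#include <cstdio>

static uint32_t thumbExpandImm(uint32_t imm12) {
  uint32_t byte = imm12 & 0xFF;
  if (((imm12 >> 10) & 0x3) == 0) {
    // Splat directives: imm12<9:8> selects which lanes get the byte.
    switch ((imm12 >> 8) & 0x3) {
    case 0:  return byte;                                        // 000000XY
    case 1:  return (byte << 16) | byte;                         // 00XY00XY
    case 2:  return (byte << 24) | (byte << 8);                  // XY00XY00
    default: return (byte << 24) | (byte << 16) | (byte << 8) | byte;
    }
  }
  // Rotation directive: an 8-bit value with the high bit forced, rotated
  // right by the 5-bit amount in imm12<11:7> (always >= 4 on this path).
  uint32_t unrotated = 0x80 | (imm12 & 0x7F);
  unsigned rot = (imm12 >> 7) & 0x1F;
  return (unrotated >> rot) | (unrotated << ((32 - rot) & 31));
}

int main() {
  printf("0x%08x\n", thumbExpandImm(0x1AB)); // 0x00AB00AB (splat)
  printf("0x%08x\n", thumbExpandImm(0x4FF)); // 0x7F800000 (ROR(0xFF, 9))
  return 0;
}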
-static DecodeStatus DecodeT2SOImm(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeT2SOImm(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { unsigned ctrl = fieldFromInstruction32(Val, 10, 2); if (ctrl == 0) { @@ -3294,13 +3406,15 @@ static DecodeStatus DecodeT2SOImm(llvm::MCInst &Inst, unsigned Val, } static DecodeStatus -DecodeThumbBCCTargetOperand(llvm::MCInst &Inst, unsigned Val, +DecodeThumbBCCTargetOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder){ - Inst.addOperand(MCOperand::CreateImm(Val << 1)); + if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<8>(Val<<1) + 4, + true, 2, Inst, Decoder)) + Inst.addOperand(MCOperand::CreateImm(SignExtend32<8>(Val << 1))); return MCDisassembler::Success; } -static DecodeStatus DecodeThumbBLTargetOperand(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeThumbBLTargetOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder){ if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<22>(Val<<1) + 4, true, 4, Inst, Decoder)) @@ -3308,7 +3422,7 @@ static DecodeStatus DecodeThumbBLTargetOperand(llvm::MCInst &Inst, unsigned Val, return MCDisassembler::Success; } -static DecodeStatus DecodeMemBarrierOption(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { switch (Val) { default: @@ -3328,14 +3442,14 @@ static DecodeStatus DecodeMemBarrierOption(llvm::MCInst &Inst, unsigned Val, return MCDisassembler::Success; } -static DecodeStatus DecodeMSRMask(llvm::MCInst &Inst, unsigned Val, +static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { if (!Val) return MCDisassembler::Fail; Inst.addOperand(MCOperand::CreateImm(Val)); return MCDisassembler::Success; } -static DecodeStatus DecodeDoubleRegLoad(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeDoubleRegLoad(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -3358,7 +3472,7 @@ static DecodeStatus DecodeDoubleRegLoad(llvm::MCInst &Inst, unsigned Insn, } -static DecodeStatus DecodeDoubleRegStore(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeDoubleRegStore(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder){ DecodeStatus S = MCDisassembler::Success; @@ -3385,7 +3499,7 @@ static DecodeStatus DecodeDoubleRegStore(llvm::MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeLDRPreImm(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeLDRPreImm(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -3410,7 +3524,7 @@ static DecodeStatus DecodeLDRPreImm(llvm::MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeLDRPreReg(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeLDRPreReg(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -3438,7 +3552,7 @@ static DecodeStatus DecodeLDRPreReg(llvm::MCInst &Inst, unsigned Insn, } -static DecodeStatus DecodeSTRPreImm(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeSTRPreImm(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -3463,7 +3577,7 @@ static DecodeStatus DecodeSTRPreImm(llvm::MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeSTRPreReg(llvm::MCInst &Inst, 
unsigned Insn, +static DecodeStatus DecodeSTRPreReg(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -3488,7 +3602,7 @@ static DecodeStatus DecodeSTRPreReg(llvm::MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeVLD1LN(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeVLD1LN(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -3547,7 +3661,7 @@ static DecodeStatus DecodeVLD1LN(llvm::MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeVST1LN(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeVST1LN(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -3605,7 +3719,7 @@ static DecodeStatus DecodeVST1LN(llvm::MCInst &Inst, unsigned Insn, } -static DecodeStatus DecodeVLD2LN(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeVLD2LN(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -3672,7 +3786,7 @@ static DecodeStatus DecodeVLD2LN(llvm::MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeVST2LN(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeVST2LN(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -3736,7 +3850,7 @@ static DecodeStatus DecodeVST2LN(llvm::MCInst &Inst, unsigned Insn, } -static DecodeStatus DecodeVLD3LN(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeVLD3LN(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -3806,7 +3920,7 @@ static DecodeStatus DecodeVLD3LN(llvm::MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeVST3LN(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeVST3LN(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -3870,7 +3984,7 @@ static DecodeStatus DecodeVST3LN(llvm::MCInst &Inst, unsigned Insn, } -static DecodeStatus DecodeVLD4LN(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeVLD4LN(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -3944,7 +4058,7 @@ static DecodeStatus DecodeVLD4LN(llvm::MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeVST4LN(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeVST4LN(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -4009,7 +4123,7 @@ static DecodeStatus DecodeVST4LN(llvm::MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeVMOVSRR(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeVMOVSRR(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rt = fieldFromInstruction32(Insn, 12, 4); @@ -4035,7 +4149,7 @@ static DecodeStatus DecodeVMOVSRR(llvm::MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeVMOVRRS(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeVMOVRRS(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned Rt = fieldFromInstruction32(Insn, 12, 4); @@ -4061,7 +4175,7 @@ static DecodeStatus DecodeVMOVRRS(llvm::MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus 
DecodeIT(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeIT(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; unsigned pred = fieldFromInstruction16(Insn, 4, 4); @@ -4088,7 +4202,7 @@ static DecodeStatus DecodeIT(llvm::MCInst &Inst, unsigned Insn, } static DecodeStatus -DecodeT2LDRDPreInstruction(llvm::MCInst &Inst, unsigned Insn, +DecodeT2LDRDPreInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -4125,7 +4239,7 @@ DecodeT2LDRDPreInstruction(llvm::MCInst &Inst, unsigned Insn, } static DecodeStatus -DecodeT2STRDPreInstruction(llvm::MCInst &Inst, unsigned Insn, +DecodeT2STRDPreInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -4159,7 +4273,7 @@ DecodeT2STRDPreInstruction(llvm::MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeT2Adr(llvm::MCInst &Inst, uint32_t Insn, +static DecodeStatus DecodeT2Adr(MCInst &Inst, uint32_t Insn, uint64_t Address, const void *Decoder) { unsigned sign1 = fieldFromInstruction32(Insn, 21, 1); unsigned sign2 = fieldFromInstruction32(Insn, 23, 1); @@ -4174,7 +4288,7 @@ static DecodeStatus DecodeT2Adr(llvm::MCInst &Inst, uint32_t Insn, return MCDisassembler::Success; } -static DecodeStatus DecodeT2ShifterImmOperand(llvm::MCInst &Inst, uint32_t Val, +static DecodeStatus DecodeT2ShifterImmOperand(MCInst &Inst, uint32_t Val, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -4185,7 +4299,7 @@ static DecodeStatus DecodeT2ShifterImmOperand(llvm::MCInst &Inst, uint32_t Val, return S; } -static DecodeStatus DecodeSwap(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeSwap(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { unsigned Rt = fieldFromInstruction32(Insn, 12, 4); unsigned Rt2 = fieldFromInstruction32(Insn, 0, 4); @@ -4196,6 +4310,10 @@ static DecodeStatus DecodeSwap(llvm::MCInst &Inst, unsigned Insn, return DecodeCPSInstruction(Inst, Insn, Address, Decoder); DecodeStatus S = MCDisassembler::Success; + + if (Rt == Rn || Rn == Rt2) + S = MCDisassembler::SoftFail; + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt, Address, Decoder))) return MCDisassembler::Fail; if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt2, Address, Decoder))) @@ -4208,7 +4326,7 @@ static DecodeStatus DecodeSwap(llvm::MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeVCVTD(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { unsigned Vd = (fieldFromInstruction32(Insn, 12, 4) << 0); Vd |= (fieldFromInstruction32(Insn, 22, 1) << 4); @@ -4236,7 +4354,7 @@ static DecodeStatus DecodeVCVTD(llvm::MCInst &Inst, unsigned Insn, return S; } -static DecodeStatus DecodeVCVTQ(llvm::MCInst &Inst, unsigned Insn, +static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { unsigned Vd = (fieldFromInstruction32(Insn, 12, 4) << 0); Vd |= (fieldFromInstruction32(Insn, 22, 1) << 4); @@ -4263,3 +4381,59 @@ static DecodeStatus DecodeVCVTQ(llvm::MCInst &Inst, unsigned Insn, return S; } + +static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction32(Val, 16, 4); + unsigned Rt = fieldFromInstruction32(Val, 12, 4); + unsigned Rm = 
fieldFromInstruction32(Val, 0, 4); + Rm |= (fieldFromInstruction32(Val, 23, 1) << 4); + unsigned Cond = fieldFromInstruction32(Val, 28, 4); + + if (fieldFromInstruction32(Val, 8, 4) != 0 || Rn == Rt) + S = MCDisassembler::SoftFail; + + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeAddrMode7Operand(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodePostIdxReg(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodePredicateOperand(Inst, Cond, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeMRRC2(llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + + DecodeStatus S = MCDisassembler::Success; + + unsigned CRm = fieldFromInstruction32(Val, 0, 4); + unsigned opc1 = fieldFromInstruction32(Val, 4, 4); + unsigned cop = fieldFromInstruction32(Val, 8, 4); + unsigned Rt = fieldFromInstruction32(Val, 12, 4); + unsigned Rt2 = fieldFromInstruction32(Val, 16, 4); + + if ((cop & ~0x1) == 0xa) + return MCDisassembler::Fail; + + if (Rt == Rt2) + S = MCDisassembler::SoftFail; + + Inst.addOperand(MCOperand::CreateImm(cop)); + Inst.addOperand(MCOperand::CreateImm(opc1)); + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt2, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(CRm)); + + return S; +} + diff --git a/lib/Target/ARM/Disassembler/LLVMBuild.txt b/lib/Target/ARM/Disassembler/LLVMBuild.txt index 94075a9..52d8338 100644 --- a/lib/Target/ARM/Disassembler/LLVMBuild.txt +++ b/lib/Target/ARM/Disassembler/LLVMBuild.txt @@ -19,5 +19,5 @@ type = Library name = ARMDisassembler parent = ARM -required_libraries = ARMCodeGen ARMDesc ARMInfo MC Support +required_libraries = ARMDesc ARMInfo MC Support add_to_library_groups = ARM diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp index 2b994df..cbd81c1 100644 --- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp +++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp @@ -18,11 +18,11 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; -#define GET_INSTRUCTION_NAME #include "ARMGenAsmWriter.inc" /// translateShiftImm - Convert shift immediate from 0-31 to 1-32 for printing. @@ -36,17 +36,14 @@ static unsigned translateShiftImm(unsigned imm) { ARMInstPrinter::ARMInstPrinter(const MCAsmInfo &MAI, + const MCInstrInfo &MII, const MCRegisterInfo &MRI, const MCSubtargetInfo &STI) : - MCInstPrinter(MAI, MRI) { + MCInstPrinter(MAI, MII, MRI) { // Initialize the set of available features. setAvailableFeatures(STI.getFeatureBits()); } -StringRef ARMInstPrinter::getOpcodeName(unsigned Opcode) const { - return getInstructionName(Opcode); -} - void ARMInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { OS << getRegisterName(RegNo); } @@ -212,12 +209,12 @@ void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, } else { assert(Op.isExpr() && "unknown operand kind in printOperand"); // If a symbolic branch target was added as a constant expression then print - // that address in hex. 
+ // that address in hex. And only print 32 unsigned bits for the address. const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr()); int64_t Address; if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) { O << "0x"; - O.write_hex(Address); + O.write_hex((uint32_t)Address); } else { // Otherwise, just print the expression. diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h index e9cd407..8acb7ee 100644 --- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h +++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h @@ -23,15 +23,12 @@ class MCOperand; class ARMInstPrinter : public MCInstPrinter { public: - ARMInstPrinter(const MCAsmInfo &MAI, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI); + ARMInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI); virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot); - virtual StringRef getOpcodeName(unsigned Opcode) const; virtual void printRegName(raw_ostream &OS, unsigned RegNo) const; - static const char *getInstructionName(unsigned Opcode); - // Autogenerated by tblgen. void printInstruction(const MCInst *MI, raw_ostream &O); static const char *getRegisterName(unsigned RegNo); diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 25849ee..d10bfc1 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -11,11 +11,11 @@ #include "MCTargetDesc/ARMBaseInfo.h" #include "MCTargetDesc/ARMFixupKinds.h" #include "MCTargetDesc/ARMAddressingModes.h" -#include "llvm/ADT/Twine.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCDirectives.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCMachObjectWriter.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSectionELF.h" @@ -78,7 +78,8 @@ public: { "fixup_t2_condbranch", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, { "fixup_t2_uncondbranch", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, { "fixup_arm_thumb_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, -{ "fixup_arm_bl", 0, 24, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_arm_uncondbl", 0, 24, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_arm_condbl", 0, 24, MCFixupKindInfo::FKF_IsPCRel }, { "fixup_arm_blx", 0, 24, MCFixupKindInfo::FKF_IsPCRel }, { "fixup_arm_thumb_bl", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, { "fixup_arm_thumb_blx", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, @@ -115,6 +116,9 @@ public: // twiddled. 
if ((unsigned)Fixup.getKind() != ARM::fixup_arm_ldst_pcrel_12 && (unsigned)Fixup.getKind() != ARM::fixup_t2_ldst_pcrel_12 && + (unsigned)Fixup.getKind() != ARM::fixup_arm_adr_pcrel_12 && + (unsigned)Fixup.getKind() != ARM::fixup_thumb_adr_pcrel_10 && + (unsigned)Fixup.getKind() != ARM::fixup_t2_adr_pcrel_12 && (unsigned)Fixup.getKind() != ARM::fixup_arm_thumb_cp) { if (A) { const MCSymbol &Sym = A->getSymbol().AliasedSymbol(); @@ -128,7 +132,8 @@ public: if (A && ((unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_blx || (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl || (unsigned)Fixup.getKind() == ARM::fixup_arm_blx || - (unsigned)Fixup.getKind() == ARM::fixup_arm_bl)) + (unsigned)Fixup.getKind() == ARM::fixup_arm_uncondbl || + (unsigned)Fixup.getKind() == ARM::fixup_arm_condbl)) IsResolved = false; } @@ -366,7 +371,8 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { case ARM::fixup_arm_condbranch: case ARM::fixup_arm_uncondbranch: - case ARM::fixup_arm_bl: + case ARM::fixup_arm_uncondbl: + case ARM::fixup_arm_condbl: case ARM::fixup_arm_blx: // These values don't encode the low two bits since they're always zero. // Offset by 8 just as above. @@ -466,7 +472,9 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { Value = -Value; isAdd = false; } + // The value has the low 4 bits encoded in [3:0] and the high 4 in [11:8]. assert ((Value < 256) && "Out of range pc-relative fixup value!"); + Value = (Value & 0xf) | ((Value & 0xf0) << 4); return Value | (isAdd << 23); } case ARM::fixup_arm_pcrel_10: @@ -577,7 +585,8 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { case ARM::fixup_arm_ldst_pcrel_12: case ARM::fixup_arm_pcrel_10: case ARM::fixup_arm_adr_pcrel_12: - case ARM::fixup_arm_bl: + case ARM::fixup_arm_uncondbl: + case ARM::fixup_arm_condbl: case ARM::fixup_arm_blx: case ARM::fixup_arm_condbranch: case ARM::fixup_arm_uncondbranch: diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index 5476a46..aa649ba 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -11,6 +11,7 @@ #include "MCTargetDesc/ARMMCTargetDesc.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/MC/MCELFObjectWriter.h" @@ -177,7 +178,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, break; } break; - case ARM::fixup_arm_bl: + case ARM::fixup_arm_uncondbl: case ARM::fixup_arm_blx: case ARM::fixup_arm_uncondbranch: switch (Modifier) { @@ -189,6 +190,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, break; } break; + case ARM::fixup_arm_condbl: case ARM::fixup_arm_condbranch: Type = ELF::R_ARM_JUMP24; break; diff --git a/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h b/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h index 1827986..0085feb 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h +++ b/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h @@ -59,8 +59,21 @@ enum Fixups { // fixup_arm_thumb_br - 12-bit fixup for Thumb B instructions. fixup_arm_thumb_br, - // fixup_arm_bl - Fixup for ARM BL instructions. - fixup_arm_bl, + // The following fixups handle the ARM BL instructions. These can be + // conditionalised; however, the ARM ELF ABI requires a different relocation + // in that case: R_ARM_JUMP24 instead of R_ARM_CALL. 
The difference is that + // R_ARM_CALL is allowed to change the instruction to a BLX inline, which has + // no conditional version; R_ARM_JUMP24 would have to insert a veneer. + // + // MachO does not draw a distinction between the two cases, so it will treat + // fixup_arm_uncondbl and fixup_arm_condbl as identical fixups. + + // fixup_arm_uncondbl - Fixup for unconditional ARM BL instructions. + fixup_arm_uncondbl, + + // fixup_arm_condbl - Fixup for ARM BL instructions with nontrivial + // conditionalisation. + fixup_arm_condbl, // fixup_arm_blx - Fixup for ARM BLX instructions. fixup_arm_blx, diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp index 4445dcd..10d1c48 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp @@ -597,8 +597,12 @@ uint32_t ARMMCCodeEmitter:: getARMBLTargetOpValue(const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups) const { const MCOperand MO = MI.getOperand(OpIdx); - if (MO.isExpr()) - return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_bl, Fixups); + if (MO.isExpr()) { + if (HasConditionalBranch(MI)) + return ::getBranchTargetOpValue(MI, OpIdx, + ARM::fixup_arm_condbl, Fixups); + return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_uncondbl, Fixups); + } return MO.getImm() >> 2; } @@ -1330,8 +1334,8 @@ getRegisterListOpValue(const MCInst &MI, unsigned Op, // LDM/STM: // {15-0} = Bitfield of GPRs. unsigned Reg = MI.getOperand(Op).getReg(); - bool SPRRegs = llvm::ARMMCRegisterClasses[ARM::SPRRegClassID].contains(Reg); - bool DPRRegs = llvm::ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg); + bool SPRRegs = ARMMCRegisterClasses[ARM::SPRRegClassID].contains(Reg); + bool DPRRegs = ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg); unsigned Binary = 0; diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp index ed27f9f..e3512cd 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -163,10 +163,11 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT, static MCInstPrinter *createARMMCInstPrinter(const Target &T, unsigned SyntaxVariant, const MCAsmInfo &MAI, + const MCInstrInfo &MII, const MCRegisterInfo &MRI, const MCSubtargetInfo &STI) { if (SyntaxVariant == 0) - return new ARMInstPrinter(MAI, MRI, STI); + return new ARMInstPrinter(MAI, MII, MRI, STI); return 0; } diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp index 9d3da14..8057cb6 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -82,7 +82,8 @@ static bool getARMFixupKindMachOInfo(unsigned Kind, unsigned &RelocType, case ARM::fixup_arm_adr_pcrel_12: case ARM::fixup_arm_condbranch: case ARM::fixup_arm_uncondbranch: - case ARM::fixup_arm_bl: + case ARM::fixup_arm_uncondbl: + case ARM::fixup_arm_condbl: case ARM::fixup_arm_blx: RelocType = unsigned(macho::RIT_ARM_Branch24Bit); // Report as 'long', even though that is not quite accurate. diff --git a/lib/Target/ARM/README.txt b/lib/Target/ARM/README.txt index 4fcaecf..3eddda8 100644 --- a/lib/Target/ARM/README.txt +++ b/lib/Target/ARM/README.txt @@ -501,11 +501,6 @@ those operations and the ARMv6 scalar versions. 
//===---------------------------------------------------------------------===// -ARM::MOVCCr is commutable (by flipping the condition). But we need to implement -ARMInstrInfo::commuteInstruction() to support it. - -//===---------------------------------------------------------------------===// - Split out LDR (literal) from normal ARM LDR instruction. Also consider spliting LDR into imm12 and so_reg forms. This allows us to clean up some code. e.g. ARMLoadStoreOptimizer does not need to look at LDR (literal) and LDR (so_reg) diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp index 8cf7cac..e03e758 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -17,7 +17,6 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/MC/MCInst.h" using namespace llvm; diff --git a/lib/Target/ARM/Thumb1InstrInfo.h b/lib/Target/ARM/Thumb1InstrInfo.h index 27fce9b..36af204 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.h +++ b/lib/Target/ARM/Thumb1InstrInfo.h @@ -17,7 +17,6 @@ #include "ARM.h" #include "ARMBaseInstrInfo.h" #include "Thumb1RegisterInfo.h" -#include "llvm/Target/TargetInstrInfo.h" namespace llvm { class ARMSubtarget; diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp index def75dd..ecb4c2f 100644 --- a/lib/Target/ARM/Thumb2ITBlockPass.cpp +++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp @@ -154,7 +154,7 @@ Thumb2ITBlockPass::MoveCopyOutOfITBlock(MachineInstr *MI, ++I; if (I != E) { unsigned NPredReg = 0; - ARMCC::CondCodes NCC = llvm::getITInstrPredicate(I, NPredReg); + ARMCC::CondCodes NCC = getITInstrPredicate(I, NPredReg); if (NCC == CC || NCC == OCC) return true; } @@ -171,7 +171,7 @@ bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) { MachineInstr *MI = &*MBBI; DebugLoc dl = MI->getDebugLoc(); unsigned PredReg = 0; - ARMCC::CondCodes CC = llvm::getITInstrPredicate(MI, PredReg); + ARMCC::CondCodes CC = getITInstrPredicate(MI, PredReg); if (CC == ARMCC::AL) { ++MBBI; continue; @@ -207,7 +207,7 @@ bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) { MI = NMI; unsigned NPredReg = 0; - ARMCC::CondCodes NCC = llvm::getITInstrPredicate(NMI, NPredReg); + ARMCC::CondCodes NCC = getITInstrPredicate(NMI, NPredReg); if (NCC == CC || NCC == OCC) { Mask |= (NCC & 1) << Pos; // Add implicit use of ITSTATE. diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp index 2fe4b85..8ab486b 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -19,7 +19,6 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/MC/MCInst.h" #include "llvm/Support/CommandLine.h" @@ -59,7 +58,7 @@ Thumb2InstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail, // If the first instruction of Tail is predicated, we may have to update // the IT instruction. unsigned PredReg = 0; - ARMCC::CondCodes CC = llvm::getInstrPredicate(Tail, PredReg); + ARMCC::CondCodes CC = getInstrPredicate(Tail, PredReg); MachineBasicBlock::iterator MBBI = Tail; if (CC != ARMCC::AL) // Expecting at least the t2IT instruction before it. 
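
// A simplified model (hypothetical types, not the pass's real API) of the
// grouping rule Thumb2ITBlockPass applies above: an IT block covers up to
// four consecutive instructions whose predicate is either the leader's
// condition CC or its opposite OCC, which ARMCC encodes as CC with the low
// bit flipped -- hence the "Mask |= (NCC & 1) << Pos" step in the pass.
#include <vector>

enum CondCode { EQ = 0, NE = 1, AL = 14 };  // small subset for the sketch
struct Instr { CondCode Pred; };

unsigned itBlockLength(const std::vector<Instr> &Block, unsigned I) {
  CondCode CC = Block[I].Pred;
  if (CC == AL) return 0;                          // unpredicated: no IT block
  CondCode OCC = static_cast<CondCode>(CC ^ 1);    // opposite condition
  unsigned Len = 1;
  while (Len < 4 && I + Len < Block.size()) {      // architectural max is 4
    CondCode NCC = Block[I + Len].Pred;
    if (NCC != CC && NCC != OCC) break;
    ++Len;
  }
  return Len;
}
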
@@ -107,7 +106,7 @@ Thumb2InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB, } unsigned PredReg = 0; - return llvm::getITInstrPredicate(MBBI, PredReg) == ARMCC::AL; + return getITInstrPredicate(MBBI, PredReg) == ARMCC::AL; } void Thumb2InstrInfo::copyPhysReg(MachineBasicBlock &MBB, @@ -574,7 +573,7 @@ Thumb2InstrInfo::scheduleTwoAddrSource(MachineInstr *SrcMI, return; unsigned PredReg = 0; - ARMCC::CondCodes CC = llvm::getInstrPredicate(UseMI, PredReg); + ARMCC::CondCodes CC = getInstrPredicate(UseMI, PredReg); if (CC == ARMCC::AL || PredReg != ARM::CPSR) return; @@ -590,7 +589,7 @@ Thumb2InstrInfo::scheduleTwoAddrSource(MachineInstr *SrcMI, continue; MachineInstr *NMI = &*MBBI; - ARMCC::CondCodes NCC = llvm::getInstrPredicate(NMI, PredReg); + ARMCC::CondCodes NCC = getInstrPredicate(NMI, PredReg); if (!(NCC == CC || NCC == OCC) || NMI->modifiesRegister(SrcReg, &TRI) || NMI->modifiesRegister(ARM::CPSR, &TRI)) @@ -611,5 +610,5 @@ llvm::getITInstrPredicate(const MachineInstr *MI, unsigned &PredReg) { unsigned Opc = MI->getOpcode(); if (Opc == ARM::tBcc || Opc == ARM::t2Bcc) return ARMCC::AL; - return llvm::getInstrPredicate(MI, PredReg); + return getInstrPredicate(MI, PredReg); } diff --git a/lib/Target/ARM/Thumb2InstrInfo.h b/lib/Target/ARM/Thumb2InstrInfo.h index 1ae2ef1..0911f8a 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.h +++ b/lib/Target/ARM/Thumb2InstrInfo.h @@ -15,9 +15,8 @@ #define THUMB2INSTRUCTIONINFO_H #include "ARM.h" -#include "ARMInstrInfo.h" +#include "ARMBaseInstrInfo.h" #include "Thumb2RegisterInfo.h" -#include "llvm/Target/TargetInstrInfo.h" namespace llvm { class ARMSubtarget; diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp index fb9d93b..b5a397e 100644 --- a/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -851,7 +851,7 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { // If this BB loops back to itself, conservatively avoid narrowing the // first instruction that does partial flag update. bool IsSelfLoop = MBB.isSuccessor(&MBB); - MachineBasicBlock::instr_iterator MII = MBB.instr_begin(), E = MBB.instr_end(); + MachineBasicBlock::instr_iterator MII = MBB.instr_begin(),E = MBB.instr_end(); MachineBasicBlock::instr_iterator NextMII; for (; MII != E; MII = NextMII) { NextMII = llvm::next(MII); diff --git a/lib/Target/CBackend/CBackend.cpp b/lib/Target/CBackend/CBackend.cpp deleted file mode 100644 index b6b209e..0000000 --- a/lib/Target/CBackend/CBackend.cpp +++ /dev/null @@ -1,3616 +0,0 @@ -//===-- CBackend.cpp - Library for converting LLVM code to C --------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This library converts LLVM code to C code, compilable by GCC and other C -// compilers. 
-// -//===----------------------------------------------------------------------===// - -#include "CTargetMachine.h" -#include "llvm/CallingConv.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Module.h" -#include "llvm/Instructions.h" -#include "llvm/Pass.h" -#include "llvm/PassManager.h" -#include "llvm/Intrinsics.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/InlineAsm.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/Analysis/ConstantsScanner.h" -#include "llvm/Analysis/FindUsedTypes.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/IntrinsicLowering.h" -#include "llvm/Target/Mangler.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCObjectFileInfo.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Support/CallSite.h" -#include "llvm/Support/CFG.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/FormattedStream.h" -#include "llvm/Support/GetElementPtrTypeIterator.h" -#include "llvm/Support/InstVisitor.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/Host.h" -#include "llvm/Config/config.h" -#include <algorithm> -// Some ms header decided to define setjmp as _setjmp, undo this for this file. -#ifdef _MSC_VER -#undef setjmp -#endif -using namespace llvm; - -extern "C" void LLVMInitializeCBackendTarget() { - // Register the target. - RegisterTargetMachine<CTargetMachine> X(TheCBackendTarget); -} - -namespace { - class CBEMCAsmInfo : public MCAsmInfo { - public: - CBEMCAsmInfo() { - GlobalPrefix = ""; - PrivateGlobalPrefix = ""; - } - }; - - /// CWriter - This class is the main chunk of code that converts an LLVM - /// module to a C translation unit. - class CWriter : public FunctionPass, public InstVisitor<CWriter> { - formatted_raw_ostream &Out; - IntrinsicLowering *IL; - Mangler *Mang; - LoopInfo *LI; - const Module *TheModule; - const MCAsmInfo* TAsm; - const MCRegisterInfo *MRI; - const MCObjectFileInfo *MOFI; - MCContext *TCtx; - const TargetData* TD; - - std::map<const ConstantFP *, unsigned> FPConstantMap; - std::set<Function*> intrinsicPrototypesAlreadyGenerated; - std::set<const Argument*> ByValParams; - unsigned FPCounter; - unsigned OpaqueCounter; - DenseMap<const Value*, unsigned> AnonValueNumbers; - unsigned NextAnonValueNumber; - - /// UnnamedStructIDs - This contains a unique ID for each struct that is - /// either anonymous or has no name. - DenseMap<StructType*, unsigned> UnnamedStructIDs; - - public: - static char ID; - explicit CWriter(formatted_raw_ostream &o) - : FunctionPass(ID), Out(o), IL(0), Mang(0), LI(0), - TheModule(0), TAsm(0), MRI(0), MOFI(0), TCtx(0), TD(0), - OpaqueCounter(0), NextAnonValueNumber(0) { - initializeLoopInfoPass(*PassRegistry::getPassRegistry()); - FPCounter = 0; - } - - virtual const char *getPassName() const { return "C backend"; } - - void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<LoopInfo>(); - AU.setPreservesAll(); - } - - virtual bool doInitialization(Module &M); - - bool runOnFunction(Function &F) { - // Do not codegen any 'available_externally' functions at all, they have - // definitions outside the translation unit. 
- if (F.hasAvailableExternallyLinkage()) - return false; - - LI = &getAnalysis<LoopInfo>(); - - // Get rid of intrinsics we can't handle. - lowerIntrinsics(F); - - // Output all floating point constants that cannot be printed accurately. - printFloatingPointConstants(F); - - printFunction(F); - return false; - } - - virtual bool doFinalization(Module &M) { - // Free memory... - delete IL; - delete TD; - delete Mang; - delete TCtx; - delete TAsm; - delete MRI; - delete MOFI; - FPConstantMap.clear(); - ByValParams.clear(); - intrinsicPrototypesAlreadyGenerated.clear(); - UnnamedStructIDs.clear(); - return false; - } - - raw_ostream &printType(raw_ostream &Out, Type *Ty, - bool isSigned = false, - const std::string &VariableName = "", - bool IgnoreName = false, - const AttrListPtr &PAL = AttrListPtr()); - raw_ostream &printSimpleType(raw_ostream &Out, Type *Ty, - bool isSigned, - const std::string &NameSoFar = ""); - - void printStructReturnPointerFunctionType(raw_ostream &Out, - const AttrListPtr &PAL, - PointerType *Ty); - - std::string getStructName(StructType *ST); - - /// writeOperandDeref - Print the result of dereferencing the specified - /// operand with '*'. This is equivalent to printing '*' then using - /// writeOperand, but avoids excess syntax in some cases. - void writeOperandDeref(Value *Operand) { - if (isAddressExposed(Operand)) { - // Already something with an address exposed. - writeOperandInternal(Operand); - } else { - Out << "*("; - writeOperand(Operand); - Out << ")"; - } - } - - void writeOperand(Value *Operand, bool Static = false); - void writeInstComputationInline(Instruction &I); - void writeOperandInternal(Value *Operand, bool Static = false); - void writeOperandWithCast(Value* Operand, unsigned Opcode); - void writeOperandWithCast(Value* Operand, const ICmpInst &I); - bool writeInstructionCast(const Instruction &I); - - void writeMemoryAccess(Value *Operand, Type *OperandType, - bool IsVolatile, unsigned Alignment); - - private : - std::string InterpretASMConstraint(InlineAsm::ConstraintInfo& c); - - void lowerIntrinsics(Function &F); - /// Prints the definition of the intrinsic function F. Supports the - /// intrinsics which need to be explicitly defined in the CBackend. - void printIntrinsicDefinition(const Function &F, raw_ostream &Out); - - void printModuleTypes(); - void printContainedStructs(Type *Ty, SmallPtrSet<Type *, 16> &); - void printFloatingPointConstants(Function &F); - void printFloatingPointConstants(const Constant *C); - void printFunctionSignature(const Function *F, bool Prototype); - - void printFunction(Function &); - void printBasicBlock(BasicBlock *BB); - void printLoop(Loop *L); - - void printCast(unsigned opcode, Type *SrcTy, Type *DstTy); - void printConstant(Constant *CPV, bool Static); - void printConstantWithCast(Constant *CPV, unsigned Opcode); - bool printConstExprCast(const ConstantExpr *CE, bool Static); - void printConstantArray(ConstantArray *CPA, bool Static); - void printConstantVector(ConstantVector *CV, bool Static); - void printConstantDataSequential(ConstantDataSequential *CDS, bool Static); - - - /// isAddressExposed - Return true if the specified value's name needs to - /// have its address taken in order to get a C value of the correct type. - /// This happens for global variables, byval parameters, and direct allocas. 
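
// A toy restatement (hypothetical helper, not CWriter's API) of the rule
// isAddressExposed feeds into writeOperand: a value emitted as a plain C
// object names the storage itself, so the IR-level pointer to it must be
// spelled with an explicit address-of.
#include <string>

std::string pointerSpelling(const std::string &Name, bool AddressExposed) {
  return AddressExposed ? "(&" + Name + ")" : Name;
}
// e.g. a global @g becomes the C object g, and an i32* operand naming it
// is printed as (&g).
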
- bool isAddressExposed(const Value *V) const { - if (const Argument *A = dyn_cast<Argument>(V)) - return ByValParams.count(A); - return isa<GlobalVariable>(V) || isDirectAlloca(V); - } - - // isInlinableInst - Attempt to inline instructions into their uses to build - // trees as much as possible. To do this, we have to consistently decide - // what is acceptable to inline, so that variable declarations don't get - // printed and an extra copy of the expr is not emitted. - // - static bool isInlinableInst(const Instruction &I) { - // Always inline cmp instructions, even if they are shared by multiple - // expressions. GCC generates horrible code if we don't. - if (isa<CmpInst>(I)) - return true; - - // Must be an expression, must be used exactly once. If it is dead, we - // emit it inline where it would go. - if (I.getType() == Type::getVoidTy(I.getContext()) || !I.hasOneUse() || - isa<TerminatorInst>(I) || isa<CallInst>(I) || isa<PHINode>(I) || - isa<LoadInst>(I) || isa<VAArgInst>(I) || isa<InsertElementInst>(I) || - isa<InsertValueInst>(I)) - // Don't inline a load across a store or other bad things! - return false; - - // Must not be used in inline asm, extractelement, or shufflevector. - if (I.hasOneUse()) { - const Instruction &User = cast<Instruction>(*I.use_back()); - if (isInlineAsm(User) || isa<ExtractElementInst>(User) || - isa<ShuffleVectorInst>(User)) - return false; - } - - // Only inline instruction it if it's use is in the same BB as the inst. - return I.getParent() == cast<Instruction>(I.use_back())->getParent(); - } - - // isDirectAlloca - Define fixed sized allocas in the entry block as direct - // variables which are accessed with the & operator. This causes GCC to - // generate significantly better code than to emit alloca calls directly. - // - static const AllocaInst *isDirectAlloca(const Value *V) { - const AllocaInst *AI = dyn_cast<AllocaInst>(V); - if (!AI) return 0; - if (AI->isArrayAllocation()) - return 0; // FIXME: we can also inline fixed size array allocas! - if (AI->getParent() != &AI->getParent()->getParent()->getEntryBlock()) - return 0; - return AI; - } - - // isInlineAsm - Check if the instruction is a call to an inline asm chunk. 
- static bool isInlineAsm(const Instruction& I) { - if (const CallInst *CI = dyn_cast<CallInst>(&I)) - return isa<InlineAsm>(CI->getCalledValue()); - return false; - } - - // Instruction visitation functions - friend class InstVisitor<CWriter>; - - void visitReturnInst(ReturnInst &I); - void visitBranchInst(BranchInst &I); - void visitSwitchInst(SwitchInst &I); - void visitIndirectBrInst(IndirectBrInst &I); - void visitInvokeInst(InvokeInst &I) { - llvm_unreachable("Lowerinvoke pass didn't work!"); - } - void visitResumeInst(ResumeInst &I) { - llvm_unreachable("DwarfEHPrepare pass didn't work!"); - } - void visitUnreachableInst(UnreachableInst &I); - - void visitPHINode(PHINode &I); - void visitBinaryOperator(Instruction &I); - void visitICmpInst(ICmpInst &I); - void visitFCmpInst(FCmpInst &I); - - void visitCastInst (CastInst &I); - void visitSelectInst(SelectInst &I); - void visitCallInst (CallInst &I); - void visitInlineAsm(CallInst &I); - bool visitBuiltinCall(CallInst &I, Intrinsic::ID ID, bool &WroteCallee); - - void visitAllocaInst(AllocaInst &I); - void visitLoadInst (LoadInst &I); - void visitStoreInst (StoreInst &I); - void visitGetElementPtrInst(GetElementPtrInst &I); - void visitVAArgInst (VAArgInst &I); - - void visitInsertElementInst(InsertElementInst &I); - void visitExtractElementInst(ExtractElementInst &I); - void visitShuffleVectorInst(ShuffleVectorInst &SVI); - - void visitInsertValueInst(InsertValueInst &I); - void visitExtractValueInst(ExtractValueInst &I); - - void visitInstruction(Instruction &I) { -#ifndef NDEBUG - errs() << "C Writer does not know about " << I; -#endif - llvm_unreachable(0); - } - - void outputLValue(Instruction *I) { - Out << " " << GetValueName(I) << " = "; - } - - bool isGotoCodeNecessary(BasicBlock *From, BasicBlock *To); - void printPHICopiesForSuccessor(BasicBlock *CurBlock, - BasicBlock *Successor, unsigned Indent); - void printBranchToBlock(BasicBlock *CurBlock, BasicBlock *SuccBlock, - unsigned Indent); - void printGEPExpression(Value *Ptr, gep_type_iterator I, - gep_type_iterator E, bool Static); - - std::string GetValueName(const Value *Operand); - }; -} - -char CWriter::ID = 0; - - - -static std::string CBEMangle(const std::string &S) { - std::string Result; - - for (unsigned i = 0, e = S.size(); i != e; ++i) - if (isalnum(S[i]) || S[i] == '_') { - Result += S[i]; - } else { - Result += '_'; - Result += 'A'+(S[i]&15); - Result += 'A'+((S[i]>>4)&15); - Result += '_'; - } - return Result; -} - -std::string CWriter::getStructName(StructType *ST) { - if (!ST->isLiteral() && !ST->getName().empty()) - return CBEMangle("l_"+ST->getName().str()); - - return "l_unnamed_" + utostr(UnnamedStructIDs[ST]); -} - - -/// printStructReturnPointerFunctionType - This is like printType for a struct -/// return type, except, instead of printing the type as void (*)(Struct*, ...) -/// print it as "Struct (*)(...)", for struct return functions. 
-void CWriter::printStructReturnPointerFunctionType(raw_ostream &Out, - const AttrListPtr &PAL, - PointerType *TheTy) { - FunctionType *FTy = cast<FunctionType>(TheTy->getElementType()); - std::string tstr; - raw_string_ostream FunctionInnards(tstr); - FunctionInnards << " (*) ("; - bool PrintedType = false; - - FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end(); - Type *RetTy = cast<PointerType>(*I)->getElementType(); - unsigned Idx = 1; - for (++I, ++Idx; I != E; ++I, ++Idx) { - if (PrintedType) - FunctionInnards << ", "; - Type *ArgTy = *I; - if (PAL.paramHasAttr(Idx, Attribute::ByVal)) { - assert(ArgTy->isPointerTy()); - ArgTy = cast<PointerType>(ArgTy)->getElementType(); - } - printType(FunctionInnards, ArgTy, - /*isSigned=*/PAL.paramHasAttr(Idx, Attribute::SExt), ""); - PrintedType = true; - } - if (FTy->isVarArg()) { - if (!PrintedType) - FunctionInnards << " int"; //dummy argument for empty vararg functs - FunctionInnards << ", ..."; - } else if (!PrintedType) { - FunctionInnards << "void"; - } - FunctionInnards << ')'; - printType(Out, RetTy, - /*isSigned=*/PAL.paramHasAttr(0, Attribute::SExt), FunctionInnards.str()); -} - -raw_ostream & -CWriter::printSimpleType(raw_ostream &Out, Type *Ty, bool isSigned, - const std::string &NameSoFar) { - assert((Ty->isPrimitiveType() || Ty->isIntegerTy() || Ty->isVectorTy()) && - "Invalid type for printSimpleType"); - switch (Ty->getTypeID()) { - case Type::VoidTyID: return Out << "void " << NameSoFar; - case Type::IntegerTyID: { - unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth(); - if (NumBits == 1) - return Out << "bool " << NameSoFar; - else if (NumBits <= 8) - return Out << (isSigned?"signed":"unsigned") << " char " << NameSoFar; - else if (NumBits <= 16) - return Out << (isSigned?"signed":"unsigned") << " short " << NameSoFar; - else if (NumBits <= 32) - return Out << (isSigned?"signed":"unsigned") << " int " << NameSoFar; - else if (NumBits <= 64) - return Out << (isSigned?"signed":"unsigned") << " long long "<< NameSoFar; - else { - assert(NumBits <= 128 && "Bit widths > 128 not implemented yet"); - return Out << (isSigned?"llvmInt128":"llvmUInt128") << " " << NameSoFar; - } - } - case Type::FloatTyID: return Out << "float " << NameSoFar; - case Type::DoubleTyID: return Out << "double " << NameSoFar; - // Lacking emulation of FP80 on PPC, etc., we assume whichever of these is - // present matches host 'long double'. - case Type::X86_FP80TyID: - case Type::PPC_FP128TyID: - case Type::FP128TyID: return Out << "long double " << NameSoFar; - - case Type::X86_MMXTyID: - return printSimpleType(Out, Type::getInt32Ty(Ty->getContext()), isSigned, - " __attribute__((vector_size(64))) " + NameSoFar); - - case Type::VectorTyID: { - VectorType *VTy = cast<VectorType>(Ty); - return printSimpleType(Out, VTy->getElementType(), isSigned, - " __attribute__((vector_size(" + - utostr(TD->getTypeAllocSize(VTy)) + " ))) " + NameSoFar); - } - - default: -#ifndef NDEBUG - errs() << "Unknown primitive type: " << *Ty << "\n"; -#endif - llvm_unreachable(0); - } -} - -// Pass the Type* and the variable name and this prints out the variable -// declaration. 
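
// The width ladder printSimpleType walks, restated as a standalone sketch:
// only i1..i64 map onto portable C integer types; wider types fall back to
// the llvmInt128/llvmUInt128 typedefs the emitted preamble provides for
// GCC on LP64 targets.
#include <string>

std::string cIntTypeFor(unsigned NumBits, bool IsSigned) {
  if (NumBits == 1)  return "bool";
  std::string Sign = IsSigned ? "signed" : "unsigned";
  if (NumBits <= 8)  return Sign + " char";
  if (NumBits <= 16) return Sign + " short";
  if (NumBits <= 32) return Sign + " int";
  if (NumBits <= 64) return Sign + " long long";
  return IsSigned ? "llvmInt128" : "llvmUInt128";  // needs the emitted typedef
}
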
-// -raw_ostream &CWriter::printType(raw_ostream &Out, Type *Ty, - bool isSigned, const std::string &NameSoFar, - bool IgnoreName, const AttrListPtr &PAL) { - if (Ty->isPrimitiveType() || Ty->isIntegerTy() || Ty->isVectorTy()) { - printSimpleType(Out, Ty, isSigned, NameSoFar); - return Out; - } - - switch (Ty->getTypeID()) { - case Type::FunctionTyID: { - FunctionType *FTy = cast<FunctionType>(Ty); - std::string tstr; - raw_string_ostream FunctionInnards(tstr); - FunctionInnards << " (" << NameSoFar << ") ("; - unsigned Idx = 1; - for (FunctionType::param_iterator I = FTy->param_begin(), - E = FTy->param_end(); I != E; ++I) { - Type *ArgTy = *I; - if (PAL.paramHasAttr(Idx, Attribute::ByVal)) { - assert(ArgTy->isPointerTy()); - ArgTy = cast<PointerType>(ArgTy)->getElementType(); - } - if (I != FTy->param_begin()) - FunctionInnards << ", "; - printType(FunctionInnards, ArgTy, - /*isSigned=*/PAL.paramHasAttr(Idx, Attribute::SExt), ""); - ++Idx; - } - if (FTy->isVarArg()) { - if (!FTy->getNumParams()) - FunctionInnards << " int"; //dummy argument for empty vaarg functs - FunctionInnards << ", ..."; - } else if (!FTy->getNumParams()) { - FunctionInnards << "void"; - } - FunctionInnards << ')'; - printType(Out, FTy->getReturnType(), - /*isSigned=*/PAL.paramHasAttr(0, Attribute::SExt), FunctionInnards.str()); - return Out; - } - case Type::StructTyID: { - StructType *STy = cast<StructType>(Ty); - - // Check to see if the type is named. - if (!IgnoreName) - return Out << getStructName(STy) << ' ' << NameSoFar; - - Out << NameSoFar + " {\n"; - unsigned Idx = 0; - for (StructType::element_iterator I = STy->element_begin(), - E = STy->element_end(); I != E; ++I) { - Out << " "; - printType(Out, *I, false, "field" + utostr(Idx++)); - Out << ";\n"; - } - Out << '}'; - if (STy->isPacked()) - Out << " __attribute__ ((packed))"; - return Out; - } - - case Type::PointerTyID: { - PointerType *PTy = cast<PointerType>(Ty); - std::string ptrName = "*" + NameSoFar; - - if (PTy->getElementType()->isArrayTy() || - PTy->getElementType()->isVectorTy()) - ptrName = "(" + ptrName + ")"; - - if (!PAL.isEmpty()) - // Must be a function ptr cast! - return printType(Out, PTy->getElementType(), false, ptrName, true, PAL); - return printType(Out, PTy->getElementType(), false, ptrName); - } - - case Type::ArrayTyID: { - ArrayType *ATy = cast<ArrayType>(Ty); - unsigned NumElements = ATy->getNumElements(); - if (NumElements == 0) NumElements = 1; - // Arrays are wrapped in structs to allow them to have normal - // value semantics (avoiding the array "decay"). 
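
// Illustrative only (the real struct name comes from getStructName): the C
// shape this case emits for a [4 x i32], and why the wrapper is worth it --
// the struct has value semantics, so whole-array copy, assignment, and
// return all work where a bare C array would decay to a pointer.
struct l_array_4_uint { unsigned int array[4]; };

l_array_4_uint copyWhole(l_array_4_uint V) { return V; }  // copies all four
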
- Out << NameSoFar << " { "; - printType(Out, ATy->getElementType(), false, - "array[" + utostr(NumElements) + "]"); - return Out << "; }"; - } - - default: - llvm_unreachable("Unhandled case in getTypeProps!"); - } -} - -void CWriter::printConstantArray(ConstantArray *CPA, bool Static) { - Out << "{ "; - printConstant(cast<Constant>(CPA->getOperand(0)), Static); - for (unsigned i = 1, e = CPA->getNumOperands(); i != e; ++i) { - Out << ", "; - printConstant(cast<Constant>(CPA->getOperand(i)), Static); - } - Out << " }"; -} - -void CWriter::printConstantVector(ConstantVector *CP, bool Static) { - Out << "{ "; - printConstant(cast<Constant>(CP->getOperand(0)), Static); - for (unsigned i = 1, e = CP->getNumOperands(); i != e; ++i) { - Out << ", "; - printConstant(cast<Constant>(CP->getOperand(i)), Static); - } - Out << " }"; -} - -void CWriter::printConstantDataSequential(ConstantDataSequential *CDS, - bool Static) { - // As a special case, print the array as a string if it is an array of - // ubytes or an array of sbytes with positive values. - // - if (CDS->isCString()) { - Out << '\"'; - // Keep track of whether the last number was a hexadecimal escape. - bool LastWasHex = false; - - StringRef Bytes = CDS->getAsCString(); - - // Do not include the last character, which we know is null - for (unsigned i = 0, e = Bytes.size(); i != e; ++i) { - unsigned char C = Bytes[i]; - - // Print it out literally if it is a printable character. The only thing - // to be careful about is when the last letter output was a hex escape - // code, in which case we have to be careful not to print out hex digits - // explicitly (the C compiler thinks it is a continuation of the previous - // character, sheesh...) - // - if (isprint(C) && (!LastWasHex || !isxdigit(C))) { - LastWasHex = false; - if (C == '"' || C == '\\') - Out << "\\" << (char)C; - else - Out << (char)C; - } else { - LastWasHex = false; - switch (C) { - case '\n': Out << "\\n"; break; - case '\t': Out << "\\t"; break; - case '\r': Out << "\\r"; break; - case '\v': Out << "\\v"; break; - case '\a': Out << "\\a"; break; - case '\"': Out << "\\\""; break; - case '\'': Out << "\\\'"; break; - default: - Out << "\\x"; - Out << (char)(( C/16 < 10) ? ( C/16 +'0') : ( C/16 -10+'A')); - Out << (char)(((C&15) < 10) ? ((C&15)+'0') : ((C&15)-10+'A')); - LastWasHex = true; - break; - } - } - } - Out << '\"'; - } else { - Out << "{ "; - printConstant(CDS->getElementAsConstant(0), Static); - for (unsigned i = 1, e = CDS->getNumElements(); i != e; ++i) { - Out << ", "; - printConstant(CDS->getElementAsConstant(i), Static); - } - Out << " }"; - } -} - - -// isFPCSafeToPrint - Returns true if we may assume that CFP may be written out -// textually as a double (rather than as a reference to a stack-allocated -// variable). We decide this by converting CFP to a string and back into a -// double, and then checking whether the conversion results in a bit-equal -// double to the original value of CFP. This depends on us and the target C -// compiler agreeing on the conversion process (which is pretty likely since we -// only deal in IEEE FP). -// -static bool isFPCSafeToPrint(const ConstantFP *CFP) { - bool ignored; - // Do long doubles in hex for now. 
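
// The same safety test in miniature, assuming plain double and the %a
// path: print the value, reparse it, and only call it printable if the
// round trip is bit-identical. A plain == comparison would be wrong here:
// it ignores the sign of zero and always fails for NaN.
#include <cstdio>
#include <cstdlib>
#include <cstring>

bool safeToPrintTextually(double V) {
  char Buf[64];
  std::snprintf(Buf, sizeof Buf, "%a", V);        // C99 hex-float output
  double Back = std::strtod(Buf, nullptr);        // C99 hex-float input
  return std::memcmp(&Back, &V, sizeof V) == 0;   // bitwise, like APFloat
}
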
- if (CFP->getType() != Type::getFloatTy(CFP->getContext()) && - CFP->getType() != Type::getDoubleTy(CFP->getContext())) - return false; - APFloat APF = APFloat(CFP->getValueAPF()); // copy - if (CFP->getType() == Type::getFloatTy(CFP->getContext())) - APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &ignored); -#if HAVE_PRINTF_A && ENABLE_CBE_PRINTF_A - char Buffer[100]; - sprintf(Buffer, "%a", APF.convertToDouble()); - if (!strncmp(Buffer, "0x", 2) || - !strncmp(Buffer, "-0x", 3) || - !strncmp(Buffer, "+0x", 3)) - return APF.bitwiseIsEqual(APFloat(atof(Buffer))); - return false; -#else - std::string StrVal = ftostr(APF); - - while (StrVal[0] == ' ') - StrVal.erase(StrVal.begin()); - - // Check to make sure that the stringized number is not some string like "Inf" - // or NaN. Check that the string matches the "[-+]?[0-9]" regex. - if ((StrVal[0] >= '0' && StrVal[0] <= '9') || - ((StrVal[0] == '-' || StrVal[0] == '+') && - (StrVal[1] >= '0' && StrVal[1] <= '9'))) - // Reparse stringized version! - return APF.bitwiseIsEqual(APFloat(atof(StrVal.c_str()))); - return false; -#endif -} - -/// Print out the casting for a cast operation. This does the double casting -/// necessary for conversion to the destination type, if necessary. -/// @brief Print a cast -void CWriter::printCast(unsigned opc, Type *SrcTy, Type *DstTy) { - // Print the destination type cast - switch (opc) { - case Instruction::UIToFP: - case Instruction::SIToFP: - case Instruction::IntToPtr: - case Instruction::Trunc: - case Instruction::BitCast: - case Instruction::FPExt: - case Instruction::FPTrunc: // For these the DstTy sign doesn't matter - Out << '('; - printType(Out, DstTy); - Out << ')'; - break; - case Instruction::ZExt: - case Instruction::PtrToInt: - case Instruction::FPToUI: // For these, make sure we get an unsigned dest - Out << '('; - printSimpleType(Out, DstTy, false); - Out << ')'; - break; - case Instruction::SExt: - case Instruction::FPToSI: // For these, make sure we get a signed dest - Out << '('; - printSimpleType(Out, DstTy, true); - Out << ')'; - break; - default: - llvm_unreachable("Invalid cast opcode"); - } - - // Print the source type cast - switch (opc) { - case Instruction::UIToFP: - case Instruction::ZExt: - Out << '('; - printSimpleType(Out, SrcTy, false); - Out << ')'; - break; - case Instruction::SIToFP: - case Instruction::SExt: - Out << '('; - printSimpleType(Out, SrcTy, true); - Out << ')'; - break; - case Instruction::IntToPtr: - case Instruction::PtrToInt: - // Avoid "cast to pointer from integer of different size" warnings - Out << "(unsigned long)"; - break; - case Instruction::Trunc: - case Instruction::BitCast: - case Instruction::FPExt: - case Instruction::FPTrunc: - case Instruction::FPToSI: - case Instruction::FPToUI: - break; // These don't need a source cast. - default: - llvm_unreachable("Invalid cast opcode"); - } -} - -// printConstant - The LLVM Constant to C Constant converter. 
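
// The double-cast pattern printCast produces, as a compilable sketch: the
// inner cast pins down the signedness of the source and the outer cast
// forms the destination, e.g. "((signed int)(signed char)x)" for a sign
// extension and "((unsigned int)(unsigned char)x)" for a zero extension.
int sextI8ToI32(unsigned char X) {
  return (signed int)(signed char)X;       // inner cast: signed source
}
unsigned zextI8ToI32(signed char X) {
  return (unsigned int)(unsigned char)X;   // inner cast: unsigned source
}
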
-void CWriter::printConstant(Constant *CPV, bool Static) { - if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CPV)) { - switch (CE->getOpcode()) { - case Instruction::Trunc: - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPTrunc: - case Instruction::FPExt: - case Instruction::UIToFP: - case Instruction::SIToFP: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::BitCast: - Out << "("; - printCast(CE->getOpcode(), CE->getOperand(0)->getType(), CE->getType()); - if (CE->getOpcode() == Instruction::SExt && - CE->getOperand(0)->getType() == Type::getInt1Ty(CPV->getContext())) { - // Make sure we really sext from bool here by subtracting from 0 - Out << "0-"; - } - printConstant(CE->getOperand(0), Static); - if (CE->getType() == Type::getInt1Ty(CPV->getContext()) && - (CE->getOpcode() == Instruction::Trunc || - CE->getOpcode() == Instruction::FPToUI || - CE->getOpcode() == Instruction::FPToSI || - CE->getOpcode() == Instruction::PtrToInt)) { - // Make sure we really truncate to bool here by anding with 1 - Out << "&1u"; - } - Out << ')'; - return; - - case Instruction::GetElementPtr: - Out << "("; - printGEPExpression(CE->getOperand(0), gep_type_begin(CPV), - gep_type_end(CPV), Static); - Out << ")"; - return; - case Instruction::Select: - Out << '('; - printConstant(CE->getOperand(0), Static); - Out << '?'; - printConstant(CE->getOperand(1), Static); - Out << ':'; - printConstant(CE->getOperand(2), Static); - Out << ')'; - return; - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::SDiv: - case Instruction::UDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - case Instruction::ICmp: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - { - Out << '('; - bool NeedsClosingParens = printConstExprCast(CE, Static); - printConstantWithCast(CE->getOperand(0), CE->getOpcode()); - switch (CE->getOpcode()) { - case Instruction::Add: - case Instruction::FAdd: Out << " + "; break; - case Instruction::Sub: - case Instruction::FSub: Out << " - "; break; - case Instruction::Mul: - case Instruction::FMul: Out << " * "; break; - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: Out << " % "; break; - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: Out << " / "; break; - case Instruction::And: Out << " & "; break; - case Instruction::Or: Out << " | "; break; - case Instruction::Xor: Out << " ^ "; break; - case Instruction::Shl: Out << " << "; break; - case Instruction::LShr: - case Instruction::AShr: Out << " >> "; break; - case Instruction::ICmp: - switch (CE->getPredicate()) { - case ICmpInst::ICMP_EQ: Out << " == "; break; - case ICmpInst::ICMP_NE: Out << " != "; break; - case ICmpInst::ICMP_SLT: - case ICmpInst::ICMP_ULT: Out << " < "; break; - case ICmpInst::ICMP_SLE: - case ICmpInst::ICMP_ULE: Out << " <= "; break; - case ICmpInst::ICMP_SGT: - case ICmpInst::ICMP_UGT: Out << " > "; break; - case ICmpInst::ICMP_SGE: - case ICmpInst::ICMP_UGE: Out << " >= "; break; - default: llvm_unreachable("Illegal ICmp predicate"); - } - break; - default: llvm_unreachable("Illegal opcode here!"); - } - printConstantWithCast(CE->getOperand(1), CE->getOpcode()); - if 
(NeedsClosingParens) - Out << "))"; - Out << ')'; - return; - } - case Instruction::FCmp: { - Out << '('; - bool NeedsClosingParens = printConstExprCast(CE, Static); - if (CE->getPredicate() == FCmpInst::FCMP_FALSE) - Out << "0"; - else if (CE->getPredicate() == FCmpInst::FCMP_TRUE) - Out << "1"; - else { - const char* op = 0; - switch (CE->getPredicate()) { - default: llvm_unreachable("Illegal FCmp predicate"); - case FCmpInst::FCMP_ORD: op = "ord"; break; - case FCmpInst::FCMP_UNO: op = "uno"; break; - case FCmpInst::FCMP_UEQ: op = "ueq"; break; - case FCmpInst::FCMP_UNE: op = "une"; break; - case FCmpInst::FCMP_ULT: op = "ult"; break; - case FCmpInst::FCMP_ULE: op = "ule"; break; - case FCmpInst::FCMP_UGT: op = "ugt"; break; - case FCmpInst::FCMP_UGE: op = "uge"; break; - case FCmpInst::FCMP_OEQ: op = "oeq"; break; - case FCmpInst::FCMP_ONE: op = "one"; break; - case FCmpInst::FCMP_OLT: op = "olt"; break; - case FCmpInst::FCMP_OLE: op = "ole"; break; - case FCmpInst::FCMP_OGT: op = "ogt"; break; - case FCmpInst::FCMP_OGE: op = "oge"; break; - } - Out << "llvm_fcmp_" << op << "("; - printConstantWithCast(CE->getOperand(0), CE->getOpcode()); - Out << ", "; - printConstantWithCast(CE->getOperand(1), CE->getOpcode()); - Out << ")"; - } - if (NeedsClosingParens) - Out << "))"; - Out << ')'; - return; - } - default: -#ifndef NDEBUG - errs() << "CWriter Error: Unhandled constant expression: " - << *CE << "\n"; -#endif - llvm_unreachable(0); - } - } else if (isa<UndefValue>(CPV) && CPV->getType()->isSingleValueType()) { - Out << "(("; - printType(Out, CPV->getType()); // sign doesn't matter - Out << ")/*UNDEF*/"; - if (!CPV->getType()->isVectorTy()) { - Out << "0)"; - } else { - Out << "{})"; - } - return; - } - - if (ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) { - Type* Ty = CI->getType(); - if (Ty == Type::getInt1Ty(CPV->getContext())) - Out << (CI->getZExtValue() ? '1' : '0'); - else if (Ty == Type::getInt32Ty(CPV->getContext())) - Out << CI->getZExtValue() << 'u'; - else if (Ty->getPrimitiveSizeInBits() > 32) - Out << CI->getZExtValue() << "ull"; - else { - Out << "(("; - printSimpleType(Out, Ty, false) << ')'; - if (CI->isMinValue(true)) - Out << CI->getZExtValue() << 'u'; - else - Out << CI->getSExtValue(); - Out << ')'; - } - return; - } - - switch (CPV->getType()->getTypeID()) { - case Type::FloatTyID: - case Type::DoubleTyID: - case Type::X86_FP80TyID: - case Type::PPC_FP128TyID: - case Type::FP128TyID: { - ConstantFP *FPC = cast<ConstantFP>(CPV); - std::map<const ConstantFP*, unsigned>::iterator I = FPConstantMap.find(FPC); - if (I != FPConstantMap.end()) { - // Because of FP precision problems we must load from a stack allocated - // value that holds the value in hex. - Out << "(*(" << (FPC->getType() == Type::getFloatTy(CPV->getContext()) ? - "float" : - FPC->getType() == Type::getDoubleTy(CPV->getContext()) ? - "double" : - "long double") - << "*)&FPConstant" << I->second << ')'; - } else { - double V; - if (FPC->getType() == Type::getFloatTy(CPV->getContext())) - V = FPC->getValueAPF().convertToFloat(); - else if (FPC->getType() == Type::getDoubleTy(CPV->getContext())) - V = FPC->getValueAPF().convertToDouble(); - else { - // Long double. Convert the number to double, discarding precision. - // This is not awesome, but it at least makes the CBE output somewhat - // useful. 
- APFloat Tmp = FPC->getValueAPF(); - bool LosesInfo; - Tmp.convert(APFloat::IEEEdouble, APFloat::rmTowardZero, &LosesInfo); - V = Tmp.convertToDouble(); - } - - if (IsNAN(V)) { - // The value is NaN - - // FIXME the actual NaN bits should be emitted. - // The prefix for a quiet NaN is 0x7FF8. For a signalling NaN, - // it's 0x7ff4. - const unsigned long QuietNaN = 0x7ff8UL; - //const unsigned long SignalNaN = 0x7ff4UL; - - // We need to grab the first part of the FP # - char Buffer[100]; - - uint64_t ll = DoubleToBits(V); - sprintf(Buffer, "0x%llx", static_cast<long long>(ll)); - - std::string Num(&Buffer[0], &Buffer[6]); - unsigned long Val = strtoul(Num.c_str(), 0, 16); - - if (FPC->getType() == Type::getFloatTy(FPC->getContext())) - Out << "LLVM_NAN" << (Val == QuietNaN ? "" : "S") << "F(\"" - << Buffer << "\") /*nan*/ "; - else - Out << "LLVM_NAN" << (Val == QuietNaN ? "" : "S") << "(\"" - << Buffer << "\") /*nan*/ "; - } else if (IsInf(V)) { - // The value is Inf - if (V < 0) Out << '-'; - Out << "LLVM_INF" << - (FPC->getType() == Type::getFloatTy(FPC->getContext()) ? "F" : "") - << " /*inf*/ "; - } else { - std::string Num; -#if HAVE_PRINTF_A && ENABLE_CBE_PRINTF_A - // Print out the constant as a floating point number. - char Buffer[100]; - sprintf(Buffer, "%a", V); - Num = Buffer; -#else - Num = ftostr(FPC->getValueAPF()); -#endif - Out << Num; - } - } - break; - } - - case Type::ArrayTyID: - // Use C99 compound expression literal initializer syntax. - if (!Static) { - Out << "("; - printType(Out, CPV->getType()); - Out << ")"; - } - Out << "{ "; // Arrays are wrapped in struct types. - if (ConstantArray *CA = dyn_cast<ConstantArray>(CPV)) { - printConstantArray(CA, Static); - } else if (ConstantDataSequential *CDS = - dyn_cast<ConstantDataSequential>(CPV)) { - printConstantDataSequential(CDS, Static); - } else { - assert(isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV)); - ArrayType *AT = cast<ArrayType>(CPV->getType()); - Out << '{'; - if (AT->getNumElements()) { - Out << ' '; - Constant *CZ = Constant::getNullValue(AT->getElementType()); - printConstant(CZ, Static); - for (unsigned i = 1, e = AT->getNumElements(); i != e; ++i) { - Out << ", "; - printConstant(CZ, Static); - } - } - Out << " }"; - } - Out << " }"; // Arrays are wrapped in struct types. - break; - - case Type::VectorTyID: - // Use C99 compound expression literal initializer syntax. - if (!Static) { - Out << "("; - printType(Out, CPV->getType()); - Out << ")"; - } - if (ConstantVector *CV = dyn_cast<ConstantVector>(CPV)) { - printConstantVector(CV, Static); - } else if (ConstantDataSequential *CDS = - dyn_cast<ConstantDataSequential>(CPV)) { - printConstantDataSequential(CDS, Static); - } else { - assert(isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV)); - VectorType *VT = cast<VectorType>(CPV->getType()); - Out << "{ "; - Constant *CZ = Constant::getNullValue(VT->getElementType()); - printConstant(CZ, Static); - for (unsigned i = 1, e = VT->getNumElements(); i != e; ++i) { - Out << ", "; - printConstant(CZ, Static); - } - Out << " }"; - } - break; - - case Type::StructTyID: - // Use C99 compound expression literal initializer syntax. 
- if (!Static) { - Out << "("; - printType(Out, CPV->getType()); - Out << ")"; - } - if (isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV)) { - StructType *ST = cast<StructType>(CPV->getType()); - Out << '{'; - if (ST->getNumElements()) { - Out << ' '; - printConstant(Constant::getNullValue(ST->getElementType(0)), Static); - for (unsigned i = 1, e = ST->getNumElements(); i != e; ++i) { - Out << ", "; - printConstant(Constant::getNullValue(ST->getElementType(i)), Static); - } - } - Out << " }"; - } else { - Out << '{'; - if (CPV->getNumOperands()) { - Out << ' '; - printConstant(cast<Constant>(CPV->getOperand(0)), Static); - for (unsigned i = 1, e = CPV->getNumOperands(); i != e; ++i) { - Out << ", "; - printConstant(cast<Constant>(CPV->getOperand(i)), Static); - } - } - Out << " }"; - } - break; - - case Type::PointerTyID: - if (isa<ConstantPointerNull>(CPV)) { - Out << "(("; - printType(Out, CPV->getType()); // sign doesn't matter - Out << ")/*NULL*/0)"; - break; - } else if (GlobalValue *GV = dyn_cast<GlobalValue>(CPV)) { - writeOperand(GV, Static); - break; - } - // FALL THROUGH - default: -#ifndef NDEBUG - errs() << "Unknown constant type: " << *CPV << "\n"; -#endif - llvm_unreachable(0); - } -} - -// Some constant expressions need to be casted back to the original types -// because their operands were casted to the expected type. This function takes -// care of detecting that case and printing the cast for the ConstantExpr. -bool CWriter::printConstExprCast(const ConstantExpr* CE, bool Static) { - bool NeedsExplicitCast = false; - Type *Ty = CE->getOperand(0)->getType(); - bool TypeIsSigned = false; - switch (CE->getOpcode()) { - case Instruction::Add: - case Instruction::Sub: - case Instruction::Mul: - // We need to cast integer arithmetic so that it is always performed - // as unsigned, to avoid undefined behavior on overflow. - case Instruction::LShr: - case Instruction::URem: - case Instruction::UDiv: NeedsExplicitCast = true; break; - case Instruction::AShr: - case Instruction::SRem: - case Instruction::SDiv: NeedsExplicitCast = true; TypeIsSigned = true; break; - case Instruction::SExt: - Ty = CE->getType(); - NeedsExplicitCast = true; - TypeIsSigned = true; - break; - case Instruction::ZExt: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::FPExt: - case Instruction::UIToFP: - case Instruction::SIToFP: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::BitCast: - Ty = CE->getType(); - NeedsExplicitCast = true; - break; - default: break; - } - if (NeedsExplicitCast) { - Out << "(("; - if (Ty->isIntegerTy() && Ty != Type::getInt1Ty(Ty->getContext())) - printSimpleType(Out, Ty, TypeIsSigned); - else - printType(Out, Ty); // not integer, sign doesn't matter - Out << ")("; - } - return NeedsExplicitCast; -} - -// Print a constant assuming that it is the operand for a given Opcode. The -// opcodes that care about sign need to cast their operands to the expected -// type before the operation proceeds. This function does the casting. -void CWriter::printConstantWithCast(Constant* CPV, unsigned Opcode) { - - // Extract the operand's type, we'll need it. - Type* OpTy = CPV->getType(); - - // Indicate whether to do the cast or not. - bool shouldCast = false; - bool typeIsSigned = false; - - // Based on the Opcode for which this Constant is being written, determine - // the new type to which the operand should be casted by setting the value - // of OpTy. 
If we change OpTy, also set shouldCast to true so it gets - // casted below. - switch (Opcode) { - default: - // for most instructions, it doesn't matter - break; - case Instruction::Add: - case Instruction::Sub: - case Instruction::Mul: - // We need to cast integer arithmetic so that it is always performed - // as unsigned, to avoid undefined behavior on overflow. - case Instruction::LShr: - case Instruction::UDiv: - case Instruction::URem: - shouldCast = true; - break; - case Instruction::AShr: - case Instruction::SDiv: - case Instruction::SRem: - shouldCast = true; - typeIsSigned = true; - break; - } - - // Write out the casted constant if we should, otherwise just write the - // operand. - if (shouldCast) { - Out << "(("; - printSimpleType(Out, OpTy, typeIsSigned); - Out << ")"; - printConstant(CPV, false); - Out << ")"; - } else - printConstant(CPV, false); -} - -std::string CWriter::GetValueName(const Value *Operand) { - - // Resolve potential alias. - if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(Operand)) { - if (const Value *V = GA->resolveAliasedGlobal(false)) - Operand = V; - } - - // Mangle globals with the standard mangler interface for LLC compatibility. - if (const GlobalValue *GV = dyn_cast<GlobalValue>(Operand)) { - SmallString<128> Str; - Mang->getNameWithPrefix(Str, GV, false); - return CBEMangle(Str.str().str()); - } - - std::string Name = Operand->getName(); - - if (Name.empty()) { // Assign unique names to local temporaries. - unsigned &No = AnonValueNumbers[Operand]; - if (No == 0) - No = ++NextAnonValueNumber; - Name = "tmp__" + utostr(No); - } - - std::string VarName; - VarName.reserve(Name.capacity()); - - for (std::string::iterator I = Name.begin(), E = Name.end(); - I != E; ++I) { - char ch = *I; - - if (!((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || - (ch >= '0' && ch <= '9') || ch == '_')) { - char buffer[5]; - sprintf(buffer, "_%x_", ch); - VarName += buffer; - } else - VarName += ch; - } - - return "llvm_cbe_" + VarName; -} - -/// writeInstComputationInline - Emit the computation for the specified -/// instruction inline, with no destination provided. -void CWriter::writeInstComputationInline(Instruction &I) { - // We can't currently support integer types other than 1, 8, 16, 32, 64. - // Validate this. - Type *Ty = I.getType(); - if (Ty->isIntegerTy() && (Ty!=Type::getInt1Ty(I.getContext()) && - Ty!=Type::getInt8Ty(I.getContext()) && - Ty!=Type::getInt16Ty(I.getContext()) && - Ty!=Type::getInt32Ty(I.getContext()) && - Ty!=Type::getInt64Ty(I.getContext()))) { - report_fatal_error("The C backend does not currently support integer " - "types of widths other than 1, 8, 16, 32, 64.\n" - "This is being tracked as PR 4158."); - } - - // If this is a non-trivial bool computation, make sure to truncate down to - // a 1 bit value. This is important because we want "add i1 x, y" to return - // "0" when x and y are true, not "2" for example. - bool NeedBoolTrunc = false; - if (I.getType() == Type::getInt1Ty(I.getContext()) && - !isa<ICmpInst>(I) && !isa<FCmpInst>(I)) - NeedBoolTrunc = true; - - if (NeedBoolTrunc) - Out << "(("; - - visit(I); - - if (NeedBoolTrunc) - Out << ")&1)"; -} - - -void CWriter::writeOperandInternal(Value *Operand, bool Static) { - if (Instruction *I = dyn_cast<Instruction>(Operand)) - // Should we inline this instruction to build a tree? 
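
// GetValueName's identifier cleanup as a self-contained sketch: anything
// outside [A-Za-z0-9_] becomes an _XX_ hex escape, so every emitted name
// is a valid C identifier under the llvm_cbe_ prefix.
#include <cstdio>
#include <string>

std::string sanitizeForC(const std::string &Name) {
  std::string Clean;
  for (unsigned char Ch : Name) {
    if ((Ch >= 'a' && Ch <= 'z') || (Ch >= 'A' && Ch <= 'Z') ||
        (Ch >= '0' && Ch <= '9') || Ch == '_') {
      Clean += static_cast<char>(Ch);
    } else {
      char Buf[8];
      std::snprintf(Buf, sizeof Buf, "_%x_", Ch);
      Clean += Buf;
    }
  }
  return "llvm_cbe_" + Clean;
}
// sanitizeForC("a.b") == "llvm_cbe_a_2e_b"
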
- if (isInlinableInst(*I) && !isDirectAlloca(I)) { - Out << '('; - writeInstComputationInline(*I); - Out << ')'; - return; - } - - Constant* CPV = dyn_cast<Constant>(Operand); - - if (CPV && !isa<GlobalValue>(CPV)) - printConstant(CPV, Static); - else - Out << GetValueName(Operand); -} - -void CWriter::writeOperand(Value *Operand, bool Static) { - bool isAddressImplicit = isAddressExposed(Operand); - if (isAddressImplicit) - Out << "(&"; // Global variables are referenced as their addresses by llvm - - writeOperandInternal(Operand, Static); - - if (isAddressImplicit) - Out << ')'; -} - -// Some instructions need to have their result value casted back to the -// original types because their operands were casted to the expected type. -// This function takes care of detecting that case and printing the cast -// for the Instruction. -bool CWriter::writeInstructionCast(const Instruction &I) { - Type *Ty = I.getOperand(0)->getType(); - switch (I.getOpcode()) { - case Instruction::Add: - case Instruction::Sub: - case Instruction::Mul: - // We need to cast integer arithmetic so that it is always performed - // as unsigned, to avoid undefined behavior on overflow. - case Instruction::LShr: - case Instruction::URem: - case Instruction::UDiv: - Out << "(("; - printSimpleType(Out, Ty, false); - Out << ")("; - return true; - case Instruction::AShr: - case Instruction::SRem: - case Instruction::SDiv: - Out << "(("; - printSimpleType(Out, Ty, true); - Out << ")("; - return true; - default: break; - } - return false; -} - -// Write the operand with a cast to another type based on the Opcode being used. -// This will be used in cases where an instruction has specific type -// requirements (usually signedness) for its operands. -void CWriter::writeOperandWithCast(Value* Operand, unsigned Opcode) { - - // Extract the operand's type, we'll need it. - Type* OpTy = Operand->getType(); - - // Indicate whether to do the cast or not. - bool shouldCast = false; - - // Indicate whether the cast should be to a signed type or not. - bool castIsSigned = false; - - // Based on the Opcode for which this Operand is being written, determine - // the new type to which the operand should be casted by setting the value - // of OpTy. If we change OpTy, also set shouldCast to true. - switch (Opcode) { - default: - // for most instructions, it doesn't matter - break; - case Instruction::Add: - case Instruction::Sub: - case Instruction::Mul: - // We need to cast integer arithmetic so that it is always performed - // as unsigned, to avoid undefined behavior on overflow. - case Instruction::LShr: - case Instruction::UDiv: - case Instruction::URem: // Cast to unsigned first - shouldCast = true; - castIsSigned = false; - break; - case Instruction::GetElementPtr: - case Instruction::AShr: - case Instruction::SDiv: - case Instruction::SRem: // Cast to signed first - shouldCast = true; - castIsSigned = true; - break; - } - - // Write out the casted operand if we should, otherwise just write the - // operand. - if (shouldCast) { - Out << "(("; - printSimpleType(Out, OpTy, castIsSigned); - Out << ")"; - writeOperand(Operand); - Out << ")"; - } else - writeOperand(Operand); -} - -// Write the operand with a cast to another type based on the icmp predicate -// being used. -void CWriter::writeOperandWithCast(Value* Operand, const ICmpInst &Cmp) { - // This has to do a cast to ensure the operand has the right signedness. 
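
// Why the casts to unsigned matter, in one compilable line: signed
// overflow is undefined behavior in C, while LLVM's add/sub/mul are
// defined to wrap in two's complement, so the backend must route the
// arithmetic through unsigned and convert back.
#include <cstdint>

int32_t wrappingAdd(int32_t A, int32_t B) {
  // Unsigned addition wraps mod 2^32; the conversion back reinterprets
  // the bits as two's complement on the targets LLVM cares about.
  return (int32_t)((uint32_t)A + (uint32_t)B);
}
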
- // Also, if the operand is a pointer, we make sure to cast to an integer when - // doing the comparison both for signedness and so that the C compiler doesn't - // optimize things like "p < NULL" to false (p may contain an integer value - // f.e.). - bool shouldCast = Cmp.isRelational(); - - // Write out the casted operand if we should, otherwise just write the - // operand. - if (!shouldCast) { - writeOperand(Operand); - return; - } - - // Should this be a signed comparison? If so, convert to signed. - bool castIsSigned = Cmp.isSigned(); - - // If the operand was a pointer, convert to a large integer type. - Type* OpTy = Operand->getType(); - if (OpTy->isPointerTy()) - OpTy = TD->getIntPtrType(Operand->getContext()); - - Out << "(("; - printSimpleType(Out, OpTy, castIsSigned); - Out << ")"; - writeOperand(Operand); - Out << ")"; -} - -// generateCompilerSpecificCode - This is where we add conditional compilation -// directives to cater to specific compilers as need be. -// -static void generateCompilerSpecificCode(formatted_raw_ostream& Out, - const TargetData *TD) { - // Alloca is hard to get, and we don't want to include stdlib.h here. - Out << "/* get a declaration for alloca */\n" - << "#if defined(__CYGWIN__) || defined(__MINGW32__)\n" - << "#define alloca(x) __builtin_alloca((x))\n" - << "#define _alloca(x) __builtin_alloca((x))\n" - << "#elif defined(__APPLE__)\n" - << "extern void *__builtin_alloca(unsigned long);\n" - << "#define alloca(x) __builtin_alloca(x)\n" - << "#define longjmp _longjmp\n" - << "#define setjmp _setjmp\n" - << "#elif defined(__sun__)\n" - << "#if defined(__sparcv9)\n" - << "extern void *__builtin_alloca(unsigned long);\n" - << "#else\n" - << "extern void *__builtin_alloca(unsigned int);\n" - << "#endif\n" - << "#define alloca(x) __builtin_alloca(x)\n" - << "#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__arm__)\n" - << "#define alloca(x) __builtin_alloca(x)\n" - << "#elif defined(_MSC_VER)\n" - << "#define inline _inline\n" - << "#define alloca(x) _alloca(x)\n" - << "#else\n" - << "#include <alloca.h>\n" - << "#endif\n\n"; - - // We output GCC specific attributes to preserve 'linkonce'ness on globals. - // If we aren't being compiled with GCC, just drop these attributes. - Out << "#ifndef __GNUC__ /* Can only support \"linkonce\" vars with GCC */\n" - << "#define __attribute__(X)\n" - << "#endif\n\n"; - - // On Mac OS X, "external weak" is spelled "__attribute__((weak_import))". - Out << "#if defined(__GNUC__) && defined(__APPLE_CC__)\n" - << "#define __EXTERNAL_WEAK__ __attribute__((weak_import))\n" - << "#elif defined(__GNUC__)\n" - << "#define __EXTERNAL_WEAK__ __attribute__((weak))\n" - << "#else\n" - << "#define __EXTERNAL_WEAK__\n" - << "#endif\n\n"; - - // For now, turn off the weak linkage attribute on Mac OS X. (See above.) - Out << "#if defined(__GNUC__) && defined(__APPLE_CC__)\n" - << "#define __ATTRIBUTE_WEAK__\n" - << "#elif defined(__GNUC__)\n" - << "#define __ATTRIBUTE_WEAK__ __attribute__((weak))\n" - << "#else\n" - << "#define __ATTRIBUTE_WEAK__\n" - << "#endif\n\n"; - - // Add hidden visibility support. FIXME: APPLE_CC? - Out << "#if defined(__GNUC__)\n" - << "#define __HIDDEN__ __attribute__((visibility(\"hidden\")))\n" - << "#endif\n\n"; - - // Define NaN and Inf as GCC builtins if using GCC, as 0 otherwise - // From the GCC documentation: - // - // double __builtin_nan (const char *str) - // - // This is an implementation of the ISO C99 function nan. 
- // - // Since ISO C99 defines this function in terms of strtod, which we do - // not implement, a description of the parsing is in order. The string is - // parsed as by strtol; that is, the base is recognized by leading 0 or - // 0x prefixes. The number parsed is placed in the significand such that - // the least significant bit of the number is at the least significant - // bit of the significand. The number is truncated to fit the significand - // field provided. The significand is forced to be a quiet NaN. - // - // This function, if given a string literal, is evaluated early enough - // that it is considered a compile-time constant. - // - // float __builtin_nanf (const char *str) - // - // Similar to __builtin_nan, except the return type is float. - // - // double __builtin_inf (void) - // - // Similar to __builtin_huge_val, except a warning is generated if the - // target floating-point format does not support infinities. This - // function is suitable for implementing the ISO C99 macro INFINITY. - // - // float __builtin_inff (void) - // - // Similar to __builtin_inf, except the return type is float. - Out << "#ifdef __GNUC__\n" - << "#define LLVM_NAN(NanStr) __builtin_nan(NanStr) /* Double */\n" - << "#define LLVM_NANF(NanStr) __builtin_nanf(NanStr) /* Float */\n" - << "#define LLVM_NANS(NanStr) __builtin_nans(NanStr) /* Double */\n" - << "#define LLVM_NANSF(NanStr) __builtin_nansf(NanStr) /* Float */\n" - << "#define LLVM_INF __builtin_inf() /* Double */\n" - << "#define LLVM_INFF __builtin_inff() /* Float */\n" - << "#define LLVM_PREFETCH(addr,rw,locality) " - "__builtin_prefetch(addr,rw,locality)\n" - << "#define __ATTRIBUTE_CTOR__ __attribute__((constructor))\n" - << "#define __ATTRIBUTE_DTOR__ __attribute__((destructor))\n" - << "#define LLVM_ASM __asm__\n" - << "#else\n" - << "#define LLVM_NAN(NanStr) ((double)0.0) /* Double */\n" - << "#define LLVM_NANF(NanStr) 0.0F /* Float */\n" - << "#define LLVM_NANS(NanStr) ((double)0.0) /* Double */\n" - << "#define LLVM_NANSF(NanStr) 0.0F /* Float */\n" - << "#define LLVM_INF ((double)0.0) /* Double */\n" - << "#define LLVM_INFF 0.0F /* Float */\n" - << "#define LLVM_PREFETCH(addr,rw,locality) /* PREFETCH */\n" - << "#define __ATTRIBUTE_CTOR__\n" - << "#define __ATTRIBUTE_DTOR__\n" - << "#define LLVM_ASM(X)\n" - << "#endif\n\n"; - - Out << "#if __GNUC__ < 4 /* Old GCC's, or compilers not GCC */ \n" - << "#define __builtin_stack_save() 0 /* not implemented */\n" - << "#define __builtin_stack_restore(X) /* noop */\n" - << "#endif\n\n"; - - // Output typedefs for 128-bit integers. If these are needed with a - // 32-bit target or with a C compiler that doesn't support mode(TI), - // more drastic measures will be needed. - Out << "#if __GNUC__ && __LP64__ /* 128-bit integer types */\n" - << "typedef int __attribute__((mode(TI))) llvmInt128;\n" - << "typedef unsigned __attribute__((mode(TI))) llvmUInt128;\n" - << "#endif\n\n"; - - // Output target-specific code that should be inserted into main. - Out << "#define CODE_FOR_MAIN() /* Any target-specific code for main()*/\n"; -} - -/// FindStaticTors - Given a static ctor/dtor list, unpack its contents into -/// the StaticTors set. 
-static void FindStaticTors(GlobalVariable *GV, std::set<Function*> &StaticTors){ - ConstantArray *InitList = dyn_cast<ConstantArray>(GV->getInitializer()); - if (!InitList) return; - - for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) - if (ConstantStruct *CS = dyn_cast<ConstantStruct>(InitList->getOperand(i))){ - if (CS->getNumOperands() != 2) return; // Not array of 2-element structs. - - if (CS->getOperand(1)->isNullValue()) - return; // Found a null terminator, exit printing. - Constant *FP = CS->getOperand(1); - if (ConstantExpr *CE = dyn_cast<ConstantExpr>(FP)) - if (CE->isCast()) - FP = CE->getOperand(0); - if (Function *F = dyn_cast<Function>(FP)) - StaticTors.insert(F); - } -} - -enum SpecialGlobalClass { - NotSpecial = 0, - GlobalCtors, GlobalDtors, - NotPrinted -}; - -/// getGlobalVariableClass - If this is a global that is specially recognized -/// by LLVM, return a code that indicates how we should handle it. -static SpecialGlobalClass getGlobalVariableClass(const GlobalVariable *GV) { - // If this is a global ctors/dtors list, handle it now. - if (GV->hasAppendingLinkage() && GV->use_empty()) { - if (GV->getName() == "llvm.global_ctors") - return GlobalCtors; - else if (GV->getName() == "llvm.global_dtors") - return GlobalDtors; - } - - // Otherwise, if it is other metadata, don't print it. This catches things - // like debug information. - if (GV->getSection() == "llvm.metadata") - return NotPrinted; - - return NotSpecial; -} - -// PrintEscapedString - Print each character of the specified string, escaping -// it if it is not printable or if it is an escape char. -static void PrintEscapedString(const char *Str, unsigned Length, - raw_ostream &Out) { - for (unsigned i = 0; i != Length; ++i) { - unsigned char C = Str[i]; - if (isprint(C) && C != '\\' && C != '"') - Out << C; - else if (C == '\\') - Out << "\\\\"; - else if (C == '\"') - Out << "\\\""; - else if (C == '\t') - Out << "\\t"; - else - Out << "\\x" << hexdigit(C >> 4) << hexdigit(C & 0x0F); - } -} - -// PrintEscapedString - Print each character of the specified string, escaping -// it if it is not printable or if it is an escape char. -static void PrintEscapedString(const std::string &Str, raw_ostream &Out) { - PrintEscapedString(Str.c_str(), Str.size(), Out); -} - -bool CWriter::doInitialization(Module &M) { - FunctionPass::doInitialization(M); - - // Initialize - TheModule = &M; - - TD = new TargetData(&M); - IL = new IntrinsicLowering(*TD); - IL->AddPrototypes(M); - -#if 0 - std::string Triple = TheModule->getTargetTriple(); - if (Triple.empty()) - Triple = llvm::sys::getDefaultTargetTriple(); - - std::string E; - if (const Target *Match = TargetRegistry::lookupTarget(Triple, E)) - TAsm = Match->createMCAsmInfo(Triple); -#endif - TAsm = new CBEMCAsmInfo(); - MRI = new MCRegisterInfo(); - TCtx = new MCContext(*TAsm, *MRI, NULL); - Mang = new Mangler(*TCtx, *TD); - - // Keep track of which functions are static ctors/dtors so they can have - // an attribute added to their prototypes. 
- std::set<Function*> StaticCtors, StaticDtors; - for (Module::global_iterator I = M.global_begin(), E = M.global_end(); - I != E; ++I) { - switch (getGlobalVariableClass(I)) { - default: break; - case GlobalCtors: - FindStaticTors(I, StaticCtors); - break; - case GlobalDtors: - FindStaticTors(I, StaticDtors); - break; - } - } - - // get declaration for alloca - Out << "/* Provide Declarations */\n"; - Out << "#include <stdarg.h>\n"; // Varargs support - Out << "#include <setjmp.h>\n"; // Unwind support - Out << "#include <limits.h>\n"; // With overflow intrinsics support. - generateCompilerSpecificCode(Out, TD); - - // Provide a definition for `bool' if not compiling with a C++ compiler. - Out << "\n" - << "#ifndef __cplusplus\ntypedef unsigned char bool;\n#endif\n" - - << "\n\n/* Support for floating point constants */\n" - << "typedef unsigned long long ConstantDoubleTy;\n" - << "typedef unsigned int ConstantFloatTy;\n" - << "typedef struct { unsigned long long f1; unsigned short f2; " - "unsigned short pad[3]; } ConstantFP80Ty;\n" - // This is used for both kinds of 128-bit long double; meaning differs. - << "typedef struct { unsigned long long f1; unsigned long long f2; }" - " ConstantFP128Ty;\n" - << "\n\n/* Global Declarations */\n"; - - // First output all the declarations for the program, because C requires - // Functions & globals to be declared before they are used. - // - if (!M.getModuleInlineAsm().empty()) { - Out << "/* Module asm statements */\n" - << "asm("; - - // Split the string into lines, to make it easier to read the .ll file. - std::string Asm = M.getModuleInlineAsm(); - size_t CurPos = 0; - size_t NewLine = Asm.find_first_of('\n', CurPos); - while (NewLine != std::string::npos) { - // We found a newline, print the portion of the asm string from the - // last newline up to this newline. - Out << "\""; - PrintEscapedString(std::string(Asm.begin()+CurPos, Asm.begin()+NewLine), - Out); - Out << "\\n\"\n"; - CurPos = NewLine+1; - NewLine = Asm.find_first_of('\n', CurPos); - } - Out << "\""; - PrintEscapedString(std::string(Asm.begin()+CurPos, Asm.end()), Out); - Out << "\");\n" - << "/* End Module asm statements */\n"; - } - - // Loop over the symbol table, emitting all named constants. - printModuleTypes(); - - // Global variable declarations... - if (!M.global_empty()) { - Out << "\n/* External Global Variable Declarations */\n"; - for (Module::global_iterator I = M.global_begin(), E = M.global_end(); - I != E; ++I) { - - if (I->hasExternalLinkage() || I->hasExternalWeakLinkage() || - I->hasCommonLinkage()) - Out << "extern "; - else if (I->hasDLLImportLinkage()) - Out << "__declspec(dllimport) "; - else - continue; // Internal Global - - // Thread Local Storage - if (I->isThreadLocal()) - Out << "__thread "; - - printType(Out, I->getType()->getElementType(), false, GetValueName(I)); - - if (I->hasExternalWeakLinkage()) - Out << " __EXTERNAL_WEAK__"; - Out << ";\n"; - } - } - - // Function declarations - Out << "\n/* Function Declarations */\n"; - Out << "double fmod(double, double);\n"; // Support for FP rem - Out << "float fmodf(float, float);\n"; - Out << "long double fmodl(long double, long double);\n"; - - // Store the intrinsics which will be declared/defined below. - SmallVector<const Function*, 8> intrinsicsToDefine; - - for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { - // Don't print declarations for intrinsic functions. - // Store the used intrinsics, which need to be explicitly defined. 
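Concretely, the declaration section this code prints for a module with one external i32 global and one thread-local external comes out roughly as below; the identifier spellings are illustrative, since real names pass through GetValueName and the mangler:

    /* External Global Variable Declarations */
    extern unsigned int counter;
    extern __thread unsigned int tls_counter;

    /* Function Declarations */
    double fmod(double, double);
    float fmodf(float, float);
    long double fmodl(long double, long double);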
- if (I->isIntrinsic()) { - switch (I->getIntrinsicID()) { - default: - break; - case Intrinsic::uadd_with_overflow: - case Intrinsic::sadd_with_overflow: - intrinsicsToDefine.push_back(I); - break; - } - continue; - } - - if (I->getName() == "setjmp" || - I->getName() == "longjmp" || I->getName() == "_setjmp") - continue; - - if (I->hasExternalWeakLinkage()) - Out << "extern "; - printFunctionSignature(I, true); - if (I->hasWeakLinkage() || I->hasLinkOnceLinkage()) - Out << " __ATTRIBUTE_WEAK__"; - if (I->hasExternalWeakLinkage()) - Out << " __EXTERNAL_WEAK__"; - if (StaticCtors.count(I)) - Out << " __ATTRIBUTE_CTOR__"; - if (StaticDtors.count(I)) - Out << " __ATTRIBUTE_DTOR__"; - if (I->hasHiddenVisibility()) - Out << " __HIDDEN__"; - - if (I->hasName() && I->getName()[0] == 1) - Out << " LLVM_ASM(\"" << I->getName().substr(1) << "\")"; - - Out << ";\n"; - } - - // Output the global variable declarations - if (!M.global_empty()) { - Out << "\n\n/* Global Variable Declarations */\n"; - for (Module::global_iterator I = M.global_begin(), E = M.global_end(); - I != E; ++I) - if (!I->isDeclaration()) { - // Ignore special globals, such as debug info. - if (getGlobalVariableClass(I)) - continue; - - if (I->hasLocalLinkage()) - Out << "static "; - else - Out << "extern "; - - // Thread Local Storage - if (I->isThreadLocal()) - Out << "__thread "; - - printType(Out, I->getType()->getElementType(), false, - GetValueName(I)); - - if (I->hasLinkOnceLinkage()) - Out << " __attribute__((common))"; - else if (I->hasCommonLinkage()) // FIXME is this right? - Out << " __ATTRIBUTE_WEAK__"; - else if (I->hasWeakLinkage()) - Out << " __ATTRIBUTE_WEAK__"; - else if (I->hasExternalWeakLinkage()) - Out << " __EXTERNAL_WEAK__"; - if (I->hasHiddenVisibility()) - Out << " __HIDDEN__"; - Out << ";\n"; - } - } - - // Output the global variable definitions and contents... - if (!M.global_empty()) { - Out << "\n\n/* Global Variable Definitions and Initialization */\n"; - for (Module::global_iterator I = M.global_begin(), E = M.global_end(); - I != E; ++I) - if (!I->isDeclaration()) { - // Ignore special globals, such as debug info. - if (getGlobalVariableClass(I)) - continue; - - if (I->hasLocalLinkage()) - Out << "static "; - else if (I->hasDLLImportLinkage()) - Out << "__declspec(dllimport) "; - else if (I->hasDLLExportLinkage()) - Out << "__declspec(dllexport) "; - - // Thread Local Storage - if (I->isThreadLocal()) - Out << "__thread "; - - printType(Out, I->getType()->getElementType(), false, - GetValueName(I)); - if (I->hasLinkOnceLinkage()) - Out << " __attribute__((common))"; - else if (I->hasWeakLinkage()) - Out << " __ATTRIBUTE_WEAK__"; - else if (I->hasCommonLinkage()) - Out << " __ATTRIBUTE_WEAK__"; - - if (I->hasHiddenVisibility()) - Out << " __HIDDEN__"; - - // If the initializer is not null, emit the initializer. If it is null, - // we try to avoid emitting large amounts of zeros. The problem with - // this, however, occurs when the variable has weak linkage. In this - // case, the assembler will complain about the variable being both weak - // and common, so we disable this optimization. - // FIXME common linkage should avoid this problem. - if (!I->getInitializer()->isNullValue()) { - Out << " = " ; - writeOperand(I->getInitializer(), true); - } else if (I->hasWeakLinkage()) { - // We have to specify an initializer, but it doesn't have to be - // complete. If the value is an aggregate, print out { 0 }, and let - // the compiler figure out the rest of the zeros. 
- Out << " = " ; - if (I->getInitializer()->getType()->isStructTy() || - I->getInitializer()->getType()->isVectorTy()) { - Out << "{ 0 }"; - } else if (I->getInitializer()->getType()->isArrayTy()) { - // As with structs and vectors, but with an extra set of braces - // because arrays are wrapped in structs. - Out << "{ { 0 } }"; - } else { - // Just print it out normally. - writeOperand(I->getInitializer(), true); - } - } - Out << ";\n"; - } - } - - if (!M.empty()) - Out << "\n\n/* Function Bodies */\n"; - - // Emit some helper functions for dealing with FCMP instruction's - // predicates - Out << "static inline int llvm_fcmp_ord(double X, double Y) { "; - Out << "return X == X && Y == Y; }\n"; - Out << "static inline int llvm_fcmp_uno(double X, double Y) { "; - Out << "return X != X || Y != Y; }\n"; - Out << "static inline int llvm_fcmp_ueq(double X, double Y) { "; - Out << "return X == Y || llvm_fcmp_uno(X, Y); }\n"; - Out << "static inline int llvm_fcmp_une(double X, double Y) { "; - Out << "return X != Y; }\n"; - Out << "static inline int llvm_fcmp_ult(double X, double Y) { "; - Out << "return X < Y || llvm_fcmp_uno(X, Y); }\n"; - Out << "static inline int llvm_fcmp_ugt(double X, double Y) { "; - Out << "return X > Y || llvm_fcmp_uno(X, Y); }\n"; - Out << "static inline int llvm_fcmp_ule(double X, double Y) { "; - Out << "return X <= Y || llvm_fcmp_uno(X, Y); }\n"; - Out << "static inline int llvm_fcmp_uge(double X, double Y) { "; - Out << "return X >= Y || llvm_fcmp_uno(X, Y); }\n"; - Out << "static inline int llvm_fcmp_oeq(double X, double Y) { "; - Out << "return X == Y ; }\n"; - Out << "static inline int llvm_fcmp_one(double X, double Y) { "; - Out << "return X != Y && llvm_fcmp_ord(X, Y); }\n"; - Out << "static inline int llvm_fcmp_olt(double X, double Y) { "; - Out << "return X < Y ; }\n"; - Out << "static inline int llvm_fcmp_ogt(double X, double Y) { "; - Out << "return X > Y ; }\n"; - Out << "static inline int llvm_fcmp_ole(double X, double Y) { "; - Out << "return X <= Y ; }\n"; - Out << "static inline int llvm_fcmp_oge(double X, double Y) { "; - Out << "return X >= Y ; }\n"; - - // Emit definitions of the intrinsics. - for (SmallVector<const Function*, 8>::const_iterator - I = intrinsicsToDefine.begin(), - E = intrinsicsToDefine.end(); I != E; ++I) { - printIntrinsicDefinition(**I, Out); - } - - return false; -} - - -/// Output all floating point constants that cannot be printed accurately... -void CWriter::printFloatingPointConstants(Function &F) { - // Scan the module for floating point constants. If any FP constant is used - // in the function, we want to redirect it here so that we do not depend on - // the precision of the printed form, unless the printed form preserves - // precision. - // - for (constant_iterator I = constant_begin(&F), E = constant_end(&F); - I != E; ++I) - printFloatingPointConstants(*I); - - Out << '\n'; -} - -void CWriter::printFloatingPointConstants(const Constant *C) { - // If this is a constant expression, recursively check for constant fp values. - if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) { - for (unsigned i = 0, e = CE->getNumOperands(); i != e; ++i) - printFloatingPointConstants(CE->getOperand(i)); - return; - } - - // Otherwise, check for a FP constant that we need to print. - const ConstantFP *FPC = dyn_cast<ConstantFP>(C); - if (FPC == 0 || - // Do not put in FPConstantMap if safe. - isFPCSafeToPrint(FPC) || - // Already printed this constant? 
- FPConstantMap.count(FPC)) - return; - - FPConstantMap[FPC] = FPCounter; // Number the FP constants - - if (FPC->getType() == Type::getDoubleTy(FPC->getContext())) { - double Val = FPC->getValueAPF().convertToDouble(); - uint64_t i = FPC->getValueAPF().bitcastToAPInt().getZExtValue(); - Out << "static const ConstantDoubleTy FPConstant" << FPCounter++ - << " = 0x" << utohexstr(i) - << "ULL; /* " << Val << " */\n"; - } else if (FPC->getType() == Type::getFloatTy(FPC->getContext())) { - float Val = FPC->getValueAPF().convertToFloat(); - uint32_t i = (uint32_t)FPC->getValueAPF().bitcastToAPInt(). - getZExtValue(); - Out << "static const ConstantFloatTy FPConstant" << FPCounter++ - << " = 0x" << utohexstr(i) - << "U; /* " << Val << " */\n"; - } else if (FPC->getType() == Type::getX86_FP80Ty(FPC->getContext())) { - // api needed to prevent premature destruction - APInt api = FPC->getValueAPF().bitcastToAPInt(); - const uint64_t *p = api.getRawData(); - Out << "static const ConstantFP80Ty FPConstant" << FPCounter++ - << " = { 0x" << utohexstr(p[0]) - << "ULL, 0x" << utohexstr((uint16_t)p[1]) << ",{0,0,0}" - << "}; /* Long double constant */\n"; - } else if (FPC->getType() == Type::getPPC_FP128Ty(FPC->getContext()) || - FPC->getType() == Type::getFP128Ty(FPC->getContext())) { - APInt api = FPC->getValueAPF().bitcastToAPInt(); - const uint64_t *p = api.getRawData(); - Out << "static const ConstantFP128Ty FPConstant" << FPCounter++ - << " = { 0x" - << utohexstr(p[0]) << ", 0x" << utohexstr(p[1]) - << "}; /* Long double constant */\n"; - - } else { - llvm_unreachable("Unknown float type!"); - } -} - - -/// printSymbolTable - Run through symbol table looking for type names. If a -/// type name is found, emit its declaration... -/// -void CWriter::printModuleTypes() { - Out << "/* Helper union for bitcasts */\n"; - Out << "typedef union {\n"; - Out << " unsigned int Int32;\n"; - Out << " unsigned long long Int64;\n"; - Out << " float Float;\n"; - Out << " double Double;\n"; - Out << "} llvmBitCastUnion;\n"; - - // Get all of the struct types used in the module. - std::vector<StructType*> StructTypes; - TheModule->findUsedStructTypes(StructTypes); - - if (StructTypes.empty()) return; - - Out << "/* Structure forward decls */\n"; - - unsigned NextTypeID = 0; - - // If any of them are missing names, add a unique ID to UnnamedStructIDs. - // Print out forward declarations for structure types. - for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) { - StructType *ST = StructTypes[i]; - - if (ST->isLiteral() || ST->getName().empty()) - UnnamedStructIDs[ST] = NextTypeID++; - - std::string Name = getStructName(ST); - - Out << "typedef struct " << Name << ' ' << Name << ";\n"; - } - - Out << '\n'; - - // Keep track of which structures have been printed so far. - SmallPtrSet<Type *, 16> StructPrinted; - - // Loop over all structures then push them into the stack so they are - // printed in the correct order. - // - Out << "/* Structure contents */\n"; - for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) - if (StructTypes[i]->isStructTy()) - // Only print out used types! - printContainedStructs(StructTypes[i], StructPrinted); -} - -// Push the struct onto the stack and recursively push all structs -// this one depends on. -// -// TODO: Make this work properly with vector types -// -void CWriter::printContainedStructs(Type *Ty, - SmallPtrSet<Type *, 16> &StructPrinted) { - // Don't walk through pointers. 
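A worked instance of the double case in printFloatingPointConstants: 1.5 has the IEEE-754 bit pattern 0x3FF8000000000000, so the emitted line is the one sketched below (counter suffix illustrative). Each use site then reinterprets the stored bits through a pointer cast along the lines of *(double*)&FPConstant0, keeping the value independent of the C compiler's decimal parsing:

    typedef unsigned long long ConstantDoubleTy;
    static const ConstantDoubleTy FPConstant0 = 0x3FF8000000000000ULL; /* 1.5 */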
- if (Ty->isPointerTy() || Ty->isPrimitiveType() || Ty->isIntegerTy()) - return; - - // Print all contained types first. - for (Type::subtype_iterator I = Ty->subtype_begin(), - E = Ty->subtype_end(); I != E; ++I) - printContainedStructs(*I, StructPrinted); - - if (StructType *ST = dyn_cast<StructType>(Ty)) { - // Check to see if we have already printed this struct. - if (!StructPrinted.insert(Ty)) return; - - // Print structure type out. - printType(Out, ST, false, getStructName(ST), true); - Out << ";\n\n"; - } -} - -void CWriter::printFunctionSignature(const Function *F, bool Prototype) { - /// isStructReturn - Should this function actually return a struct by-value? - bool isStructReturn = F->hasStructRetAttr(); - - if (F->hasLocalLinkage()) Out << "static "; - if (F->hasDLLImportLinkage()) Out << "__declspec(dllimport) "; - if (F->hasDLLExportLinkage()) Out << "__declspec(dllexport) "; - switch (F->getCallingConv()) { - case CallingConv::X86_StdCall: - Out << "__attribute__((stdcall)) "; - break; - case CallingConv::X86_FastCall: - Out << "__attribute__((fastcall)) "; - break; - case CallingConv::X86_ThisCall: - Out << "__attribute__((thiscall)) "; - break; - default: - break; - } - - // Loop over the arguments, printing them... - FunctionType *FT = cast<FunctionType>(F->getFunctionType()); - const AttrListPtr &PAL = F->getAttributes(); - - std::string tstr; - raw_string_ostream FunctionInnards(tstr); - - // Print out the name... - FunctionInnards << GetValueName(F) << '('; - - bool PrintedArg = false; - if (!F->isDeclaration()) { - if (!F->arg_empty()) { - Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); - unsigned Idx = 1; - - // If this is a struct-return function, don't print the hidden - // struct-return argument. - if (isStructReturn) { - assert(I != E && "Invalid struct return function!"); - ++I; - ++Idx; - } - - std::string ArgName; - for (; I != E; ++I) { - if (PrintedArg) FunctionInnards << ", "; - if (I->hasName() || !Prototype) - ArgName = GetValueName(I); - else - ArgName = ""; - Type *ArgTy = I->getType(); - if (PAL.paramHasAttr(Idx, Attribute::ByVal)) { - ArgTy = cast<PointerType>(ArgTy)->getElementType(); - ByValParams.insert(I); - } - printType(FunctionInnards, ArgTy, - /*isSigned=*/PAL.paramHasAttr(Idx, Attribute::SExt), - ArgName); - PrintedArg = true; - ++Idx; - } - } - } else { - // Loop over the arguments, printing them. - FunctionType::param_iterator I = FT->param_begin(), E = FT->param_end(); - unsigned Idx = 1; - - // If this is a struct-return function, don't print the hidden - // struct-return argument. - if (isStructReturn) { - assert(I != E && "Invalid struct return function!"); - ++I; - ++Idx; - } - - for (; I != E; ++I) { - if (PrintedArg) FunctionInnards << ", "; - Type *ArgTy = *I; - if (PAL.paramHasAttr(Idx, Attribute::ByVal)) { - assert(ArgTy->isPointerTy()); - ArgTy = cast<PointerType>(ArgTy)->getElementType(); - } - printType(FunctionInnards, ArgTy, - /*isSigned=*/PAL.paramHasAttr(Idx, Attribute::SExt)); - PrintedArg = true; - ++Idx; - } - } - - if (!PrintedArg && FT->isVarArg()) { - FunctionInnards << "int vararg_dummy_arg"; - PrintedArg = true; - } - - // Finish printing arguments... if this is a vararg function, print the ..., - // unless there are no known types, in which case, we just emit (). - // - if (FT->isVarArg() && PrintedArg) { - FunctionInnards << ",..."; // Output varargs portion of signature! - } else if (!FT->isVarArg() && !PrintedArg) { - FunctionInnards << "void"; // ret() -> ret(void) in C. 
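Putting printFunctionSignature together: for an internal function i32 @add(i32 signext %a, i32 %b) with the X86_StdCall convention, the emitted prototype looks roughly like the sketch below. Integer values print as unsigned unless the SExt attribute is present, and the name is illustrative:

    static __attribute__((stdcall)) unsigned int add(int a, unsigned int b);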
- }
- FunctionInnards << ')';
-
- // Get the return type for the function.
- Type *RetTy;
- if (!isStructReturn)
- RetTy = F->getReturnType();
- else {
- // If this is a struct-return function, print the struct-return type.
- RetTy = cast<PointerType>(FT->getParamType(0))->getElementType();
- }
-
- // Print out the return type and the signature built above.
- printType(Out, RetTy,
- /*isSigned=*/PAL.paramHasAttr(0, Attribute::SExt),
- FunctionInnards.str());
-}
-
-static inline bool isFPIntBitCast(const Instruction &I) {
- if (!isa<BitCastInst>(I))
- return false;
- Type *SrcTy = I.getOperand(0)->getType();
- Type *DstTy = I.getType();
- return (SrcTy->isFloatingPointTy() && DstTy->isIntegerTy()) ||
- (DstTy->isFloatingPointTy() && SrcTy->isIntegerTy());
-}
-
-void CWriter::printFunction(Function &F) {
- /// isStructReturn - Should this function actually return a struct by-value?
- bool isStructReturn = F.hasStructRetAttr();
-
- printFunctionSignature(&F, false);
- Out << " {\n";
-
- // If this is a struct return function, handle the result with magic.
- if (isStructReturn) {
- Type *StructTy =
- cast<PointerType>(F.arg_begin()->getType())->getElementType();
- Out << " ";
- printType(Out, StructTy, false, "StructReturn");
- Out << "; /* Struct return temporary */\n";
-
- Out << " ";
- printType(Out, F.arg_begin()->getType(), false,
- GetValueName(F.arg_begin()));
- Out << " = &StructReturn;\n";
- }
-
- bool PrintedVar = false;
-
- // print local variable information for the function
- for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) {
- if (const AllocaInst *AI = isDirectAlloca(&*I)) {
- Out << " ";
- printType(Out, AI->getAllocatedType(), false, GetValueName(AI));
- Out << "; /* Address-exposed local */\n";
- PrintedVar = true;
- } else if (I->getType() != Type::getVoidTy(F.getContext()) &&
- !isInlinableInst(*I)) {
- Out << " ";
- printType(Out, I->getType(), false, GetValueName(&*I));
- Out << ";\n";
-
- if (isa<PHINode>(*I)) { // Print out PHI node temporaries as well...
- Out << " ";
- printType(Out, I->getType(), false,
- GetValueName(&*I)+"__PHI_TEMPORARY");
- Out << ";\n";
- }
- PrintedVar = true;
- }
- // We need a temporary for the BitCast to use so it can pluck a value out
- // of a union to do the BitCast. This is separate from the need for a
- // variable to hold the result of the BitCast.
- if (isFPIntBitCast(*I)) {
- Out << " llvmBitCastUnion " << GetValueName(&*I)
- << "__BITCAST_TEMPORARY;\n";
- PrintedVar = true;
- }
- }
-
- if (PrintedVar)
- Out << '\n';
-
- if (F.hasExternalLinkage() && F.getName() == "main")
- Out << " CODE_FOR_MAIN();\n";
-
- // print the basic blocks
- for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
- if (Loop *L = LI->getLoopFor(BB)) {
- if (L->getHeader() == BB && L->getParentLoop() == 0)
- printLoop(L);
- } else {
- printBasicBlock(BB);
- }
- }
-
- Out << "}\n\n";
-}
-
-void CWriter::printLoop(Loop *L) {
- Out << " do { /* Syntactic loop '" << L->getHeader()->getName()
- << "' to make GCC happy */\n";
- for (unsigned i = 0, e = L->getBlocks().size(); i != e; ++i) {
- BasicBlock *BB = L->getBlocks()[i];
- Loop *BBLoop = LI->getLoopFor(BB);
- if (BBLoop == L)
- printBasicBlock(BB);
- else if (BB == BBLoop->getHeader() && BBLoop->getParentLoop() == L)
- printLoop(BBLoop);
- }
- Out << " } while (1); /* end of syntactic loop '"
- << L->getHeader()->getName() << "' */\n";
-}
-
-void CWriter::printBasicBlock(BasicBlock *BB) {
-
- // Don't print the label for the basic block if there are no uses, or if
- // the only terminator use is the predecessor basic block's terminator.
- // We have to scan the use list because PHI nodes use basic blocks too but
- // do not require a label to be generated.
- //
- bool NeedsLabel = false;
- for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
- if (isGotoCodeNecessary(*PI, BB)) {
- NeedsLabel = true;
- break;
- }
-
- if (NeedsLabel) Out << GetValueName(BB) << ":\n";
-
- // Output all of the instructions in the basic block...
- for (BasicBlock::iterator II = BB->begin(), E = --BB->end(); II != E;
- ++II) {
- if (!isInlinableInst(*II) && !isDirectAlloca(II)) {
- if (II->getType() != Type::getVoidTy(BB->getContext()) &&
- !isInlineAsm(*II))
- outputLValue(II);
- else
- Out << " ";
- writeInstComputationInline(*II);
- Out << ";\n";
- }
- }
-
- // Don't emit prefix or suffix for the terminator.
- visit(*BB->getTerminator());
-}
-
-
-// Specific Instruction type classes... note that all of the casts are
-// necessary because we use the instruction classes as opaque types...
-//
-void CWriter::visitReturnInst(ReturnInst &I) {
- // If this is a struct return function, return the temporary struct.
- bool isStructReturn = I.getParent()->getParent()->hasStructRetAttr();
-
- if (isStructReturn) {
- Out << " return StructReturn;\n";
- return;
- }
-
- // Don't output a void return if this is the last basic block in the function
- if (I.getNumOperands() == 0 &&
- &*--I.getParent()->getParent()->end() == I.getParent() &&
- !(I.getParent()->size() == 1)) {
- return;
- }
-
- Out << " return";
- if (I.getNumOperands()) {
- Out << ' ';
- writeOperand(I.getOperand(0));
- }
- Out << ";\n";
-}
-
-void CWriter::visitSwitchInst(SwitchInst &SI) {
-
- Value* Cond = SI.getCondition();
-
- Out << " switch (";
- writeOperand(Cond);
- Out << ") {\n default:\n";
- printPHICopiesForSuccessor (SI.getParent(), SI.getDefaultDest(), 2);
- printBranchToBlock(SI.getParent(), SI.getDefaultDest(), 2);
- Out << ";\n";
-
- // Skip the first item since that's the default case.
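The do/while wrapper from printLoop and the goto-based branching from printBranchToBlock combine into bodies shaped like this sketch of a counting loop (labels and the exit test are illustrative, and PHI temporaries are omitted):

    void f(unsigned int n) {
      unsigned int i;
      i = 0u;
      do {     /* Syntactic loop 'bb' to make GCC happy */
      bb:
        i = i + 1u;
        if (i < n) {
          goto bb;        /* back edge to the loop header */
        } else {
          goto bb_exit;
        }
      } while (1); /* end of syntactic loop 'bb' */
    bb_exit:
      return;
    }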
- for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end(); i != e; ++i) {
- ConstantInt* CaseVal = i.getCaseValue();
- BasicBlock* Succ = i.getCaseSuccessor();
- Out << " case ";
- writeOperand(CaseVal);
- Out << ":\n";
- printPHICopiesForSuccessor (SI.getParent(), Succ, 2);
- printBranchToBlock(SI.getParent(), Succ, 2);
- if (Function::iterator(Succ) ==
- llvm::next(Function::iterator(SI.getParent())))
- Out << " break;\n";
- }
-
- Out << " }\n";
-}
-
-void CWriter::visitIndirectBrInst(IndirectBrInst &IBI) {
- Out << " goto *(void*)(";
- writeOperand(IBI.getOperand(0));
- Out << ");\n";
-}
-
-void CWriter::visitUnreachableInst(UnreachableInst &I) {
- Out << " /*UNREACHABLE*/;\n";
-}
-
-bool CWriter::isGotoCodeNecessary(BasicBlock *From, BasicBlock *To) {
- /// FIXME: This should be reenabled, but only once loop reordering is safe!
- return true;
-
- if (llvm::next(Function::iterator(From)) != Function::iterator(To))
- return true; // Not the direct successor, we need a goto.
-
- //isa<SwitchInst>(From->getTerminator())
-
- if (LI->getLoopFor(From) != LI->getLoopFor(To))
- return true;
- return false;
-}
-
-void CWriter::printPHICopiesForSuccessor (BasicBlock *CurBlock,
- BasicBlock *Successor,
- unsigned Indent) {
- for (BasicBlock::iterator I = Successor->begin(); isa<PHINode>(I); ++I) {
- PHINode *PN = cast<PHINode>(I);
- // Now we have to do the printing.
- Value *IV = PN->getIncomingValueForBlock(CurBlock);
- if (!isa<UndefValue>(IV)) {
- Out << std::string(Indent, ' ');
- Out << " " << GetValueName(I) << "__PHI_TEMPORARY = ";
- writeOperand(IV);
- Out << "; /* for PHI node */\n";
- }
- }
-}
-
-void CWriter::printBranchToBlock(BasicBlock *CurBB, BasicBlock *Succ,
- unsigned Indent) {
- if (isGotoCodeNecessary(CurBB, Succ)) {
- Out << std::string(Indent, ' ') << " goto ";
- writeOperand(Succ);
- Out << ";\n";
- }
-}
-
-// Branch instruction printing - Avoid printing out a branch to a basic block
-// that immediately succeeds the current one.
-//
-void CWriter::visitBranchInst(BranchInst &I) {
-
- if (I.isConditional()) {
- if (isGotoCodeNecessary(I.getParent(), I.getSuccessor(0))) {
- Out << " if (";
- writeOperand(I.getCondition());
- Out << ") {\n";
-
- printPHICopiesForSuccessor (I.getParent(), I.getSuccessor(0), 2);
- printBranchToBlock(I.getParent(), I.getSuccessor(0), 2);
-
- if (isGotoCodeNecessary(I.getParent(), I.getSuccessor(1))) {
- Out << " } else {\n";
- printPHICopiesForSuccessor (I.getParent(), I.getSuccessor(1), 2);
- printBranchToBlock(I.getParent(), I.getSuccessor(1), 2);
- }
- } else {
- // First goto not necessary, assume second one is...
- Out << " if (!";
- writeOperand(I.getCondition());
- Out << ") {\n";
-
- printPHICopiesForSuccessor (I.getParent(), I.getSuccessor(1), 2);
- printBranchToBlock(I.getParent(), I.getSuccessor(1), 2);
- }
-
- Out << " }\n";
- } else {
- printPHICopiesForSuccessor (I.getParent(), I.getSuccessor(0), 0);
- printBranchToBlock(I.getParent(), I.getSuccessor(0), 0);
- }
- Out << "\n";
-}
-
-// PHI nodes get copied into temporary values at the end of predecessor basic
-// blocks. We now need to copy these temporary values into the REAL value for
-// the PHI.
-void CWriter::visitPHINode(PHINode &I) {
- writeOperand(&I);
- Out << "__PHI_TEMPORARY";
-}
-
-
-void CWriter::visitBinaryOperator(Instruction &I) {
- // binary instructions, shift instructions, setCond instructions.
- assert(!I.getType()->isPointerTy());
-
- // We must cast the results of binary operations which might be promoted.
- bool needsCast = false; - if ((I.getType() == Type::getInt8Ty(I.getContext())) || - (I.getType() == Type::getInt16Ty(I.getContext())) - || (I.getType() == Type::getFloatTy(I.getContext()))) { - needsCast = true; - Out << "(("; - printType(Out, I.getType(), false); - Out << ")("; - } - - // If this is a negation operation, print it out as such. For FP, we don't - // want to print "-0.0 - X". - if (BinaryOperator::isNeg(&I)) { - Out << "-("; - writeOperand(BinaryOperator::getNegArgument(cast<BinaryOperator>(&I))); - Out << ")"; - } else if (BinaryOperator::isFNeg(&I)) { - Out << "-("; - writeOperand(BinaryOperator::getFNegArgument(cast<BinaryOperator>(&I))); - Out << ")"; - } else if (I.getOpcode() == Instruction::FRem) { - // Output a call to fmod/fmodf instead of emitting a%b - if (I.getType() == Type::getFloatTy(I.getContext())) - Out << "fmodf("; - else if (I.getType() == Type::getDoubleTy(I.getContext())) - Out << "fmod("; - else // all 3 flavors of long double - Out << "fmodl("; - writeOperand(I.getOperand(0)); - Out << ", "; - writeOperand(I.getOperand(1)); - Out << ")"; - } else { - - // Write out the cast of the instruction's value back to the proper type - // if necessary. - bool NeedsClosingParens = writeInstructionCast(I); - - // Certain instructions require the operand to be forced to a specific type - // so we use writeOperandWithCast here instead of writeOperand. Similarly - // below for operand 1 - writeOperandWithCast(I.getOperand(0), I.getOpcode()); - - switch (I.getOpcode()) { - case Instruction::Add: - case Instruction::FAdd: Out << " + "; break; - case Instruction::Sub: - case Instruction::FSub: Out << " - "; break; - case Instruction::Mul: - case Instruction::FMul: Out << " * "; break; - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: Out << " % "; break; - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: Out << " / "; break; - case Instruction::And: Out << " & "; break; - case Instruction::Or: Out << " | "; break; - case Instruction::Xor: Out << " ^ "; break; - case Instruction::Shl : Out << " << "; break; - case Instruction::LShr: - case Instruction::AShr: Out << " >> "; break; - default: -#ifndef NDEBUG - errs() << "Invalid operator type!" << I; -#endif - llvm_unreachable(0); - } - - writeOperandWithCast(I.getOperand(1), I.getOpcode()); - if (NeedsClosingParens) - Out << "))"; - } - - if (needsCast) { - Out << "))"; - } -} - -void CWriter::visitICmpInst(ICmpInst &I) { - // We must cast the results of icmp which might be promoted. - bool needsCast = false; - - // Write out the cast of the instruction's value back to the proper type - // if necessary. - bool NeedsClosingParens = writeInstructionCast(I); - - // Certain icmp predicate require the operand to be forced to a specific type - // so we use writeOperandWithCast here instead of writeOperand. Similarly - // below for operand 1 - writeOperandWithCast(I.getOperand(0), I); - - switch (I.getPredicate()) { - case ICmpInst::ICMP_EQ: Out << " == "; break; - case ICmpInst::ICMP_NE: Out << " != "; break; - case ICmpInst::ICMP_ULE: - case ICmpInst::ICMP_SLE: Out << " <= "; break; - case ICmpInst::ICMP_UGE: - case ICmpInst::ICMP_SGE: Out << " >= "; break; - case ICmpInst::ICMP_ULT: - case ICmpInst::ICMP_SLT: Out << " < "; break; - case ICmpInst::ICMP_UGT: - case ICmpInst::ICMP_SGT: Out << " > "; break; - default: -#ifndef NDEBUG - errs() << "Invalid icmp predicate!" 
<< I; -#endif - llvm_unreachable(0); - } - - writeOperandWithCast(I.getOperand(1), I); - if (NeedsClosingParens) - Out << "))"; - - if (needsCast) { - Out << "))"; - } -} - -void CWriter::visitFCmpInst(FCmpInst &I) { - if (I.getPredicate() == FCmpInst::FCMP_FALSE) { - Out << "0"; - return; - } - if (I.getPredicate() == FCmpInst::FCMP_TRUE) { - Out << "1"; - return; - } - - const char* op = 0; - switch (I.getPredicate()) { - default: llvm_unreachable("Illegal FCmp predicate"); - case FCmpInst::FCMP_ORD: op = "ord"; break; - case FCmpInst::FCMP_UNO: op = "uno"; break; - case FCmpInst::FCMP_UEQ: op = "ueq"; break; - case FCmpInst::FCMP_UNE: op = "une"; break; - case FCmpInst::FCMP_ULT: op = "ult"; break; - case FCmpInst::FCMP_ULE: op = "ule"; break; - case FCmpInst::FCMP_UGT: op = "ugt"; break; - case FCmpInst::FCMP_UGE: op = "uge"; break; - case FCmpInst::FCMP_OEQ: op = "oeq"; break; - case FCmpInst::FCMP_ONE: op = "one"; break; - case FCmpInst::FCMP_OLT: op = "olt"; break; - case FCmpInst::FCMP_OLE: op = "ole"; break; - case FCmpInst::FCMP_OGT: op = "ogt"; break; - case FCmpInst::FCMP_OGE: op = "oge"; break; - } - - Out << "llvm_fcmp_" << op << "("; - // Write the first operand - writeOperand(I.getOperand(0)); - Out << ", "; - // Write the second operand - writeOperand(I.getOperand(1)); - Out << ")"; -} - -static const char * getFloatBitCastField(Type *Ty) { - switch (Ty->getTypeID()) { - default: llvm_unreachable("Invalid Type"); - case Type::FloatTyID: return "Float"; - case Type::DoubleTyID: return "Double"; - case Type::IntegerTyID: { - unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth(); - if (NumBits <= 32) - return "Int32"; - else - return "Int64"; - } - } -} - -void CWriter::visitCastInst(CastInst &I) { - Type *DstTy = I.getType(); - Type *SrcTy = I.getOperand(0)->getType(); - if (isFPIntBitCast(I)) { - Out << '('; - // These int<->float and long<->double casts need to be handled specially - Out << GetValueName(&I) << "__BITCAST_TEMPORARY." - << getFloatBitCastField(I.getOperand(0)->getType()) << " = "; - writeOperand(I.getOperand(0)); - Out << ", " << GetValueName(&I) << "__BITCAST_TEMPORARY." - << getFloatBitCastField(I.getType()); - Out << ')'; - return; - } - - Out << '('; - printCast(I.getOpcode(), SrcTy, DstTy); - - // Make a sext from i1 work by subtracting the i1 from 0 (an int). - if (SrcTy == Type::getInt1Ty(I.getContext()) && - I.getOpcode() == Instruction::SExt) - Out << "0-"; - - writeOperand(I.getOperand(0)); - - if (DstTy == Type::getInt1Ty(I.getContext()) && - (I.getOpcode() == Instruction::Trunc || - I.getOpcode() == Instruction::FPToUI || - I.getOpcode() == Instruction::FPToSI || - I.getOpcode() == Instruction::PtrToInt)) { - // Make sure we really get a trunc to bool by anding the operand with 1 - Out << "&1u"; - } - Out << ')'; -} - -void CWriter::visitSelectInst(SelectInst &I) { - Out << "(("; - writeOperand(I.getCondition()); - Out << ") ? ("; - writeOperand(I.getTrueValue()); - Out << ") : ("; - writeOperand(I.getFalseValue()); - Out << "))"; -} - -// Returns the macro name or value of the max or min of an integer type -// (as defined in limits.h). 
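The llvm_fcmp_* helpers that doInitialization emitted earlier carry the NaN semantics here: C's native comparisons are all ordered, so the unordered predicates must test for NaN explicitly. For example, %c = fcmp ueq double %a, %b prints as c = llvm_fcmp_ueq(a, b), backed by the emitted definitions:

    static inline int llvm_fcmp_uno(double X, double Y) { return X != X || Y != Y; }
    static inline int llvm_fcmp_ueq(double X, double Y) { return X == Y || llvm_fcmp_uno(X, Y); }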
-static void printLimitValue(IntegerType &Ty, bool isSigned, bool isMax, - raw_ostream &Out) { - const char* type; - const char* sprefix = ""; - - unsigned NumBits = Ty.getBitWidth(); - if (NumBits <= 8) { - type = "CHAR"; - sprefix = "S"; - } else if (NumBits <= 16) { - type = "SHRT"; - } else if (NumBits <= 32) { - type = "INT"; - } else if (NumBits <= 64) { - type = "LLONG"; - } else { - llvm_unreachable("Bit widths > 64 not implemented yet"); - } - - if (isSigned) - Out << sprefix << type << (isMax ? "_MAX" : "_MIN"); - else - Out << "U" << type << (isMax ? "_MAX" : "0"); -} - -#ifndef NDEBUG -static bool isSupportedIntegerSize(IntegerType &T) { - return T.getBitWidth() == 8 || T.getBitWidth() == 16 || - T.getBitWidth() == 32 || T.getBitWidth() == 64; -} -#endif - -void CWriter::printIntrinsicDefinition(const Function &F, raw_ostream &Out) { - FunctionType *funT = F.getFunctionType(); - Type *retT = F.getReturnType(); - IntegerType *elemT = cast<IntegerType>(funT->getParamType(1)); - - assert(isSupportedIntegerSize(*elemT) && - "CBackend does not support arbitrary size integers."); - assert(cast<StructType>(retT)->getElementType(0) == elemT && - elemT == funT->getParamType(0) && funT->getNumParams() == 2); - - switch (F.getIntrinsicID()) { - default: - llvm_unreachable("Unsupported Intrinsic."); - case Intrinsic::uadd_with_overflow: - // static inline Rty uadd_ixx(unsigned ixx a, unsigned ixx b) { - // Rty r; - // r.field0 = a + b; - // r.field1 = (r.field0 < a); - // return r; - // } - Out << "static inline "; - printType(Out, retT); - Out << GetValueName(&F); - Out << "("; - printSimpleType(Out, elemT, false); - Out << "a,"; - printSimpleType(Out, elemT, false); - Out << "b) {\n "; - printType(Out, retT); - Out << "r;\n"; - Out << " r.field0 = a + b;\n"; - Out << " r.field1 = (r.field0 < a);\n"; - Out << " return r;\n}\n"; - break; - - case Intrinsic::sadd_with_overflow: - // static inline Rty sadd_ixx(ixx a, ixx b) { - // Rty r; - // r.field1 = (b > 0 && a > XX_MAX - b) || - // (b < 0 && a < XX_MIN - b); - // r.field0 = r.field1 ? 0 : a + b; - // return r; - // } - Out << "static "; - printType(Out, retT); - Out << GetValueName(&F); - Out << "("; - printSimpleType(Out, elemT, true); - Out << "a,"; - printSimpleType(Out, elemT, true); - Out << "b) {\n "; - printType(Out, retT); - Out << "r;\n"; - Out << " r.field1 = (b > 0 && a > "; - printLimitValue(*elemT, true, true, Out); - Out << " - b) || (b < 0 && a < "; - printLimitValue(*elemT, true, false, Out); - Out << " - b);\n"; - Out << " r.field0 = r.field1 ? 0 : a + b;\n"; - Out << " return r;\n}\n"; - break; - } -} - -void CWriter::lowerIntrinsics(Function &F) { - // This is used to keep track of intrinsics that get generated to a lowered - // function. We must generate the prototypes before the function body which - // will only be expanded on first use (by the loop below). - std::vector<Function*> prototypesToGen; - - // Examine all the instructions in this function to find the intrinsics that - // need to be lowered. 
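Instantiating the sadd_with_overflow template in printIntrinsicDefinition for i32 yields code along these lines; the struct and function names below are illustrative stand-ins for what printType and GetValueName produce, and limits.h is already pulled in by the preamble:

    #include <limits.h>

    struct l_overflow_pair { unsigned int field0; unsigned int field1; };

    static struct l_overflow_pair llvm_sadd_with_overflow_i32(int a, int b) {
      struct l_overflow_pair r;
      /* detect signed overflow without performing it */
      r.field1 = (b > 0 && a > INT_MAX - b) || (b < 0 && a < INT_MIN - b);
      r.field0 = r.field1 ? 0 : a + b;
      return r;
    }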
- for (Function::iterator BB = F.begin(), EE = F.end(); BB != EE; ++BB) - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) - if (CallInst *CI = dyn_cast<CallInst>(I++)) - if (Function *F = CI->getCalledFunction()) - switch (F->getIntrinsicID()) { - case Intrinsic::not_intrinsic: - case Intrinsic::vastart: - case Intrinsic::vacopy: - case Intrinsic::vaend: - case Intrinsic::returnaddress: - case Intrinsic::frameaddress: - case Intrinsic::setjmp: - case Intrinsic::longjmp: - case Intrinsic::prefetch: - case Intrinsic::powi: - case Intrinsic::x86_sse_cmp_ss: - case Intrinsic::x86_sse_cmp_ps: - case Intrinsic::x86_sse2_cmp_sd: - case Intrinsic::x86_sse2_cmp_pd: - case Intrinsic::ppc_altivec_lvsl: - case Intrinsic::uadd_with_overflow: - case Intrinsic::sadd_with_overflow: - // We directly implement these intrinsics - break; - default: - // If this is an intrinsic that directly corresponds to a GCC - // builtin, we handle it. - const char *BuiltinName = ""; -#define GET_GCC_BUILTIN_NAME -#include "llvm/Intrinsics.gen" -#undef GET_GCC_BUILTIN_NAME - // If we handle it, don't lower it. - if (BuiltinName[0]) break; - - // All other intrinsic calls we must lower. - Instruction *Before = 0; - if (CI != &BB->front()) - Before = prior(BasicBlock::iterator(CI)); - - IL->LowerIntrinsicCall(CI); - if (Before) { // Move iterator to instruction after call - I = Before; ++I; - } else { - I = BB->begin(); - } - // If the intrinsic got lowered to another call, and that call has - // a definition then we need to make sure its prototype is emitted - // before any calls to it. - if (CallInst *Call = dyn_cast<CallInst>(I)) - if (Function *NewF = Call->getCalledFunction()) - if (!NewF->isDeclaration()) - prototypesToGen.push_back(NewF); - - break; - } - - // We may have collected some prototypes to emit in the loop above. - // Emit them now, before the function that uses them is emitted. But, - // be careful not to emit them twice. - std::vector<Function*>::iterator I = prototypesToGen.begin(); - std::vector<Function*>::iterator E = prototypesToGen.end(); - for ( ; I != E; ++I) { - if (intrinsicPrototypesAlreadyGenerated.insert(*I).second) { - Out << '\n'; - printFunctionSignature(*I, true); - Out << ";\n"; - } - } -} - -void CWriter::visitCallInst(CallInst &I) { - if (isa<InlineAsm>(I.getCalledValue())) - return visitInlineAsm(I); - - bool WroteCallee = false; - - // Handle intrinsic function calls first... - if (Function *F = I.getCalledFunction()) - if (Intrinsic::ID ID = (Intrinsic::ID)F->getIntrinsicID()) - if (visitBuiltinCall(I, ID, WroteCallee)) - return; - - Value *Callee = I.getCalledValue(); - - PointerType *PTy = cast<PointerType>(Callee->getType()); - FunctionType *FTy = cast<FunctionType>(PTy->getElementType()); - - // If this is a call to a struct-return function, assign to the first - // parameter instead of passing it to the call. - const AttrListPtr &PAL = I.getAttributes(); - bool hasByVal = I.hasByValArgument(); - bool isStructRet = I.hasStructRetAttr(); - if (isStructRet) { - writeOperandDeref(I.getArgOperand(0)); - Out << " = "; - } - - if (I.isTailCall()) Out << " /*tail*/ "; - - if (!WroteCallee) { - // If this is an indirect call to a struct return function, we need to cast - // the pointer. Ditto for indirect calls with byval arguments. - bool NeedsCast = (hasByVal || isStructRet) && !isa<Function>(Callee); - - // GCC is a real PITA. 
It does not permit codegening casts of functions to - // function pointers if they are in a call (it generates a trap instruction - // instead!). We work around this by inserting a cast to void* in between - // the function and the function pointer cast. Unfortunately, we can't just - // form the constant expression here, because the folder will immediately - // nuke it. - // - // Note finally, that this is completely unsafe. ANSI C does not guarantee - // that void* and function pointers have the same size. :( To deal with this - // in the common case, we handle casts where the number of arguments passed - // match exactly. - // - if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Callee)) - if (CE->isCast()) - if (Function *RF = dyn_cast<Function>(CE->getOperand(0))) { - NeedsCast = true; - Callee = RF; - } - - if (NeedsCast) { - // Ok, just cast the pointer type. - Out << "(("; - if (isStructRet) - printStructReturnPointerFunctionType(Out, PAL, - cast<PointerType>(I.getCalledValue()->getType())); - else if (hasByVal) - printType(Out, I.getCalledValue()->getType(), false, "", true, PAL); - else - printType(Out, I.getCalledValue()->getType()); - Out << ")(void*)"; - } - writeOperand(Callee); - if (NeedsCast) Out << ')'; - } - - Out << '('; - - bool PrintedArg = false; - if(FTy->isVarArg() && !FTy->getNumParams()) { - Out << "0 /*dummy arg*/"; - PrintedArg = true; - } - - unsigned NumDeclaredParams = FTy->getNumParams(); - CallSite CS(&I); - CallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end(); - unsigned ArgNo = 0; - if (isStructRet) { // Skip struct return argument. - ++AI; - ++ArgNo; - } - - - for (; AI != AE; ++AI, ++ArgNo) { - if (PrintedArg) Out << ", "; - if (ArgNo < NumDeclaredParams && - (*AI)->getType() != FTy->getParamType(ArgNo)) { - Out << '('; - printType(Out, FTy->getParamType(ArgNo), - /*isSigned=*/PAL.paramHasAttr(ArgNo+1, Attribute::SExt)); - Out << ')'; - } - // Check if the argument is expected to be passed by value. - if (I.paramHasAttr(ArgNo+1, Attribute::ByVal)) - writeOperandDeref(*AI); - else - writeOperand(*AI); - PrintedArg = true; - } - Out << ')'; -} - -/// visitBuiltinCall - Handle the call to the specified builtin. Returns true -/// if the entire call is handled, return false if it wasn't handled, and -/// optionally set 'WroteCallee' if the callee has already been printed out. -bool CWriter::visitBuiltinCall(CallInst &I, Intrinsic::ID ID, - bool &WroteCallee) { - switch (ID) { - default: { - // If this is an intrinsic that directly corresponds to a GCC - // builtin, we emit it here. - const char *BuiltinName = ""; - Function *F = I.getCalledFunction(); -#define GET_GCC_BUILTIN_NAME -#include "llvm/Intrinsics.gen" -#undef GET_GCC_BUILTIN_NAME - assert(BuiltinName[0] && "Unknown LLVM intrinsic!"); - - Out << BuiltinName; - WroteCallee = true; - return false; - } - case Intrinsic::vastart: - Out << "0; "; - - Out << "va_start(*(va_list*)"; - writeOperand(I.getArgOperand(0)); - Out << ", "; - // Output the last argument to the enclosing function. 
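The void* detour described here shows up in the output as a double cast. A sketch of the emitted shape for an indirect call whose callee was cast from a function with a different prototype (types and names illustrative):

    unsigned int call_via(void *callee, void *arg) {
      /* cast to void* first so GCC will accept the function-pointer cast inside a call */
      return ((unsigned int (*)(void *))(void*)callee)(arg);
    }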
- if (I.getParent()->getParent()->arg_empty()) - Out << "vararg_dummy_arg"; - else - writeOperand(--I.getParent()->getParent()->arg_end()); - Out << ')'; - return true; - case Intrinsic::vaend: - if (!isa<ConstantPointerNull>(I.getArgOperand(0))) { - Out << "0; va_end(*(va_list*)"; - writeOperand(I.getArgOperand(0)); - Out << ')'; - } else { - Out << "va_end(*(va_list*)0)"; - } - return true; - case Intrinsic::vacopy: - Out << "0; "; - Out << "va_copy(*(va_list*)"; - writeOperand(I.getArgOperand(0)); - Out << ", *(va_list*)"; - writeOperand(I.getArgOperand(1)); - Out << ')'; - return true; - case Intrinsic::returnaddress: - Out << "__builtin_return_address("; - writeOperand(I.getArgOperand(0)); - Out << ')'; - return true; - case Intrinsic::frameaddress: - Out << "__builtin_frame_address("; - writeOperand(I.getArgOperand(0)); - Out << ')'; - return true; - case Intrinsic::powi: - Out << "__builtin_powi("; - writeOperand(I.getArgOperand(0)); - Out << ", "; - writeOperand(I.getArgOperand(1)); - Out << ')'; - return true; - case Intrinsic::setjmp: - Out << "setjmp(*(jmp_buf*)"; - writeOperand(I.getArgOperand(0)); - Out << ')'; - return true; - case Intrinsic::longjmp: - Out << "longjmp(*(jmp_buf*)"; - writeOperand(I.getArgOperand(0)); - Out << ", "; - writeOperand(I.getArgOperand(1)); - Out << ')'; - return true; - case Intrinsic::prefetch: - Out << "LLVM_PREFETCH((const void *)"; - writeOperand(I.getArgOperand(0)); - Out << ", "; - writeOperand(I.getArgOperand(1)); - Out << ", "; - writeOperand(I.getArgOperand(2)); - Out << ")"; - return true; - case Intrinsic::stacksave: - // Emit this as: Val = 0; *((void**)&Val) = __builtin_stack_save() - // to work around GCC bugs (see PR1809). - Out << "0; *((void**)&" << GetValueName(&I) - << ") = __builtin_stack_save()"; - return true; - case Intrinsic::x86_sse_cmp_ss: - case Intrinsic::x86_sse_cmp_ps: - case Intrinsic::x86_sse2_cmp_sd: - case Intrinsic::x86_sse2_cmp_pd: - Out << '('; - printType(Out, I.getType()); - Out << ')'; - // Multiple GCC builtins multiplex onto this intrinsic. - switch (cast<ConstantInt>(I.getArgOperand(2))->getZExtValue()) { - default: llvm_unreachable("Invalid llvm.x86.sse.cmp!"); - case 0: Out << "__builtin_ia32_cmpeq"; break; - case 1: Out << "__builtin_ia32_cmplt"; break; - case 2: Out << "__builtin_ia32_cmple"; break; - case 3: Out << "__builtin_ia32_cmpunord"; break; - case 4: Out << "__builtin_ia32_cmpneq"; break; - case 5: Out << "__builtin_ia32_cmpnlt"; break; - case 6: Out << "__builtin_ia32_cmpnle"; break; - case 7: Out << "__builtin_ia32_cmpord"; break; - } - if (ID == Intrinsic::x86_sse_cmp_ps || ID == Intrinsic::x86_sse2_cmp_pd) - Out << 'p'; - else - Out << 's'; - if (ID == Intrinsic::x86_sse_cmp_ss || ID == Intrinsic::x86_sse_cmp_ps) - Out << 's'; - else - Out << 'd'; - - Out << "("; - writeOperand(I.getArgOperand(0)); - Out << ", "; - writeOperand(I.getArgOperand(1)); - Out << ")"; - return true; - case Intrinsic::ppc_altivec_lvsl: - Out << '('; - printType(Out, I.getType()); - Out << ')'; - Out << "__builtin_altivec_lvsl(0, (void*)"; - writeOperand(I.getArgOperand(0)); - Out << ")"; - return true; - case Intrinsic::uadd_with_overflow: - case Intrinsic::sadd_with_overflow: - Out << GetValueName(I.getCalledFunction()) << "("; - writeOperand(I.getArgOperand(0)); - Out << ", "; - writeOperand(I.getArgOperand(1)); - Out << ")"; - return true; - } -} - -//This converts the llvm constraint string to something gcc is expecting. 
-//TODO: work out platform independent constraints and factor those out
-// of the per target tables
-// handle multiple constraint codes
-std::string CWriter::InterpretASMConstraint(InlineAsm::ConstraintInfo& c) {
- assert(c.Codes.size() == 1 && "Too many asm constraint codes to handle");
-
- // Grab the translation table from MCAsmInfo if it exists.
- const MCAsmInfo *TargetAsm;
- std::string Triple = TheModule->getTargetTriple();
- if (Triple.empty())
- Triple = llvm::sys::getDefaultTargetTriple();
-
- std::string E;
- if (const Target *Match = TargetRegistry::lookupTarget(Triple, E))
- TargetAsm = Match->createMCAsmInfo(Triple);
- else
- return c.Codes[0];
-
- const char *const *table = TargetAsm->getAsmCBE();
-
- // Search the translation table if it exists.
- for (int i = 0; table && table[i]; i += 2)
- if (c.Codes[0] == table[i]) {
- delete TargetAsm;
- return table[i+1];
- }
-
- // Default is identity.
- delete TargetAsm;
- return c.Codes[0];
-}
-
-//TODO: import logic from AsmPrinter.cpp
-static std::string gccifyAsm(std::string asmstr) {
- for (std::string::size_type i = 0; i != asmstr.size(); ++i)
- if (asmstr[i] == '\n')
- asmstr.replace(i, 1, "\\n");
- else if (asmstr[i] == '\t')
- asmstr.replace(i, 1, "\\t");
- else if (asmstr[i] == '$') {
- if (asmstr[i + 1] == '{') {
- std::string::size_type a = asmstr.find_first_of(':', i + 1);
- std::string::size_type b = asmstr.find_first_of('}', i + 1);
- std::string n = "%" +
- asmstr.substr(a + 1, b - a - 1) +
- asmstr.substr(i + 2, a - i - 2);
- asmstr.replace(i, b - i + 1, n);
- i += n.size() - 1;
- } else
- asmstr.replace(i, 1, "%");
- }
- else if (asmstr[i] == '%')//grr
- { asmstr.replace(i, 1, "%%"); ++i;}
-
- return asmstr;
-}
-
-//TODO: assumptions about what consumes arguments from the call are likely wrong
-// handle commutativity
-void CWriter::visitInlineAsm(CallInst &CI) {
- InlineAsm* as = cast<InlineAsm>(CI.getCalledValue());
- InlineAsm::ConstraintInfoVector Constraints = as->ParseConstraints();
-
- std::vector<std::pair<Value*, int> > ResultVals;
- if (CI.getType() == Type::getVoidTy(CI.getContext()))
- ;
- else if (StructType *ST = dyn_cast<StructType>(CI.getType())) {
- for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i)
- ResultVals.push_back(std::make_pair(&CI, (int)i));
- } else {
- ResultVals.push_back(std::make_pair(&CI, -1));
- }
-
- // Fix up the asm string for gcc and emit it.
- Out << "__asm__ volatile (\"" << gccifyAsm(as->getAsmString()) << "\"\n";
- Out << " :";
-
- unsigned ValueCount = 0;
- bool IsFirst = true;
-
- // Convert over all the output constraints.
- for (InlineAsm::ConstraintInfoVector::iterator I = Constraints.begin(),
- E = Constraints.end(); I != E; ++I) {
-
- if (I->Type != InlineAsm::isOutput) {
- ++ValueCount;
- continue; // Ignore non-output constraints.
- }
-
- assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle");
- std::string C = InterpretASMConstraint(*I);
- if (C.empty()) continue;
-
- if (!IsFirst)
- Out << ", ";
- IsFirst = false;
-
- // Unpack the dest.
- Value *DestVal;
- int DestValNo = -1;
-
- if (ValueCount < ResultVals.size()) {
- DestVal = ResultVals[ValueCount].first;
- DestValNo = ResultVals[ValueCount].second;
- } else
- DestVal = CI.getArgOperand(ValueCount-ResultVals.size());
-
- if (I->isEarlyClobber)
- C = "&"+C;
-
- Out << "\"=" << C << "\"(" << GetValueName(DestVal);
- if (DestValNo != -1)
- Out << ".field" << DestValNo; // Multiple retvals.
- Out << ")";
- ++ValueCount;
- }
-
-
- // Convert over all the input constraints.
- Out << "\n :"; - IsFirst = true; - ValueCount = 0; - for (InlineAsm::ConstraintInfoVector::iterator I = Constraints.begin(), - E = Constraints.end(); I != E; ++I) { - if (I->Type != InlineAsm::isInput) { - ++ValueCount; - continue; // Ignore non-input constraints. - } - - assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle"); - std::string C = InterpretASMConstraint(*I); - if (C.empty()) continue; - - if (!IsFirst) { - Out << ", "; - IsFirst = false; - } - - assert(ValueCount >= ResultVals.size() && "Input can't refer to result"); - Value *SrcVal = CI.getArgOperand(ValueCount-ResultVals.size()); - - Out << "\"" << C << "\"("; - if (!I->isIndirect) - writeOperand(SrcVal); - else - writeOperandDeref(SrcVal); - Out << ")"; - } - - // Convert over the clobber constraints. - IsFirst = true; - for (InlineAsm::ConstraintInfoVector::iterator I = Constraints.begin(), - E = Constraints.end(); I != E; ++I) { - if (I->Type != InlineAsm::isClobber) - continue; // Ignore non-input constraints. - - assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle"); - std::string C = InterpretASMConstraint(*I); - if (C.empty()) continue; - - if (!IsFirst) { - Out << ", "; - IsFirst = false; - } - - Out << '\"' << C << '"'; - } - - Out << ")"; -} - -void CWriter::visitAllocaInst(AllocaInst &I) { - Out << '('; - printType(Out, I.getType()); - Out << ") alloca(sizeof("; - printType(Out, I.getType()->getElementType()); - Out << ')'; - if (I.isArrayAllocation()) { - Out << " * " ; - writeOperand(I.getOperand(0)); - } - Out << ')'; -} - -void CWriter::printGEPExpression(Value *Ptr, gep_type_iterator I, - gep_type_iterator E, bool Static) { - - // If there are no indices, just print out the pointer. - if (I == E) { - writeOperand(Ptr); - return; - } - - // Find out if the last index is into a vector. If so, we have to print this - // specially. Since vectors can't have elements of indexable type, only the - // last index could possibly be of a vector element. - VectorType *LastIndexIsVector = 0; - { - for (gep_type_iterator TmpI = I; TmpI != E; ++TmpI) - LastIndexIsVector = dyn_cast<VectorType>(*TmpI); - } - - Out << "("; - - // If the last index is into a vector, we can't print it as &a[i][j] because - // we can't index into a vector with j in GCC. Instead, emit this as - // (((float*)&a[i])+j) - if (LastIndexIsVector) { - Out << "(("; - printType(Out, PointerType::getUnqual(LastIndexIsVector->getElementType())); - Out << ")("; - } - - Out << '&'; - - // If the first index is 0 (very typical) we can do a number of - // simplifications to clean up the code. - Value *FirstOp = I.getOperand(); - if (!isa<Constant>(FirstOp) || !cast<Constant>(FirstOp)->isNullValue()) { - // First index isn't simple, print it the hard way. - writeOperand(Ptr); - } else { - ++I; // Skip the zero index. - - // Okay, emit the first operand. If Ptr is something that is already address - // exposed, like a global, avoid emitting (&foo)[0], just emit foo instead. - if (isAddressExposed(Ptr)) { - writeOperandInternal(Ptr, Static); - } else if (I != E && (*I)->isStructTy()) { - // If we didn't already emit the first operand, see if we can print it as - // P->f instead of "P[0].f" - writeOperand(Ptr); - Out << "->field" << cast<ConstantInt>(I.getOperand())->getZExtValue(); - ++I; // eat the struct index as well. - } else { - // Instead of emitting P[0][1], emit (*P)[1], which is more idiomatic. 
- Out << "(*"; - writeOperand(Ptr); - Out << ")"; - } - } - - for (; I != E; ++I) { - if ((*I)->isStructTy()) { - Out << ".field" << cast<ConstantInt>(I.getOperand())->getZExtValue(); - } else if ((*I)->isArrayTy()) { - Out << ".array["; - writeOperandWithCast(I.getOperand(), Instruction::GetElementPtr); - Out << ']'; - } else if (!(*I)->isVectorTy()) { - Out << '['; - writeOperandWithCast(I.getOperand(), Instruction::GetElementPtr); - Out << ']'; - } else { - // If the last index is into a vector, then print it out as "+j)". This - // works with the 'LastIndexIsVector' code above. - if (isa<Constant>(I.getOperand()) && - cast<Constant>(I.getOperand())->isNullValue()) { - Out << "))"; // avoid "+0". - } else { - Out << ")+("; - writeOperandWithCast(I.getOperand(), Instruction::GetElementPtr); - Out << "))"; - } - } - } - Out << ")"; -} - -void CWriter::writeMemoryAccess(Value *Operand, Type *OperandType, - bool IsVolatile, unsigned Alignment) { - - bool IsUnaligned = Alignment && - Alignment < TD->getABITypeAlignment(OperandType); - - if (!IsUnaligned) - Out << '*'; - if (IsVolatile || IsUnaligned) { - Out << "(("; - if (IsUnaligned) - Out << "struct __attribute__ ((packed, aligned(" << Alignment << "))) {"; - printType(Out, OperandType, false, IsUnaligned ? "data" : "volatile*"); - if (IsUnaligned) { - Out << "; } "; - if (IsVolatile) Out << "volatile "; - Out << "*"; - } - Out << ")"; - } - - writeOperand(Operand); - - if (IsVolatile || IsUnaligned) { - Out << ')'; - if (IsUnaligned) - Out << "->data"; - } -} - -void CWriter::visitLoadInst(LoadInst &I) { - writeMemoryAccess(I.getOperand(0), I.getType(), I.isVolatile(), - I.getAlignment()); - -} - -void CWriter::visitStoreInst(StoreInst &I) { - writeMemoryAccess(I.getPointerOperand(), I.getOperand(0)->getType(), - I.isVolatile(), I.getAlignment()); - Out << " = "; - Value *Operand = I.getOperand(0); - Constant *BitMask = 0; - if (IntegerType* ITy = dyn_cast<IntegerType>(Operand->getType())) - if (!ITy->isPowerOf2ByteWidth()) - // We have a bit width that doesn't match an even power-of-2 byte - // size. Consequently we must & the value with the type's bit mask - BitMask = ConstantInt::get(ITy, ITy->getBitMask()); - if (BitMask) - Out << "(("; - writeOperand(Operand); - if (BitMask) { - Out << ") & "; - printConstant(BitMask, false); - Out << ")"; - } -} - -void CWriter::visitGetElementPtrInst(GetElementPtrInst &I) { - printGEPExpression(I.getPointerOperand(), gep_type_begin(I), - gep_type_end(I), false); -} - -void CWriter::visitVAArgInst(VAArgInst &I) { - Out << "va_arg(*(va_list*)"; - writeOperand(I.getOperand(0)); - Out << ", "; - printType(Out, I.getType()); - Out << ");\n "; -} - -void CWriter::visitInsertElementInst(InsertElementInst &I) { - Type *EltTy = I.getType()->getElementType(); - writeOperand(I.getOperand(0)); - Out << ";\n "; - Out << "(("; - printType(Out, PointerType::getUnqual(EltTy)); - Out << ")(&" << GetValueName(&I) << "))["; - writeOperand(I.getOperand(2)); - Out << "] = ("; - writeOperand(I.getOperand(1)); - Out << ")"; -} - -void CWriter::visitExtractElementInst(ExtractElementInst &I) { - // We know that our operand is not inlined. 
- Out << "(("; - Type *EltTy = - cast<VectorType>(I.getOperand(0)->getType())->getElementType(); - printType(Out, PointerType::getUnqual(EltTy)); - Out << ")(&" << GetValueName(I.getOperand(0)) << "))["; - writeOperand(I.getOperand(1)); - Out << "]"; -} - -void CWriter::visitShuffleVectorInst(ShuffleVectorInst &SVI) { - Out << "("; - printType(Out, SVI.getType()); - Out << "){ "; - VectorType *VT = SVI.getType(); - unsigned NumElts = VT->getNumElements(); - Type *EltTy = VT->getElementType(); - - for (unsigned i = 0; i != NumElts; ++i) { - if (i) Out << ", "; - int SrcVal = SVI.getMaskValue(i); - if ((unsigned)SrcVal >= NumElts*2) { - Out << " 0/*undef*/ "; - } else { - Value *Op = SVI.getOperand((unsigned)SrcVal >= NumElts); - if (isa<Instruction>(Op)) { - // Do an extractelement of this value from the appropriate input. - Out << "(("; - printType(Out, PointerType::getUnqual(EltTy)); - Out << ")(&" << GetValueName(Op) - << "))[" << (SrcVal & (NumElts-1)) << "]"; - } else if (isa<ConstantAggregateZero>(Op) || isa<UndefValue>(Op)) { - Out << "0"; - } else { - printConstant(cast<ConstantVector>(Op)->getOperand(SrcVal & - (NumElts-1)), - false); - } - } - } - Out << "}"; -} - -void CWriter::visitInsertValueInst(InsertValueInst &IVI) { - // Start by copying the entire aggregate value into the result variable. - writeOperand(IVI.getOperand(0)); - Out << ";\n "; - - // Then do the insert to update the field. - Out << GetValueName(&IVI); - for (const unsigned *b = IVI.idx_begin(), *i = b, *e = IVI.idx_end(); - i != e; ++i) { - Type *IndexedTy = - ExtractValueInst::getIndexedType(IVI.getOperand(0)->getType(), - makeArrayRef(b, i+1)); - if (IndexedTy->isArrayTy()) - Out << ".array[" << *i << "]"; - else - Out << ".field" << *i; - } - Out << " = "; - writeOperand(IVI.getOperand(1)); -} - -void CWriter::visitExtractValueInst(ExtractValueInst &EVI) { - Out << "("; - if (isa<UndefValue>(EVI.getOperand(0))) { - Out << "("; - printType(Out, EVI.getType()); - Out << ") 0/*UNDEF*/"; - } else { - Out << GetValueName(EVI.getOperand(0)); - for (const unsigned *b = EVI.idx_begin(), *i = b, *e = EVI.idx_end(); - i != e; ++i) { - Type *IndexedTy = - ExtractValueInst::getIndexedType(EVI.getOperand(0)->getType(), - makeArrayRef(b, i+1)); - if (IndexedTy->isArrayTy()) - Out << ".array[" << *i << "]"; - else - Out << ".field" << *i; - } - } - Out << ")"; -} - -//===----------------------------------------------------------------------===// -// External Interface declaration -//===----------------------------------------------------------------------===// - -bool CTargetMachine::addPassesToEmitFile(PassManagerBase &PM, - formatted_raw_ostream &o, - CodeGenFileType FileType, - bool DisableVerify) { - if (FileType != TargetMachine::CGFT_AssemblyFile) return true; - - PM.add(createGCLoweringPass()); - PM.add(createLowerInvokePass()); - PM.add(createCFGSimplificationPass()); // clean up after lower invoke. 
- PM.add(new CWriter(o)); - PM.add(createGCInfoDeleter()); - return false; -} diff --git a/lib/Target/CBackend/CMakeLists.txt b/lib/Target/CBackend/CMakeLists.txt deleted file mode 100644 index fa819a4..0000000 --- a/lib/Target/CBackend/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -add_llvm_target(CBackendCodeGen - CBackend.cpp - ) - -add_subdirectory(TargetInfo) diff --git a/lib/Target/CBackend/CTargetMachine.h b/lib/Target/CBackend/CTargetMachine.h deleted file mode 100644 index 8b2286e..0000000 --- a/lib/Target/CBackend/CTargetMachine.h +++ /dev/null @@ -1,42 +0,0 @@ -//===-- CTargetMachine.h - TargetMachine for the C backend ------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file declares the TargetMachine that is used by the C backend. -// -//===----------------------------------------------------------------------===// - -#ifndef CTARGETMACHINE_H -#define CTARGETMACHINE_H - -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetData.h" - -namespace llvm { - -struct CTargetMachine : public TargetMachine { - CTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) - : TargetMachine(T, TT, CPU, FS, Options) { } - - virtual bool addPassesToEmitFile(PassManagerBase &PM, - formatted_raw_ostream &Out, - CodeGenFileType FileType, - bool DisableVerify); - - virtual const TargetData *getTargetData() const { return 0; } -}; - -extern Target TheCBackendTarget; - -} // End llvm namespace - - -#endif diff --git a/lib/Target/CBackend/TargetInfo/CBackendTargetInfo.cpp b/lib/Target/CBackend/TargetInfo/CBackendTargetInfo.cpp deleted file mode 100644 index e8274ff..0000000 --- a/lib/Target/CBackend/TargetInfo/CBackendTargetInfo.cpp +++ /dev/null @@ -1,21 +0,0 @@ -//===-- CBackendTargetInfo.cpp - CBackend Target Implementation -----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "CTargetMachine.h" -#include "llvm/Module.h" -#include "llvm/Support/TargetRegistry.h" -using namespace llvm; - -Target llvm::TheCBackendTarget; - -extern "C" void LLVMInitializeCBackendTargetInfo() { - RegisterTarget<> X(TheCBackendTarget, "c", "C backend"); -} - -extern "C" void LLVMInitializeCBackendTargetMC() {} diff --git a/lib/Target/CBackend/TargetInfo/CMakeLists.txt b/lib/Target/CBackend/TargetInfo/CMakeLists.txt deleted file mode 100644 index 6203616..0000000 --- a/lib/Target/CBackend/TargetInfo/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. 
) - -add_llvm_library(LLVMCBackendInfo - CBackendTargetInfo.cpp - ) diff --git a/lib/Target/CMakeLists.txt b/lib/Target/CMakeLists.txt index d8bc743..5913a9c 100644 --- a/lib/Target/CMakeLists.txt +++ b/lib/Target/CMakeLists.txt @@ -9,6 +9,7 @@ add_llvm_library(LLVMTarget TargetLibraryInfo.cpp TargetLoweringObjectFile.cpp TargetMachine.cpp + TargetMachineC.cpp TargetRegisterInfo.cpp TargetSubtargetInfo.cpp ) diff --git a/lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.h b/lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.h index a3717b0..d26449e 100644 --- a/lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.h +++ b/lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.h @@ -15,9 +15,7 @@ #define SPUMCTARGETDESC_H namespace llvm { -class MCSubtargetInfo; class Target; -class StringRef; extern Target TheCellSPUTarget; diff --git a/lib/Target/CellSPU/SPUCallingConv.td b/lib/Target/CellSPU/SPUCallingConv.td index 9f9692b..9bc6be7 100644 --- a/lib/Target/CellSPU/SPUCallingConv.td +++ b/lib/Target/CellSPU/SPUCallingConv.td @@ -11,10 +11,6 @@ // //===----------------------------------------------------------------------===// -/// CCIfSubtarget - Match if the current subtarget has a feature F. -class CCIfSubtarget<string F, CCAction A> - : CCIf<!strconcat("State.getTarget().getSubtarget<PPCSubtarget>().", F), A>; - //===----------------------------------------------------------------------===// // Return Value Calling Convention //===----------------------------------------------------------------------===// diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp index 55b3f72..0623741 100644 --- a/lib/Target/CellSPU/SPUISelLowering.cpp +++ b/lib/Target/CellSPU/SPUISelLowering.cpp @@ -3158,7 +3158,6 @@ SPUTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, //! 
Compute used/known bits for a SPU operand void SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, - const APInt &Mask, APInt &KnownZero, APInt &KnownOne, const SelectionDAG &DAG, @@ -3224,7 +3223,7 @@ bool SPUTargetLowering::isLegalAddressImmediate(int64_t V, return (V > -(1 << 18) && V < (1 << 18) - 1); } -bool SPUTargetLowering::isLegalAddressImmediate(llvm::GlobalValue* GV) const { +bool SPUTargetLowering::isLegalAddressImmediate(GlobalValue* GV) const { return false; } diff --git a/lib/Target/CellSPU/SPUISelLowering.h b/lib/Target/CellSPU/SPUISelLowering.h index 25c5355..e3db7b2 100644 --- a/lib/Target/CellSPU/SPUISelLowering.h +++ b/lib/Target/CellSPU/SPUISelLowering.h @@ -121,7 +121,6 @@ namespace llvm { virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; virtual void computeMaskedBitsForTargetNode(const SDValue Op, - const APInt &Mask, APInt &KnownZero, APInt &KnownOne, const SelectionDAG &DAG, diff --git a/lib/Target/CellSPU/SPUSubtarget.cpp b/lib/Target/CellSPU/SPUSubtarget.cpp index ebfefe2..eec2d25 100644 --- a/lib/Target/CellSPU/SPUSubtarget.cpp +++ b/lib/Target/CellSPU/SPUSubtarget.cpp @@ -15,7 +15,6 @@ #include "SPU.h" #include "SPURegisterInfo.h" #include "llvm/Support/TargetRegistry.h" -#include "llvm/ADT/SmallVector.h" #define GET_SUBTARGETINFO_TARGET_DESC #define GET_SUBTARGETINFO_CTOR diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp index 107c6cc..69f0ff8 100644 --- a/lib/Target/CppBackend/CPPBackend.cpp +++ b/lib/Target/CppBackend/CPPBackend.cpp @@ -33,8 +33,9 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/Config/config.h" #include <algorithm> -#include <set> +#include <cstdio> #include <map> +#include <set> using namespace llvm; static cl::opt<std::string> @@ -195,6 +196,18 @@ void CppWriter::error(const std::string& msg) { report_fatal_error(msg); } +static inline std::string ftostr(const APFloat& V) { + std::string Buf; + if (&V.getSemantics() == &APFloat::IEEEdouble) { + raw_string_ostream(Buf) << V.convertToDouble(); + return Buf; + } else if (&V.getSemantics() == &APFloat::IEEEsingle) { + raw_string_ostream(Buf) << (double)V.convertToFloat(); + return Buf; + } + return "<unknown format in ftostr>"; // error +} + // printCFP - Print a floating point constant .. very carefully :) // This makes sure that conversion to/from floating yields the same binary // result so that we don't lose precision. 
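The ftostr helper added above leans on a temporary raw_string_ostream flushing into its backing string on destruction, and on convertToDouble/convertToFloat being legal only for the matching semantics (hence the getSemantics checks). A usage sketch against the LLVM APIs of this vintage (hand-written; doubleToText is a hypothetical name and LLVM headers are assumed):

    #include "llvm/ADT/APFloat.h"
    #include "llvm/Support/raw_ostream.h"
    #include <string>

    static std::string doubleToText() {
      llvm::APFloat V(1.5);  // double ctor -> IEEEdouble semantics
      std::string Buf;
      if (&V.getSemantics() == &llvm::APFloat::IEEEdouble)
        llvm::raw_string_ostream(Buf) << V.convertToDouble();  // flushed on destruction
      return Buf;  // "1.5"
    }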
diff --git a/lib/Target/Hexagon/CMakeLists.txt b/lib/Target/Hexagon/CMakeLists.txt index 6c5da72..af9e813 100644 --- a/lib/Target/Hexagon/CMakeLists.txt +++ b/lib/Target/Hexagon/CMakeLists.txt @@ -16,6 +16,7 @@ add_llvm_target(HexagonCodeGen HexagonExpandPredSpillCode.cpp HexagonFrameLowering.cpp HexagonHardwareLoops.cpp + HexagonMCInstLower.cpp HexagonInstrInfo.cpp HexagonISelDAGToDAG.cpp HexagonISelLowering.cpp @@ -27,8 +28,9 @@ add_llvm_target(HexagonCodeGen HexagonSubtarget.cpp HexagonTargetMachine.cpp HexagonTargetObjectFile.cpp - ) +) add_subdirectory(TargetInfo) +add_subdirectory(InstPrinter) add_subdirectory(MCTargetDesc) diff --git a/lib/Target/Hexagon/Hexagon.h b/lib/Target/Hexagon/Hexagon.h index 270c7a7..0808323 100644 --- a/lib/Target/Hexagon/Hexagon.h +++ b/lib/Target/Hexagon/Hexagon.h @@ -17,10 +17,14 @@ #include "MCTargetDesc/HexagonMCTargetDesc.h" #include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" namespace llvm { class FunctionPass; class TargetMachine; + class MachineInstr; + class MCInst; + class HexagonAsmPrinter; class HexagonTargetMachine; class raw_ostream; @@ -30,13 +34,23 @@ namespace llvm { FunctionPass *createHexagonRemoveExtendOps(HexagonTargetMachine &TM); FunctionPass *createHexagonCFGOptimizer(HexagonTargetMachine &TM); - FunctionPass* createHexagonSplitTFRCondSets(HexagonTargetMachine &TM); - FunctionPass* createHexagonExpandPredSpillCode(HexagonTargetMachine &TM); + FunctionPass *createHexagonSplitTFRCondSets(HexagonTargetMachine &TM); + FunctionPass *createHexagonExpandPredSpillCode(HexagonTargetMachine &TM); FunctionPass *createHexagonHardwareLoops(); FunctionPass *createHexagonPeephole(); FunctionPass *createHexagonFixupHwLoops(); +/* TODO: object output. + MCCodeEmitter *createHexagonMCCodeEmitter(const Target &, + TargetMachine &TM, + MCContext &Ctx); +*/ +/* TODO: assembler input. + TargetAsmBackend *createHexagonAsmBackend(const Target &, const std::string &); +*/ + void HexagonLowerToMC(const MachineInstr *MI, MCInst &MCI, + HexagonAsmPrinter &AP); } // end namespace llvm; #define Hexagon_POINTER_SIZE 4 @@ -50,4 +64,10 @@ namespace llvm { // a new stack frame. This takes 8 bytes. #define HEXAGON_LRFP_SIZE 8 +// Normal instruction size (in bytes). +#define HEXAGON_INSTR_SIZE 4 + +// Maximum number of words in a packet (in instructions). +#define HEXAGON_PACKET_SIZE 4 + #endif diff --git a/lib/Target/Hexagon/Hexagon.td b/lib/Target/Hexagon/Hexagon.td index ab5093d..4a50d16 100644 --- a/lib/Target/Hexagon/Hexagon.td +++ b/lib/Target/Hexagon/Hexagon.td @@ -39,10 +39,7 @@ include "HexagonInstrInfo.td" include "HexagonIntrinsics.td" include "HexagonIntrinsicsDerived.td" - -def HexagonInstrInfo : InstrInfo { - // Define how we want to layout our target-specific information field. -} +def HexagonInstrInfo : InstrInfo; //===----------------------------------------------------------------------===// // Hexagon processors supported. @@ -56,6 +53,13 @@ def : Proc<"hexagonv2", HexagonItineraries, [ArchV2]>; def : Proc<"hexagonv3", HexagonItineraries, [ArchV2, ArchV3]>; def : Proc<"hexagonv4", HexagonItinerariesV4, [ArchV2, ArchV3, ArchV4]>; +// Hexagon Uses the MC printer for assembler output, so make sure the TableGen +// AsmWriter bits get associated with the correct class. 
+def HexagonAsmWriter : AsmWriter { + string AsmWriterClassName = "InstPrinter"; + bit isMCAsmWriter = 1; +} + //===----------------------------------------------------------------------===// // Declare the target which we are implementing //===----------------------------------------------------------------------===// @@ -63,4 +67,6 @@ def : Proc<"hexagonv4", HexagonItinerariesV4, [ArchV2, ArchV3, ArchV4]>; def Hexagon : Target { // Pull in Instruction Info: let InstructionSet = HexagonInstrInfo; + + let AssemblyWriters = [HexagonAsmWriter]; } diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp index bf333b7..39bf45d 100644 --- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp +++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -16,25 +16,33 @@ #define DEBUG_TYPE "asm-printer" #include "Hexagon.h" +#include "HexagonAsmPrinter.h" +#include "HexagonMachineFunctionInfo.h" #include "HexagonTargetMachine.h" #include "HexagonSubtarget.h" -#include "HexagonMachineFunctionInfo.h" +#include "InstPrinter/HexagonInstPrinter.h" #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/Module.h" +#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Assembly/Writer.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCSection.h" +#include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Compiler.h" +#include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Target/Mangler.h" @@ -43,8 +51,8 @@ #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetOptions.h" -#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" using namespace llvm; @@ -53,163 +61,9 @@ static cl::opt<bool> AlignCalls( "hexagon-align-calls", cl::Hidden, cl::init(true), cl::desc("Insert falign after call instruction for Hexagon target")); - -namespace { - class HexagonAsmPrinter : public AsmPrinter { - const HexagonSubtarget *Subtarget; - - public: - explicit HexagonAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) - : AsmPrinter(TM, Streamer) { - Subtarget = &TM.getSubtarget<HexagonSubtarget>(); - } - - virtual const char *getPassName() const { - return "Hexagon Assembly Printer"; - } - - /// printInstruction - This method is automatically generated by tablegen - /// from the instruction set description. This method returns true if the - /// machine instruction was sufficiently described to print it, otherwise it - void printInstruction(const MachineInstr *MI, raw_ostream &O); - virtual void EmitInstruction(const MachineInstr *MI); - - void printOp(const MachineOperand &MO, raw_ostream &O); - - /// printRegister - Print register according to target requirements. 
- /// - void printRegister(const MachineOperand &MO, bool R0AsZero, - raw_ostream &O) { - unsigned RegNo = MO.getReg(); - assert(TargetRegisterInfo::isPhysicalRegister(RegNo) && "Not physreg??"); - O << getRegisterName(RegNo); - } - - void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &OS) { - const MachineOperand &MO = MI->getOperand(OpNo); - if (MO.isReg()) { - printRegister(MO, false, OS); - } else if (MO.isImm()) { - OS << MO.getImm(); - } else { - printOp(MO, OS); - } - } - - - bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const; - - bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &OS); - bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &OS); - - - void printHexagonImmOperand(const MachineInstr *MI, unsigned OpNo, - raw_ostream &O) { - int value = MI->getOperand(OpNo).getImm(); - O << value; - } - - - void printHexagonNegImmOperand(const MachineInstr *MI, unsigned OpNo, - raw_ostream &O) { - int value = MI->getOperand(OpNo).getImm(); - O << -value; - } - - void printHexagonNOneImmOperand(const MachineInstr *MI, unsigned OpNo, - raw_ostream &O) const { - O << -1; - } - - void printHexagonMEMriOperand(const MachineInstr *MI, unsigned OpNo, - raw_ostream &O) { - const MachineOperand &MO1 = MI->getOperand(OpNo); - const MachineOperand &MO2 = MI->getOperand(OpNo+1); - - O << getRegisterName(MO1.getReg()) - << " + #" - << (int) MO2.getImm(); - } - - - void printHexagonFrameIndexOperand(const MachineInstr *MI, unsigned OpNo, - raw_ostream &O) { - const MachineOperand &MO1 = MI->getOperand(OpNo); - const MachineOperand &MO2 = MI->getOperand(OpNo+1); - - O << getRegisterName(MO1.getReg()) - << ", #" - << MO2.getImm(); - } - - void printBranchOperand(const MachineInstr *MI, unsigned OpNo, - raw_ostream &O) { - // Branches can take an immediate operand. This is used by the branch - // selection pass to print $+8, an eight byte displacement from the PC. - if (MI->getOperand(OpNo).isImm()) { - O << "$+" << MI->getOperand(OpNo).getImm()*4; - } else { - printOp(MI->getOperand(OpNo), O); - } - } - - void printCallOperand(const MachineInstr *MI, unsigned OpNo, - raw_ostream &O) { - } - - void printAbsAddrOperand(const MachineInstr *MI, unsigned OpNo, - raw_ostream &O) { - } - - - void printSymbolHi(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { - O << "#HI("; - if (MI->getOperand(OpNo).isImm()) { - printHexagonImmOperand(MI, OpNo, O); - } else { - printOp(MI->getOperand(OpNo), O); - } - O << ")"; - } - - void printSymbolLo(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { - O << "#HI("; - if (MI->getOperand(OpNo).isImm()) { - printHexagonImmOperand(MI, OpNo, O); - } else { - printOp(MI->getOperand(OpNo), O); - } - O << ")"; - } - - void printPredicateOperand(const MachineInstr *MI, unsigned OpNo, - raw_ostream &O); - - void printAddrModeBasePlusOffset(const MachineInstr *MI, int OpNo, - raw_ostream &O); - - void printGlobalOperand(const MachineInstr *MI, int OpNo, raw_ostream &O); - void printJumpTable(const MachineInstr *MI, int OpNo, raw_ostream &O); - - void EmitAlignment(unsigned NumBits, const GlobalValue *GV = 0) const; - - static const char *getRegisterName(unsigned RegNo); - }; - -} // end of anonymous namespace - -// Include the auto-generated portion of the assembly writer. 
-#include "HexagonGenAsmWriter.inc" - - void HexagonAsmPrinter::EmitAlignment(unsigned NumBits, const GlobalValue *GV) const { - - // For basic block level alignment, use falign. + // For basic block level alignment, use ".falign". if (!GV) { OutStreamer.EmitRawText(StringRef("\t.falign")); return; @@ -218,12 +72,19 @@ void HexagonAsmPrinter::EmitAlignment(unsigned NumBits, AsmPrinter::EmitAlignment(NumBits, GV); } -void HexagonAsmPrinter::printOp(const MachineOperand &MO, raw_ostream &O) { +void HexagonAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O) { + const MachineOperand &MO = MI->getOperand(OpNo); + switch (MO.getType()) { + default: + assert(0 && "<unknown operand type>"); + case MachineOperand::MO_Register: + O << HexagonInstPrinter::getRegisterName(MO.getReg()); + return; case MachineOperand::MO_Immediate: - dbgs() << "printOp() does not handle immediate values\n"; - abort(); - + O << MO.getImm(); + return; case MachineOperand::MO_MachineBasicBlock: O << *MO.getMBB()->getSymbol(); return; @@ -237,20 +98,14 @@ void HexagonAsmPrinter::printOp(const MachineOperand &MO, raw_ostream &O) { case MachineOperand::MO_ExternalSymbol: O << *GetExternalSymbolSymbol(MO.getSymbolName()); return; - case MachineOperand::MO_GlobalAddress: { + case MachineOperand::MO_GlobalAddress: // Computing the address of a global symbol, not calling it. O << *Mang->getSymbol(MO.getGlobal()); printOffset(MO.getOffset(), O); return; } - - default: - O << "<unknown operand type: " << MO.getType() << ">"; - return; - } } - // // isBlockOnlyReachableByFallthrough - We need to override this since the // default AsmPrinter does not print labels for any basic block that @@ -273,7 +128,7 @@ isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const { bool HexagonAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, - raw_ostream &OS) { + raw_ostream &OS) { // Does this asm operand have a single letter operand modifier? if (ExtraCode && ExtraCode[0]) { if (ExtraCode[1] != 0) return true; // Unknown modifier. @@ -341,154 +196,11 @@ void HexagonAsmPrinter::printPredicateOperand(const MachineInstr *MI, /// the current output stream. /// void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) { - SmallString<128> Str; - raw_svector_ostream O(Str); - - const MachineFunction* MF = MI->getParent()->getParent(); - const HexagonMachineFunctionInfo* MFI = - (const HexagonMachineFunctionInfo*) - MF->getInfo<HexagonMachineFunctionInfo>(); - + MCInst MCI; + HexagonLowerToMC(MI, MCI, *this); + OutStreamer.EmitInstruction(MCI); - // Print a brace for the beginning of the packet. 
- if (MFI->isStartPacket(MI)) { - O << "\t{" << '\n'; - } - - DEBUG( O << "// MI = " << *MI << '\n';); - - // Indent - O << "\t"; - - - if (MI->getOpcode() == Hexagon::ENDLOOP0) { - if (MFI->isEndPacket(MI) && MFI->isStartPacket(MI)) { - O << "\t{ nop }"; - } else { - O << "}"; - } - printInstruction(MI, O); - } else if (MI->getOpcode() == Hexagon::MPYI_rin) { - // Handle multipy with -ve constant on Hexagon: - // "$dst =- mpyi($src1, #$src2)" - printOperand(MI, 0, O); - O << " =- mpyi("; - printOperand(MI, 1, O); - O << ", #"; - printHexagonNegImmOperand(MI, 2, O); - O << ")"; - } else if (MI->getOpcode() == Hexagon::MEMw_ADDSUBi_indexed_MEM_V4) { - // - // Handle memw(Rs+u6:2) [+-]= #U5 - // - O << "\tmemw("; printHexagonMEMriOperand(MI, 0, O); O << ") "; - int addend = MI->getOperand(2).getImm(); - if (addend < 0) - O << "-= " << "#" << -addend << '\n'; - else - O << "+= " << "#" << addend << '\n'; - } else if (MI->getOpcode() == Hexagon::MEMw_ADDSUBi_MEM_V4) { - // - // Handle memw(Rs+u6:2) [+-]= #U5 - // - O << "\tmemw("; printHexagonMEMriOperand(MI, 0, O); O << ") "; - int addend = MI->getOperand(2).getImm(); - if (addend < 0) - O << "-= " << "#" << -addend << '\n'; - else - O << "+= " << "#" << addend << '\n'; - } else if (MI->getOpcode() == Hexagon::MEMh_ADDSUBi_indexed_MEM_V4) { - // - // Handle memh(Rs+u6:1) [+-]= #U5 - // - O << "\tmemh("; printHexagonMEMriOperand(MI, 0, O); O << ") "; - int addend = MI->getOperand(2).getImm(); - if (addend < 0) - O << "-= " << "#" << -addend << '\n'; - else - O << "+= " << "#" << addend << '\n'; - } else if (MI->getOpcode() == Hexagon::MEMh_ADDSUBi_MEM_V4) { - // - // Handle memh(Rs+u6:1) [+-]= #U5 - // - O << "\tmemh("; printHexagonMEMriOperand(MI, 0, O); O << ") "; - int addend = MI->getOperand(2).getImm(); - if (addend < 0) - O << "-= " << "#" << -addend << '\n'; - else - O << "+= " << "#" << addend << '\n'; - } else if (MI->getOpcode() == Hexagon::MEMb_ADDSUBi_indexed_MEM_V4) { - // - // Handle memb(Rs+u6:1) [+-]= #U5 - // - O << "\tmemb("; printHexagonMEMriOperand(MI, 0, O); O << ") "; - int addend = MI->getOperand(2).getImm(); - if (addend < 0) - O << "-= " << "#" << -addend << '\n'; - else - O << "+= " << "#" << addend << '\n'; - } else if (MI->getOpcode() == Hexagon::MEMb_ADDSUBi_MEM_V4) { - // - // Handle memb(Rs+u6:1) [+-]= #U5 - // - O << "\tmemb("; printHexagonMEMriOperand(MI, 0, O); O << ") "; - int addend = MI->getOperand(2).getImm(); - if (addend < 0) - O << "-= " << "#" << -addend << '\n'; - else - O << "+= " << "#" << addend << '\n'; - } else if (MI->getOpcode() == Hexagon::CMPbGTri_V4) { - // - // Handle Pd=cmpb.gt(Rs,#s8) - // - O << "\t"; - printRegister(MI->getOperand(0), false, O); - O << " = cmpb.gt("; - printRegister(MI->getOperand(1), false, O); - O << ", "; - int val = MI->getOperand(2).getImm() >> 24; - O << "#" << val << ")" << '\n'; - } else if (MI->getOpcode() == Hexagon::CMPhEQri_V4) { - // - // Handle Pd=cmph.eq(Rs,#8) - // - O << "\t"; - printRegister(MI->getOperand(0), false, O); - O << " = cmph.eq("; - printRegister(MI->getOperand(1), false, O); - O << ", "; - int val = MI->getOperand(2).getImm(); - assert((((0 <= val) && (val <= 127)) || - ((65408 <= val) && (val <= 65535))) && - "Not in correct range!"); - if (val >= 65408) val -= 65536; - O << "#" << val << ")" << '\n'; - } else if (MI->getOpcode() == Hexagon::CMPhGTri_V4) { - // - // Handle Pd=cmph.gt(Rs,#8) - // - O << "\t"; - printRegister(MI->getOperand(0), false, O); - O << " = cmph.gt("; - printRegister(MI->getOperand(1), false, O); - O << ", "; - int 
val = MI->getOperand(2).getImm() >> 16; - O << "#" << val << ")" << '\n'; - } else { - printInstruction(MI, O); - } - - // Print a brace for the end of the packet. - if (MFI->isEndPacket(MI) && MI->getOpcode() != Hexagon::ENDLOOP0) { - O << "\n\t}" << '\n'; - } - - if (AlignCalls && MI->getDesc().isCall()) { - O << "\n\t.falign" << "\n"; - } - - OutStreamer.EmitRawText(O.str()); return; } @@ -507,7 +219,7 @@ void HexagonAsmPrinter::printAddrModeBasePlusOffset(const MachineInstr *MI, const MachineOperand &MO1 = MI->getOperand(OpNo); const MachineOperand &MO2 = MI->getOperand(OpNo+1); - O << getRegisterName(MO1.getReg()) + O << HexagonInstPrinter::getRegisterName(MO1.getReg()) << " + #" << MO2.getImm(); } @@ -536,6 +248,31 @@ void HexagonAsmPrinter::printJumpTable(const MachineInstr *MI, int OpNo, O << *GetJTISymbol(MO.getIndex()); } +void HexagonAsmPrinter::printConstantPool(const MachineInstr *MI, int OpNo, + raw_ostream &O) { + const MachineOperand &MO = MI->getOperand(OpNo); + assert( (MO.getType() == MachineOperand::MO_ConstantPoolIndex) && + "Expecting constant pool index"); + + // Hexagon_TODO: Do we need name mangling? + O << *GetCPISymbol(MO.getIndex()); +} + +static MCInstPrinter *createHexagonMCInstPrinter(const Target &T, + unsigned SyntaxVariant, + const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI) { + if (SyntaxVariant == 0) + return(new HexagonInstPrinter(MAI, MII, MRI)); + else + return NULL; +} + extern "C" void LLVMInitializeHexagonAsmPrinter() { RegisterAsmPrinter<HexagonAsmPrinter> X(TheHexagonTarget); + + TargetRegistry::RegisterMCInstPrinter(TheHexagonTarget, + createHexagonMCInstPrinter); } diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.h b/lib/Target/Hexagon/HexagonAsmPrinter.h new file mode 100755 index 0000000..bc2af63 --- /dev/null +++ b/lib/Target/Hexagon/HexagonAsmPrinter.h @@ -0,0 +1,165 @@ +//===-- HexagonAsmPrinter.h - Print machine code to an Hexagon .s file ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Hexagon Assembly printer class. 
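The createHexagonMCInstPrinter/RegisterMCInstPrinter pairing above is the generic TargetRegistry hook for textual output. A sketch of the same wiring for a made-up target "Foo" (FooInstPrinter, createFooMCInstPrinter, and TheFooTarget are all hypothetical names; in a real target the generated HexagonGenAsmWriter.inc-style code would replace the stub printInst body):

    #include "llvm/ADT/StringRef.h"
    #include "llvm/MC/MCInst.h"
    #include "llvm/MC/MCInstPrinter.h"
    #include "llvm/MC/MCSubtargetInfo.h"
    #include "llvm/Support/TargetRegistry.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    namespace {
    class FooInstPrinter : public MCInstPrinter {  // hypothetical printer
    public:
      FooInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
                     const MCRegisterInfo &MRI)
        : MCInstPrinter(MAI, MII, MRI) {}
      virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) {
        O << "\t<opcode " << MI->getOpcode() << '>';  // stand-in for generated code
        printAnnotation(O, Annot);
      }
    };
    } // end anonymous namespace

    static MCInstPrinter *createFooMCInstPrinter(const Target &T,
                                                 unsigned SyntaxVariant,
                                                 const MCAsmInfo &MAI,
                                                 const MCInstrInfo &MII,
                                                 const MCRegisterInfo &MRI,
                                                 const MCSubtargetInfo &STI) {
      return SyntaxVariant == 0 ? new FooInstPrinter(MAI, MII, MRI) : 0;
    }

    // In the target's LLVMInitializeFooAsmPrinter():
    //   TargetRegistry::RegisterMCInstPrinter(TheFooTarget, createFooMCInstPrinter);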
+// +//===----------------------------------------------------------------------===// + +#ifndef HEXAGONASMPRINTER_H +#define HEXAGONASMPRINTER_H + +#include "Hexagon.h" +#include "HexagonTargetMachine.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + class HexagonAsmPrinter : public AsmPrinter { + const HexagonSubtarget *Subtarget; + + public: + explicit HexagonAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) + : AsmPrinter(TM, Streamer) { + Subtarget = &TM.getSubtarget<HexagonSubtarget>(); + } + + virtual const char *getPassName() const { + return "Hexagon Assembly Printer"; + } + + bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const; + + virtual void EmitInstruction(const MachineInstr *MI); + virtual void EmitAlignment(unsigned NumBits, + const GlobalValue *GV = 0) const; + + void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O); + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &OS); + bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &OS); + + /// printInstruction - This method is automatically generated by tablegen + /// from the instruction set description. This method returns true if the + /// machine instruction was sufficiently described to print it, otherwise it + /// returns false. + void printInstruction(const MachineInstr *MI, raw_ostream &O); + + // void printMachineInstruction(const MachineInstr *MI); + void printOp(const MachineOperand &MO, raw_ostream &O); + + /// printRegister - Print register according to target requirements. + /// + void printRegister(const MachineOperand &MO, bool R0AsZero, + raw_ostream &O) { + unsigned RegNo = MO.getReg(); + assert(TargetRegisterInfo::isPhysicalRegister(RegNo) && "Not physreg??"); + O << getRegisterName(RegNo); + } + + void printImmOperand(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O) { + int value = MI->getOperand(OpNo).getImm(); + O << value; + } + + void printNegImmOperand(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O) { + int value = MI->getOperand(OpNo).getImm(); + O << -value; + } + + void printMEMriOperand(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O) { + const MachineOperand &MO1 = MI->getOperand(OpNo); + const MachineOperand &MO2 = MI->getOperand(OpNo+1); + + O << getRegisterName(MO1.getReg()) + << " + #" + << (int) MO2.getImm(); + } + + void printFrameIndexOperand(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O) { + const MachineOperand &MO1 = MI->getOperand(OpNo); + const MachineOperand &MO2 = MI->getOperand(OpNo+1); + + O << getRegisterName(MO1.getReg()) + << ", #" + << MO2.getImm(); + } + + void printBranchOperand(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O) { + // Branches can take an immediate operand. This is used by the branch + // selection pass to print $+8, an eight byte displacement from the PC. 
+ if (MI->getOperand(OpNo).isImm()) { + O << "$+" << MI->getOperand(OpNo).getImm()*4; + } else { + printOp(MI->getOperand(OpNo), O); + } + } + + void printCallOperand(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O) { + } + + void printAbsAddrOperand(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O) { + } + + void printSymbolHi(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { + O << "#HI("; + if (MI->getOperand(OpNo).isImm()) { + printImmOperand(MI, OpNo, O); + } + else { + printOp(MI->getOperand(OpNo), O); + } + O << ")"; + } + + void printSymbolLo(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { + O << "#HI("; + if (MI->getOperand(OpNo).isImm()) { + printImmOperand(MI, OpNo, O); + } + else { + printOp(MI->getOperand(OpNo), O); + } + O << ")"; + } + + void printPredicateOperand(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O); + +#if 0 + void printModuleLevelGV(const GlobalVariable* GVar, raw_ostream &O); +#endif + + void printAddrModeBasePlusOffset(const MachineInstr *MI, int OpNo, + raw_ostream &O); + + void printGlobalOperand(const MachineInstr *MI, int OpNo, raw_ostream &O); + void printJumpTable(const MachineInstr *MI, int OpNo, raw_ostream &O); + void printConstantPool(const MachineInstr *MI, int OpNo, raw_ostream &O); + + static const char *getRegisterName(unsigned RegNo); + +#if 0 + void EmitStartOfAsmFile(Module &M); +#endif + }; + +} // end of llvm namespace + +#endif diff --git a/lib/Target/Hexagon/HexagonImmediates.td b/lib/Target/Hexagon/HexagonImmediates.td index 18589a2..e78bb79 100644 --- a/lib/Target/Hexagon/HexagonImmediates.td +++ b/lib/Target/Hexagon/HexagonImmediates.td @@ -10,211 +10,211 @@ // From IA64's InstrInfo file def s32Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def s16Imm : Operand<i32> { - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def s12Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def s11Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def s11_0Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def s11_1Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def s11_2Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def s11_3Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def s10Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def s9Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def s8Imm : Operand<i32> { // For now, we use a generic print function for all operands. 
- let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def s8Imm64 : Operand<i64> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def s6Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def s4Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def s4_0Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def s4_1Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def s4_2Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def s4_3Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def u64Imm : Operand<i64> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def u32Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def u16Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def u16_0Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def u16_1Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def u16_2Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def u11_3Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def u10Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def u9Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def u8Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def u7Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def u6Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def u6_0Imm : Operand<i32> { // For now, we use a generic print function for all operands. 
- let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def u6_1Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def u6_2Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def u6_3Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def u5Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def u4Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def u3Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def u2Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def u1Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def n8Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def m6Imm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonImmOperand"; + let PrintMethod = "printImmOperand"; } def nOneImm : Operand<i32> { // For now, we use a generic print function for all operands. - let PrintMethod = "printHexagonNOneImmOperand"; + let PrintMethod = "printNOneImmOperand"; } // @@ -494,7 +494,7 @@ def m6ImmPred : PatLeaf<(i32 imm), [{ //InN means negative integers in [-(2^N - 1), 0] def n8ImmPred : PatLeaf<(i32 imm), [{ - // n8ImmPred predicate - True if the immediate fits in a 8-bit unsigned + // n8ImmPred predicate - True if the immediate fits in a 8-bit signed // field. int64_t v = (int64_t)N->getSExtValue(); return (-255 <= v && v <= 0); diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp index 3d7ace5..77b3663 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -11,10 +11,10 @@ // //===----------------------------------------------------------------------===// +#include "Hexagon.h" #include "HexagonInstrInfo.h" #include "HexagonRegisterInfo.h" #include "HexagonSubtarget.h" -#include "Hexagon.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/DFAPacketizer.h" diff --git a/lib/Target/Hexagon/HexagonInstrInfo.td b/lib/Target/Hexagon/HexagonInstrInfo.td index f3c6622..b563ac3 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.td +++ b/lib/Target/Hexagon/HexagonInstrInfo.td @@ -40,24 +40,24 @@ def ADDRriU6_2 : ComplexPattern<i32, 2, "SelectADDRriU6_2", [frameindex], []>; // Address operands. 
def MEMrr : Operand<i32> { - let PrintMethod = "printHexagonMEMrrOperand"; + let PrintMethod = "printMEMrrOperand"; let MIOperandInfo = (ops IntRegs, IntRegs); } // Address operands def MEMri : Operand<i32> { - let PrintMethod = "printHexagonMEMriOperand"; + let PrintMethod = "printMEMriOperand"; let MIOperandInfo = (ops IntRegs, IntRegs); } def MEMri_s11_2 : Operand<i32>, ComplexPattern<i32, 2, "SelectMEMriS11_2", []> { - let PrintMethod = "printHexagonMEMriOperand"; + let PrintMethod = "printMEMriOperand"; let MIOperandInfo = (ops IntRegs, s11Imm); } def FrameIndex : Operand<i32> { - let PrintMethod = "printHexagonFrameIndexOperand"; + let PrintMethod = "printFrameIndexOperand"; let MIOperandInfo = (ops IntRegs, s11Imm); } diff --git a/lib/Target/Hexagon/HexagonMCInstLower.cpp b/lib/Target/Hexagon/HexagonMCInstLower.cpp new file mode 100644 index 0000000..fbb331b --- /dev/null +++ b/lib/Target/Hexagon/HexagonMCInstLower.cpp @@ -0,0 +1,93 @@ +//===- HexagonMCInstLower.cpp - Convert Hexagon MachineInstr to an MCInst -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains code to lower Hexagon MachineInstrs to their corresponding +// MCInst records. +// +//===----------------------------------------------------------------------===// + +#include "Hexagon.h" +#include "HexagonAsmPrinter.h" +#include "HexagonMachineFunctionInfo.h" +#include "llvm/Constants.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Target/Mangler.h" + +using namespace llvm; + +static MCOperand GetSymbolRef(const MachineOperand& MO, const MCSymbol* Symbol, + HexagonAsmPrinter& Printer) { + MCContext &MC = Printer.OutContext; + const MCExpr *ME; + + ME = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None, MC); + + if (!MO.isJTI() && MO.getOffset()) + ME = MCBinaryExpr::CreateAdd(ME, MCConstantExpr::Create(MO.getOffset(), MC), + MC); + + return (MCOperand::CreateExpr(ME)); +} + +// Create an MCInst from a MachineInstr +void llvm::HexagonLowerToMC(const MachineInstr* MI, MCInst& MCI, + HexagonAsmPrinter& AP) { + MCI.setOpcode(MI->getOpcode()); + + for (unsigned i = 0, e = MI->getNumOperands(); i < e; i++) { + const MachineOperand &MO = MI->getOperand(i); + MCOperand MCO; + + switch (MO.getType()) { + default: + MI->dump(); + assert(0 && "unknown operand type"); + case MachineOperand::MO_Register: + // Ignore all implicit register operands. + if (MO.isImplicit()) continue; + MCO = MCOperand::CreateReg(MO.getReg()); + break; + case MachineOperand::MO_FPImmediate: { + APFloat Val = MO.getFPImm()->getValueAPF(); + // FP immediates are used only when setting GPRs, so they may be dealt + // with like regular immediates from this point on. 
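The MO_FPImmediate case that continues below reinterprets the float's bits as an integer immediate via APFloat; a standalone demonstration of that conversion (hand-written; assumes LLVM headers):

    #include "llvm/ADT/APFloat.h"
    #include <cstdio>
    #include <stdint.h>

    int main() {
      llvm::APFloat Val(2.5);  // IEEE double 2.5
      uint64_t Bits = *Val.bitcastToAPInt().getRawData();  // raw IEEE-754 words
      std::printf("%016llx\n", (unsigned long long)Bits);  // 4004000000000000
      return 0;
    }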
+ MCO = MCOperand::CreateImm(*Val.bitcastToAPInt().getRawData()); + break; + } + case MachineOperand::MO_Immediate: + MCO = MCOperand::CreateImm(MO.getImm()); + break; + case MachineOperand::MO_MachineBasicBlock: + MCO = MCOperand::CreateExpr + (MCSymbolRefExpr::Create(MO.getMBB()->getSymbol(), + AP.OutContext)); + break; + case MachineOperand::MO_GlobalAddress: + MCO = GetSymbolRef(MO, AP.Mang->getSymbol(MO.getGlobal()), AP); + break; + case MachineOperand::MO_ExternalSymbol: + MCO = GetSymbolRef(MO, AP.GetExternalSymbolSymbol(MO.getSymbolName()), + AP); + break; + case MachineOperand::MO_JumpTableIndex: + MCO = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP); + break; + case MachineOperand::MO_ConstantPoolIndex: + MCO = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP); + break; + case MachineOperand::MO_BlockAddress: + MCO = GetSymbolRef(MO, AP.GetBlockAddressSymbol(MO.getBlockAddress()),AP); + break; + } + + MCI.addOperand(MCO); + } +} diff --git a/lib/Target/Hexagon/InstPrinter/CMakeLists.txt b/lib/Target/Hexagon/InstPrinter/CMakeLists.txt new file mode 100644 index 0000000..cb106a8 --- /dev/null +++ b/lib/Target/Hexagon/InstPrinter/CMakeLists.txt @@ -0,0 +1,5 @@ +add_llvm_library(LLVMHexagonAsmPrinter + HexagonInstPrinter.cpp + ) + +add_dependencies(LLVMHexagonAsmPrinter HexagonCommonTableGen) diff --git a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp new file mode 100644 index 0000000..ef36881 --- /dev/null +++ b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp @@ -0,0 +1,170 @@ +//===- HexagonInstPrinter.cpp - Convert Hexagon MCInst to assembly syntax -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class prints an Hexagon MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "asm-printer" +#include "Hexagon.h" +#include "HexagonAsmPrinter.h" +#include "HexagonInstPrinter.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/raw_ostream.h" +#include <cstdio> + +using namespace llvm; + +#define GET_INSTRUCTION_NAME +#include "HexagonGenAsmWriter.inc" + +StringRef HexagonInstPrinter::getOpcodeName(unsigned Opcode) const { + return MII.getName(Opcode); +} + +StringRef HexagonInstPrinter::getRegName(unsigned RegNo) const { + return getRegisterName(RegNo); +} + +void HexagonInstPrinter::printInst(const MCInst *MI, raw_ostream &O, + StringRef Annot) { + const char packetPadding[] = " "; + const char startPacket = '{', + endPacket = '}'; + // TODO: add outer HW loop when it's supported too. 
+ if (MI->getOpcode() == Hexagon::ENDLOOP0) { + MCInst Nop; + + O << packetPadding << startPacket << '\n'; + Nop.setOpcode(Hexagon::NOP); + printInstruction(&Nop, O); + O << packetPadding << endPacket; + } + + printInstruction(MI, O); + printAnnotation(O, Annot); +} + +void HexagonInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) const { + const MCOperand& MO = MI->getOperand(OpNo); + + if (MO.isReg()) { + O << getRegisterName(MO.getReg()); + } else if(MO.isExpr()) { + O << *MO.getExpr(); + } else if(MO.isImm()) { + printImmOperand(MI, OpNo, O); + } else { + assert(false && "Unknown operand"); + } +} + +void HexagonInstPrinter::printImmOperand + (const MCInst *MI, unsigned OpNo, raw_ostream &O) const { + O << MI->getOperand(OpNo).getImm(); +} + +void HexagonInstPrinter::printExtOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) const { + O << MI->getOperand(OpNo).getImm(); +} + +void HexagonInstPrinter::printUnsignedImmOperand + (const MCInst *MI, unsigned OpNo, raw_ostream &O) const { + O << MI->getOperand(OpNo).getImm(); +} + +void HexagonInstPrinter::printNegImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) const { + O << -MI->getOperand(OpNo).getImm(); +} + +void HexagonInstPrinter::printNOneImmOperand + (const MCInst *MI, unsigned OpNo, raw_ostream &O) const { + O << -1; +} + +void HexagonInstPrinter::printMEMriOperand + (const MCInst *MI, unsigned OpNo, raw_ostream &O) const { + const MCOperand& MO0 = MI->getOperand(OpNo); + const MCOperand& MO1 = MI->getOperand(OpNo + 1); + + O << getRegisterName(MO0.getReg()); + O << " + #" << MO1.getImm(); +} + +void HexagonInstPrinter::printFrameIndexOperand + (const MCInst *MI, unsigned OpNo, raw_ostream &O) const { + const MCOperand& MO0 = MI->getOperand(OpNo); + const MCOperand& MO1 = MI->getOperand(OpNo + 1); + + O << getRegisterName(MO0.getReg()) << ", #" << MO1.getImm(); +} + +void HexagonInstPrinter::printGlobalOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) const { + const MCOperand& MO = MI->getOperand(OpNo); + assert(MO.isExpr() && "Expecting expression"); + + printOperand(MI, OpNo, O); +} + +void HexagonInstPrinter::printJumpTable(const MCInst *MI, unsigned OpNo, + raw_ostream &O) const { + const MCOperand& MO = MI->getOperand(OpNo); + assert(MO.isExpr() && "Expecting expression"); + + printOperand(MI, OpNo, O); +} + +void HexagonInstPrinter::printConstantPool(const MCInst *MI, unsigned OpNo, + raw_ostream &O) const { + const MCOperand& MO = MI->getOperand(OpNo); + assert(MO.isExpr() && "Expecting expression"); + + printOperand(MI, OpNo, O); +} + +void HexagonInstPrinter::printBranchOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) const { + // Branches can take an immediate operand. This is used by the branch + // selection pass to print $+8, an eight byte displacement from the PC. + assert("Unknown branch operand."); +} + +void HexagonInstPrinter::printCallOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) const { +} + +void HexagonInstPrinter::printAbsAddrOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) const { +} + +void HexagonInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) const { +} + +void HexagonInstPrinter::printSymbol(const MCInst *MI, unsigned OpNo, + raw_ostream &O, bool hi) const { + const MCOperand& MO = MI->getOperand(OpNo); + + O << '#' << (hi? 
"HI": "LO") << '('; + if (MO.isImm()) { + O << '#'; + printOperand(MI, OpNo, O); + } else { + assert("Unknown symbol operand"); + printOperand(MI, OpNo, O); + } + O << ')'; +} diff --git a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h new file mode 100644 index 0000000..dad4334 --- /dev/null +++ b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h @@ -0,0 +1,73 @@ +//===-- HexagonInstPrinter.h - Convert Hexagon MCInst to assembly syntax --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class prints an Hexagon MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#ifndef HEXAGONINSTPRINTER_H +#define HEXAGONINSTPRINTER_H + +#include "llvm/MC/MCInstPrinter.h" + +namespace llvm { + class HexagonInstPrinter : public MCInstPrinter { + public: + explicit HexagonInstPrinter(const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} + + virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot); + virtual StringRef getOpcodeName(unsigned Opcode) const; + void printInstruction(const MCInst *MI, raw_ostream &O); + StringRef getRegName(unsigned RegNo) const; + static const char *getRegisterName(unsigned RegNo); + + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; + void printImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; + void printExtOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; + void printUnsignedImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) + const; + void printNegImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) + const; + void printNOneImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) + const; + void printMEMriOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) + const; + void printFrameIndexOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) + const; + void printBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) + const; + void printCallOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) + const; + void printAbsAddrOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) + const; + void printPredicateOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) + const; + void printGlobalOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) + const; + void printJumpTable(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; + + void printConstantPool(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; + + void printSymbolHi(const MCInst *MI, unsigned OpNo, raw_ostream &O) const + { printSymbol(MI, OpNo, O, true); } + void printSymbolLo(const MCInst *MI, unsigned OpNo, raw_ostream &O) const + { printSymbol(MI, OpNo, O, false); } + + bool isConstExtended(const MCInst *MI) const; + protected: + void printSymbol(const MCInst *MI, unsigned OpNo, raw_ostream &O, bool hi) + const; + }; + +} // end namespace llvm + +#endif diff --git a/lib/Target/CBackend/TargetInfo/LLVMBuild.txt b/lib/Target/Hexagon/InstPrinter/LLVMBuild.txt index 1b47d8e..8678401 100644 --- a/lib/Target/CBackend/TargetInfo/LLVMBuild.txt +++ b/lib/Target/Hexagon/InstPrinter/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Target/CBackend/TargetInfo/LLVMBuild.txt -----------*- Conf -*--===; +;===- 
./lib/Target/Hexagon/InstPrinter/LLVMBuild.txt -----------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; @@ -17,7 +17,7 @@ [component_0] type = Library -name = CBackendInfo -parent = CBackend -required_libraries = MC Support Target -add_to_library_groups = CBackend +name = HexagonAsmPrinter +parent = Hexagon +required_libraries = MC Support +add_to_library_groups = Hexagon diff --git a/lib/Target/CBackend/TargetInfo/Makefile b/lib/Target/Hexagon/InstPrinter/Makefile index d4d5e15..20331d8 100644 --- a/lib/Target/CBackend/TargetInfo/Makefile +++ b/lib/Target/Hexagon/InstPrinter/Makefile @@ -1,4 +1,4 @@ -##===- lib/Target/CBackend/TargetInfo/Makefile -------------*- Makefile -*-===## +##===- lib/Target/Hexagon/InstPrinter/Makefile ----------------------------===## # # The LLVM Compiler Infrastructure # @@ -7,9 +7,9 @@ # ##===----------------------------------------------------------------------===## LEVEL = ../../../.. -LIBRARYNAME = LLVMCBackendInfo +LIBRARYNAME = LLVMHexagonAsmPrinter -# Hack: we need to include 'main' target directory to grab private headers +# Hack: we need to include 'main' Hexagon target directory to grab private headers CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. include $(LEVEL)/Makefile.common diff --git a/lib/Target/Hexagon/LLVMBuild.txt b/lib/Target/Hexagon/LLVMBuild.txt index 84ea6a0..c6d419a 100644 --- a/lib/Target/Hexagon/LLVMBuild.txt +++ b/lib/Target/Hexagon/LLVMBuild.txt @@ -16,7 +16,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = TargetInfo MCTargetDesc +subdirectories = InstPrinter MCTargetDesc TargetInfo [component_0] type = TargetGroup @@ -28,5 +28,5 @@ has_asmprinter = 1 type = Library name = HexagonCodeGen parent = Hexagon -required_libraries = AsmPrinter CodeGen Core HexagonInfo SelectionDAG Support Target MC HexagonDesc +required_libraries = AsmPrinter CodeGen Core HexagonAsmPrinter HexagonDesc HexagonInfo MC SelectionDAG Support Target add_to_library_groups = Hexagon diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h index b18d23a..2238b1a 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h @@ -17,7 +17,6 @@ namespace llvm { class MCSubtargetInfo; class Target; -class StringRef; extern Target TheHexagonTarget; diff --git a/lib/Target/Hexagon/Makefile b/lib/Target/Hexagon/Makefile index 34bc68d..dc387c5 100644 --- a/lib/Target/Hexagon/Makefile +++ b/lib/Target/Hexagon/Makefile @@ -16,9 +16,8 @@ BUILT_SOURCES = HexagonGenRegisterInfo.inc \ HexagonGenAsmWriter.inc \ HexagonGenDAGISel.inc HexagonGenSubtargetInfo.inc \ HexagonGenCallingConv.inc \ - HexagonGenDFAPacketizer.inc \ - HexagonAsmPrinter.cpp + HexagonGenDFAPacketizer.inc -DIRS = TargetInfo MCTargetDesc +DIRS = InstPrinter TargetInfo MCTargetDesc include $(LEVEL)/Makefile.common diff --git a/lib/Target/LLVMBuild.txt b/lib/Target/LLVMBuild.txt index 5a42ca5..8ec5673 100644 --- a/lib/Target/LLVMBuild.txt +++ b/lib/Target/LLVMBuild.txt @@ -16,7 +16,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = ARM CBackend CellSPU CppBackend Hexagon MBlaze MSP430 Mips PTX PowerPC Sparc X86 XCore +subdirectories = ARM CellSPU CppBackend Hexagon MBlaze MSP430 Mips PTX PowerPC Sparc X86 XCore ; This is a special group whose required libraries are extended (by llvm-build) ; with the best execution engine (the native JIT, if 
available, or the diff --git a/lib/Target/MBlaze/AsmParser/MBlazeAsmLexer.cpp b/lib/Target/MBlaze/AsmParser/MBlazeAsmLexer.cpp index 7105b2e..59a1ed9 100644 --- a/lib/Target/MBlaze/AsmParser/MBlazeAsmLexer.cpp +++ b/lib/Target/MBlaze/AsmParser/MBlazeAsmLexer.cpp @@ -9,9 +9,6 @@ #include "MCTargetDesc/MBlazeBaseInfo.h" -#include "llvm/ADT/OwningPtr.h" -#include "llvm/ADT/SmallVector.h" - #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" diff --git a/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp b/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp index c1b003b..38fb0e8 100644 --- a/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp +++ b/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp @@ -18,9 +18,7 @@ #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" using namespace llvm; diff --git a/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h b/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h index 236583a..51ba7c3 100644 --- a/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h +++ b/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h @@ -21,15 +21,15 @@ namespace llvm { class MBlazeInstPrinter : public MCInstPrinter { public: - MBlazeInstPrinter(const MCAsmInfo &MAI, const MCRegisterInfo &MRI) - : MCInstPrinter(MAI, MRI) {} + MBlazeInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot); // Autogenerated by tblgen. void printInstruction(const MCInst *MI, raw_ostream &O); static const char *getRegisterName(unsigned RegNo); - static const char *getInstructionName(unsigned Opcode); void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O, const char *Modifier = 0); diff --git a/lib/Target/MBlaze/MBlazeAsmPrinter.cpp b/lib/Target/MBlaze/MBlazeAsmPrinter.cpp index c751dd8..55fffe3 100644 --- a/lib/Target/MBlaze/MBlazeAsmPrinter.cpp +++ b/lib/Target/MBlaze/MBlazeAsmPrinter.cpp @@ -38,7 +38,6 @@ #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" -#include "llvm/ADT/SmallString.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" diff --git a/lib/Target/MBlaze/MBlazeCallingConv.td b/lib/Target/MBlaze/MBlazeCallingConv.td index 4962573..00a4219 100644 --- a/lib/Target/MBlaze/MBlazeCallingConv.td +++ b/lib/Target/MBlaze/MBlazeCallingConv.td @@ -9,10 +9,6 @@ // This describes the calling conventions for MBlaze architecture. //===----------------------------------------------------------------------===// -/// CCIfSubtarget - Match if the current subtarget has a feature F. 
-class CCIfSubtarget<string F, CCAction A>: - CCIf<!strconcat("State.getTarget().getSubtarget<MBlazeSubtarget>().", F), A>; - //===----------------------------------------------------------------------===// // MBlaze ABI Calling Convention //===----------------------------------------------------------------------===// diff --git a/lib/Target/MBlaze/MBlazeELFWriterInfo.cpp b/lib/Target/MBlaze/MBlazeELFWriterInfo.cpp index 60a65bb..e3c7236 100644 --- a/lib/Target/MBlaze/MBlazeELFWriterInfo.cpp +++ b/lib/Target/MBlaze/MBlazeELFWriterInfo.cpp @@ -100,8 +100,8 @@ unsigned MBlazeELFWriterInfo::getAbsoluteLabelMachineRelTy() const { long int MBlazeELFWriterInfo::computeRelocation(unsigned SymOffset, unsigned RelOffset, unsigned RelTy) const { - if (RelTy == ELF::R_MICROBLAZE_32_PCREL || ELF::R_MICROBLAZE_64_PCREL) - return SymOffset - (RelOffset + 4); - - llvm_unreachable("computeRelocation unknown for this relocation type"); + assert((RelTy == ELF::R_MICROBLAZE_32_PCREL || + RelTy == ELF::R_MICROBLAZE_64_PCREL) && + "computeRelocation unknown for this relocation type"); + return SymOffset - (RelOffset + 4); } diff --git a/lib/Target/MBlaze/MBlazeELFWriterInfo.h b/lib/Target/MBlaze/MBlazeELFWriterInfo.h index 63bfc0d..a314eb7 100644 --- a/lib/Target/MBlaze/MBlazeELFWriterInfo.h +++ b/lib/Target/MBlaze/MBlazeELFWriterInfo.h @@ -17,6 +17,7 @@ #include "llvm/Target/TargetELFWriterInfo.h" namespace llvm { + class TargetMachine; class MBlazeELFWriterInfo : public TargetELFWriterInfo { public: diff --git a/lib/Target/MBlaze/MBlazeFrameLowering.cpp b/lib/Target/MBlaze/MBlazeFrameLowering.cpp index 6531064..d2f14a5 100644 --- a/lib/Target/MBlaze/MBlazeFrameLowering.cpp +++ b/lib/Target/MBlaze/MBlazeFrameLowering.cpp @@ -211,13 +211,13 @@ static void analyzeFrameIndexes(MachineFunction &MF) { static void interruptFrameLayout(MachineFunction &MF) { const Function *F = MF.getFunction(); - llvm::CallingConv::ID CallConv = F->getCallingConv(); + CallingConv::ID CallConv = F->getCallingConv(); // If this function is not using either the interrupt_handler // calling convention or the save_volatiles calling convention // then we don't need to do any additional frame layout. - if (CallConv != llvm::CallingConv::MBLAZE_INTR && - CallConv != llvm::CallingConv::MBLAZE_SVOL) + if (CallConv != CallingConv::MBLAZE_INTR && + CallConv != CallingConv::MBLAZE_SVOL) return; MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -228,7 +228,7 @@ static void interruptFrameLayout(MachineFunction &MF) { // Determine if the calling convention is the interrupt_handler // calling convention. Some pieces of the prologue and epilogue // only need to be emitted if we are lowering an interrupt handler. - bool isIntr = CallConv == llvm::CallingConv::MBLAZE_INTR; + bool isIntr = CallConv == CallingConv::MBLAZE_INTR; // Determine where to put prologue and epilogue additions MachineBasicBlock &MENT = MF.front(); @@ -347,8 +347,8 @@ void MBlazeFrameLowering::emitPrologue(MachineFunction &MF) const { MachineBasicBlock::iterator MBBI = MBB.begin(); DebugLoc DL = MBBI != MBB.end() ?
MBBI->getDebugLoc() : DebugLoc(); - llvm::CallingConv::ID CallConv = MF.getFunction()->getCallingConv(); - bool requiresRA = CallConv == llvm::CallingConv::MBLAZE_INTR; + CallingConv::ID CallConv = MF.getFunction()->getCallingConv(); + bool requiresRA = CallConv == CallingConv::MBLAZE_INTR; // Determine the correct frame layout determineFrameLayout(MF); @@ -393,8 +393,8 @@ void MBlazeFrameLowering::emitEpilogue(MachineFunction &MF, DebugLoc dl = MBBI->getDebugLoc(); - llvm::CallingConv::ID CallConv = MF.getFunction()->getCallingConv(); - bool requiresRA = CallConv == llvm::CallingConv::MBLAZE_INTR; + CallingConv::ID CallConv = MF.getFunction()->getCallingConv(); + bool requiresRA = CallConv == CallingConv::MBLAZE_INTR; // Get the FI's where RA and FP are saved. int FPOffset = MBlazeFI->getFPStackOffset(); @@ -431,8 +431,8 @@ processFunctionBeforeCalleeSavedScan(MachineFunction &MF, RegScavenger *RS) const { MachineFrameInfo *MFI = MF.getFrameInfo(); MBlazeFunctionInfo *MBlazeFI = MF.getInfo<MBlazeFunctionInfo>(); - llvm::CallingConv::ID CallConv = MF.getFunction()->getCallingConv(); - bool requiresRA = CallConv == llvm::CallingConv::MBLAZE_INTR; + CallingConv::ID CallConv = MF.getFunction()->getCallingConv(); + bool requiresRA = CallConv == CallingConv::MBLAZE_INTR; if (MFI->adjustsStack() || requiresRA) { MBlazeFI->setRAStackOffset(0); diff --git a/lib/Target/MBlaze/MBlazeISelLowering.cpp b/lib/Target/MBlaze/MBlazeISelLowering.cpp index 9ef6bb6..edfc335 100644 --- a/lib/Target/MBlaze/MBlazeISelLowering.cpp +++ b/lib/Target/MBlaze/MBlazeISelLowering.cpp @@ -1046,10 +1046,10 @@ LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, // If this function is using the interrupt_handler calling convention // then use "rtid r14, 0" otherwise use "rtsd r15, 8" - unsigned Ret = (CallConv == llvm::CallingConv::MBLAZE_INTR) ? MBlazeISD::IRet - : MBlazeISD::Ret; - unsigned Reg = (CallConv == llvm::CallingConv::MBLAZE_INTR) ? MBlaze::R14 - : MBlaze::R15; + unsigned Ret = (CallConv == CallingConv::MBLAZE_INTR) ? MBlazeISD::IRet + : MBlazeISD::Ret; + unsigned Reg = (CallConv == CallingConv::MBLAZE_INTR) ? 
MBlaze::R14 + : MBlaze::R15; SDValue DReg = DAG.getRegister(Reg, MVT::i32); if (Flag.getNode()) diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCAsmInfo.h b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCAsmInfo.h index 36bf655..977f9a6 100644 --- a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCAsmInfo.h +++ b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCAsmInfo.h @@ -14,7 +14,6 @@ #ifndef MBLAZETARGETASMINFO_H #define MBLAZETARGETASMINFO_H -#include "llvm/ADT/StringRef.h" #include "llvm/MC/MCAsmInfo.h" namespace llvm { diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.cpp b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.cpp index 5da0aa7..9a7549b 100644 --- a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.cpp +++ b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.cpp @@ -95,10 +95,11 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT, static MCInstPrinter *createMBlazeMCInstPrinter(const Target &T, unsigned SyntaxVariant, const MCAsmInfo &MAI, + const MCInstrInfo &MII, const MCRegisterInfo &MRI, const MCSubtargetInfo &STI) { if (SyntaxVariant == 0) - return new MBlazeInstPrinter(MAI, MRI); + return new MBlazeInstPrinter(MAI, MII, MRI); return 0; } diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h index 088d163..ae82c32 100644 --- a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h +++ b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h @@ -25,7 +25,6 @@ class MCObjectWriter; class MCSubtargetInfo; class Target; class StringRef; -class formatted_raw_ostream; class raw_ostream; extern Target TheMBlazeTarget; diff --git a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h index 3fd7ce0..d32eb3a 100644 --- a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h +++ b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h @@ -21,8 +21,9 @@ namespace llvm { class MSP430InstPrinter : public MCInstPrinter { public: - MSP430InstPrinter(const MCAsmInfo &MAI, const MCRegisterInfo &MRI) - : MCInstPrinter(MAI, MRI) {} + MSP430InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot); diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp index 5e5f3d8..2e328cb 100644 --- a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp +++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "MSP430MCAsmInfo.h" +#include "llvm/ADT/StringRef.h" using namespace llvm; void MSP430MCAsmInfo::anchor() { } diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h index 690fc19..e5c2fc2 100644 --- a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h +++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h @@ -14,10 +14,10 @@ #ifndef MSP430TARGETASMINFO_H #define MSP430TARGETASMINFO_H -#include "llvm/ADT/StringRef.h" #include "llvm/MC/MCAsmInfo.h" namespace llvm { + class StringRef; class Target; class MSP430MCAsmInfo : public MCAsmInfo { diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp index 8545055..c455f6b 100644 --- a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp +++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp @@ -61,10 +61,11 @@ static 
MCCodeGenInfo *createMSP430MCCodeGenInfo(StringRef TT, Reloc::Model RM, static MCInstPrinter *createMSP430MCInstPrinter(const Target &T, unsigned SyntaxVariant, const MCAsmInfo &MAI, + const MCInstrInfo &MII, const MCRegisterInfo &MRI, const MCSubtargetInfo &STI) { if (SyntaxVariant == 0) - return new MSP430InstPrinter(MAI, MRI); + return new MSP430InstPrinter(MAI, MII, MRI); return 0; } diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h index 35f2590..7f3505c 100644 --- a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h +++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h @@ -15,9 +15,7 @@ #define MSP430MCTARGETDESC_H namespace llvm { -class MCSubtargetInfo; class Target; -class StringRef; extern Target TheMSP430Target; diff --git a/lib/Target/Mangler.cpp b/lib/Target/Mangler.cpp index 53ad155..786a0c5 100644 --- a/lib/Target/Mangler.cpp +++ b/lib/Target/Mangler.cpp @@ -22,12 +22,13 @@ #include "llvm/ADT/Twine.h" using namespace llvm; -static bool isAcceptableChar(char C, bool AllowPeriod) { +static bool isAcceptableChar(char C, bool AllowPeriod, bool AllowUTF8) { if ((C < 'a' || C > 'z') && (C < 'A' || C > 'Z') && (C < '0' || C > '9') && C != '_' && C != '$' && C != '@' && - !(AllowPeriod && C == '.')) + !(AllowPeriod && C == '.') && + !(AllowUTF8 && (C & 0x80))) return false; return true; } @@ -56,8 +57,9 @@ static bool NameNeedsEscaping(StringRef Str, const MCAsmInfo &MAI) { // If any of the characters in the string is an unacceptable character, force // quotes. bool AllowPeriod = MAI.doesAllowPeriodsInName(); + bool AllowUTF8 = MAI.doesAllowUTF8(); for (unsigned i = 0, e = Str.size(); i != e; ++i) - if (!isAcceptableChar(Str[i], AllowPeriod)) + if (!isAcceptableChar(Str[i], AllowPeriod, AllowUTF8)) return true; return false; } @@ -74,8 +76,9 @@ static void appendMangledName(SmallVectorImpl<char> &OutName, StringRef Str, } bool AllowPeriod = MAI.doesAllowPeriodsInName(); + bool AllowUTF8 = MAI.doesAllowUTF8(); for (unsigned i = 0, e = Str.size(); i != e; ++i) { - if (!isAcceptableChar(Str[i], AllowPeriod)) + if (!isAcceptableChar(Str[i], AllowPeriod, AllowUTF8)) MangleLetter(OutName, Str[i]); else OutName.push_back(Str[i]); diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt index 13d17e4..0500c5d 100644 --- a/lib/Target/Mips/CMakeLists.txt +++ b/lib/Target/Mips/CMakeLists.txt @@ -2,12 +2,14 @@ set(LLVM_TARGET_DEFINITIONS Mips.td) tablegen(LLVM MipsGenRegisterInfo.inc -gen-register-info) tablegen(LLVM MipsGenInstrInfo.inc -gen-instr-info) +tablegen(LLVM MipsGenDisassemblerTables.inc -gen-disassembler) tablegen(LLVM MipsGenCodeEmitter.inc -gen-emitter) tablegen(LLVM MipsGenMCCodeEmitter.inc -gen-emitter -mc-emitter) tablegen(LLVM MipsGenAsmWriter.inc -gen-asm-writer) tablegen(LLVM MipsGenDAGISel.inc -gen-dag-isel) tablegen(LLVM MipsGenCallingConv.inc -gen-callingconv) tablegen(LLVM MipsGenSubtargetInfo.inc -gen-subtarget) +tablegen(LLVM MipsGenEDInfo.inc -gen-enhanced-disassembly-info) add_public_tablegen_target(MipsCommonTableGen) add_llvm_target(MipsCodeGen @@ -32,6 +34,7 @@ add_llvm_target(MipsCodeGen ) add_subdirectory(InstPrinter) +add_subdirectory(Disassembler) add_subdirectory(TargetInfo) add_subdirectory(MCTargetDesc) add_subdirectory(AsmParser) diff --git a/lib/Target/Mips/Disassembler/CMakeLists.txt b/lib/Target/Mips/Disassembler/CMakeLists.txt new file mode 100644 index 0000000..fe1dc75 --- /dev/null +++ b/lib/Target/Mips/Disassembler/CMakeLists.txt @@ -0,0 +1,15 @@ 
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + +add_llvm_library(LLVMMipsDisassembler + MipsDisassembler.cpp + ) + +# workaround for hanging compilation on MSVC8, MSVC9 and MSVC10 +if( MSVC_VERSION EQUAL 1400 OR MSVC_VERSION EQUAL 1500 OR MSVC_VERSION EQUAL 1600 ) +set_property( + SOURCE MipsDisassembler.cpp + PROPERTY COMPILE_FLAGS "/Od" + ) +endif() + +add_dependencies(LLVMMipsDisassembler MipsCommonTableGen) diff --git a/lib/Target/CBackend/LLVMBuild.txt b/lib/Target/Mips/Disassembler/LLVMBuild.txt index e64feb0..048ad0d 100644 --- a/lib/Target/CBackend/LLVMBuild.txt +++ b/lib/Target/Mips/Disassembler/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Target/CBackend/LLVMBuild.txt ----------------------*- Conf -*--===; +;===- ./lib/Target/Mips/Disassembler/LLVMBuild.txt --------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; @@ -15,17 +15,9 @@ ; ;===------------------------------------------------------------------------===; -[common] -subdirectories = TargetInfo - [component_0] -type = TargetGroup -name = CBackend -parent = Target - -[component_1] type = Library -name = CBackendCodeGen -parent = CBackend -required_libraries = Analysis CBackendInfo CodeGen Core MC Scalar Support Target TransformUtils -add_to_library_groups = CBackend +name = MipsDisassembler +parent = Mips +required_libraries = MC Support MipsInfo +add_to_library_groups = Mips diff --git a/lib/Target/CBackend/Makefile b/lib/Target/Mips/Disassembler/Makefile index bac3474..a78feba 100644 --- a/lib/Target/CBackend/Makefile +++ b/lib/Target/Mips/Disassembler/Makefile @@ -1,4 +1,4 @@ -##===- lib/Target/CBackend/Makefile ------------------------*- Makefile -*-===## +##===- lib/Target/Mips/Disassembler/Makefile ----------------*- Makefile -*-===## # # The LLVM Compiler Infrastructure # @@ -7,10 +7,10 @@ # ##===----------------------------------------------------------------------===## -LEVEL = ../../.. -LIBRARYNAME = LLVMCBackendCodeGen -DIRS = TargetInfo +LEVEL = ../../../.. +LIBRARYNAME = LLVMMipsDisassembler -include $(LEVEL)/Makefile.common +# Hack: we need to include 'main' Mips target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. -CompileCommonOpts += -Wno-format +include $(LEVEL)/Makefile.common diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp new file mode 100644 index 0000000..78dbc06 --- /dev/null +++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp @@ -0,0 +1,552 @@ +//===- MipsDisassembler.cpp - Disassembler for Mips -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is part of the Mips Disassembler. +// +//===----------------------------------------------------------------------===// + +#include "Mips.h" +#include "MipsSubtarget.h" +#include "llvm/MC/EDInstInfo.h" +#include "llvm/MC/MCDisassembler.h" +#include "llvm/Support/MemoryObject.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/MathExtras.h" + + +#include "MipsGenEDInfo.inc" + +using namespace llvm; + +typedef MCDisassembler::DecodeStatus DecodeStatus; + +/// MipsDisassembler - a disassembler class for Mips32.
+class MipsDisassembler : public MCDisassembler { +public: + /// Constructor - Initializes the disassembler. + /// + MipsDisassembler(const MCSubtargetInfo &STI, bool bigEndian) : + MCDisassembler(STI), isBigEndian(bigEndian) { + } + + ~MipsDisassembler() { + } + + /// getInstruction - See MCDisassembler. + DecodeStatus getInstruction(MCInst &instr, + uint64_t &size, + const MemoryObject &region, + uint64_t address, + raw_ostream &vStream, + raw_ostream &cStream) const; + + /// getEDInfo - See MCDisassembler. + const EDInstInfo *getEDInfo() const; + +private: + bool isBigEndian; +}; + + +/// Mips64Disassembler - a disassembler class for Mips64. +class Mips64Disassembler : public MCDisassembler { +public: + /// Constructor - Initializes the disassembler. + /// + Mips64Disassembler(const MCSubtargetInfo &STI, bool bigEndian) : + MCDisassembler(STI), isBigEndian(bigEndian) { + } + + ~Mips64Disassembler() { + } + + /// getInstruction - See MCDisassembler. + DecodeStatus getInstruction(MCInst &instr, + uint64_t &size, + const MemoryObject &region, + uint64_t address, + raw_ostream &vStream, + raw_ostream &cStream) const; + + /// getEDInfo - See MCDisassembler. + const EDInstInfo *getEDInfo() const; + +private: + bool isBigEndian; +}; + +const EDInstInfo *MipsDisassembler::getEDInfo() const { + return instInfoMips; +} + +const EDInstInfo *Mips64Disassembler::getEDInfo() const { + return instInfoMips; +} + +// Decoder tables for Mips registers +static const unsigned CPURegsTable[] = { + Mips::ZERO, Mips::AT, Mips::V0, Mips::V1, + Mips::A0, Mips::A1, Mips::A2, Mips::A3, + Mips::T0, Mips::T1, Mips::T2, Mips::T3, + Mips::T4, Mips::T5, Mips::T6, Mips::T7, + Mips::S0, Mips::S1, Mips::S2, Mips::S3, + Mips::S4, Mips::S5, Mips::S6, Mips::S7, + Mips::T8, Mips::T9, Mips::K0, Mips::K1, + Mips::GP, Mips::SP, Mips::FP, Mips::RA +}; + +static const unsigned FGR32RegsTable[] = { + Mips::F0, Mips::F1, Mips::F2, Mips::F3, + Mips::F4, Mips::F5, Mips::F6, Mips::F7, + Mips::F8, Mips::F9, Mips::F10, Mips::F11, + Mips::F12, Mips::F13, Mips::F14, Mips::F15, + Mips::F16, Mips::F17, Mips::F18, Mips::F19, + Mips::F20, Mips::F21, Mips::F22, Mips::F23, + Mips::F24, Mips::F25, Mips::F26, Mips::F27, + Mips::F28, Mips::F29, Mips::F30, Mips::F31 +}; + +static const unsigned CPU64RegsTable[] = { + Mips::ZERO_64, Mips::AT_64, Mips::V0_64, Mips::V1_64, + Mips::A0_64, Mips::A1_64, Mips::A2_64, Mips::A3_64, + Mips::T0_64, Mips::T1_64, Mips::T2_64, Mips::T3_64, + Mips::T4_64, Mips::T5_64, Mips::T6_64, Mips::T7_64, + Mips::S0_64, Mips::S1_64, Mips::S2_64, Mips::S3_64, + Mips::S4_64, Mips::S5_64, Mips::S6_64, Mips::S7_64, + Mips::T8_64, Mips::T9_64, Mips::K0_64, Mips::K1_64, + Mips::GP_64, Mips::SP_64, Mips::FP_64, Mips::RA_64 +}; + +static const unsigned FGR64RegsTable[] = { + Mips::D0_64, Mips::D1_64, Mips::D2_64, Mips::D3_64, + Mips::D4_64, Mips::D5_64, Mips::D6_64, Mips::D7_64, + Mips::D8_64, Mips::D9_64, Mips::D10_64, Mips::D11_64, + Mips::D12_64, Mips::D13_64, Mips::D14_64, Mips::D15_64, + Mips::D16_64, Mips::D17_64, Mips::D18_64, Mips::D19_64, + Mips::D20_64, Mips::D21_64, Mips::D22_64, Mips::D23_64, + Mips::D24_64, Mips::D25_64, Mips::D26_64, Mips::D27_64, + Mips::D28_64, Mips::D29_64, Mips::D30_64, Mips::D31_64 +}; + +static const unsigned AFGR64RegsTable[] = { + Mips::D0, Mips::D1, Mips::D2, Mips::D3, + Mips::D4, Mips::D5, Mips::D6, Mips::D7, + Mips::D8, Mips::D9, Mips::D10, Mips::D11, + Mips::D12, Mips::D13, Mips::D14, Mips::D15 +}; + +// Forward declare these because the autogenerated code will reference them.
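+// Each callback validates the encoded field it is handed and, on success,
+// appends the decoded register or immediate operand to the MCInst.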
+// Definitions are further down. +static DecodeStatus DecodeCPU64RegsRegisterClass(MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeCPURegsRegisterClass(MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeFGR64RegisterClass(MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeFGR32RegisterClass(MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeCCRRegisterClass(MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeHWRegs64RegisterClass(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeBranchTarget(MCInst &Inst, + unsigned Offset, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeBC1(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + + +static DecodeStatus DecodeJumpTarget(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeMem(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeFMem(MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeSimm16(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeCondCode(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeInsSize(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeExtSize(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +namespace llvm { +extern Target TheMipselTarget, TheMipsTarget, TheMips64Target, + TheMips64elTarget; +} + +static MCDisassembler *createMipsDisassembler( + const Target &T, + const MCSubtargetInfo &STI) { + return new MipsDisassembler(STI,true); +} + +static MCDisassembler *createMipselDisassembler( + const Target &T, + const MCSubtargetInfo &STI) { + return new MipsDisassembler(STI,false); +} + +static MCDisassembler *createMips64Disassembler( + const Target &T, + const MCSubtargetInfo &STI) { + return new Mips64Disassembler(STI,true); +} + +static MCDisassembler *createMips64elDisassembler( + const Target &T, + const MCSubtargetInfo &STI) { + return new Mips64Disassembler(STI, false); +} + +extern "C" void LLVMInitializeMipsDisassembler() { + // Register the disassembler. 
+ TargetRegistry::RegisterMCDisassembler(TheMipsTarget, + createMipsDisassembler); + TargetRegistry::RegisterMCDisassembler(TheMipselTarget, + createMipselDisassembler); + TargetRegistry::RegisterMCDisassembler(TheMips64Target, + createMips64Disassembler); + TargetRegistry::RegisterMCDisassembler(TheMips64elTarget, + createMips64elDisassembler); +} + + +#include "MipsGenDisassemblerTables.inc" + + /// readInstruction - read four bytes from the MemoryObject + /// and return the 32-bit word assembled according to the given endianness +static DecodeStatus readInstruction32(const MemoryObject &region, + uint64_t address, + uint64_t &size, + uint32_t &insn, + bool isBigEndian) { + uint8_t Bytes[4]; + + // We want to read exactly 4 Bytes of data. + if (region.readBytes(address, 4, (uint8_t*)Bytes, NULL) == -1) { + size = 0; + return MCDisassembler::Fail; + } + + if (isBigEndian) { + // Encoded as a big-endian 32-bit word in the stream. + insn = (Bytes[3] << 0) | + (Bytes[2] << 8) | + (Bytes[1] << 16) | + (Bytes[0] << 24); + } + else { + // Encoded as a little-endian 32-bit word in the stream. + insn = (Bytes[0] << 0) | + (Bytes[1] << 8) | + (Bytes[2] << 16) | + (Bytes[3] << 24); + } + + return MCDisassembler::Success; +} + +DecodeStatus +MipsDisassembler::getInstruction(MCInst &instr, + uint64_t &Size, + const MemoryObject &Region, + uint64_t Address, + raw_ostream &vStream, + raw_ostream &cStream) const { + uint32_t Insn; + + DecodeStatus Result = readInstruction32(Region, Address, Size, + Insn, isBigEndian); + if (Result == MCDisassembler::Fail) + return MCDisassembler::Fail; + + // Calling the auto-generated decoder function. + Result = decodeMipsInstruction32(instr, Insn, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return Result; + } + + return MCDisassembler::Fail; +} + +DecodeStatus +Mips64Disassembler::getInstruction(MCInst &instr, + uint64_t &Size, + const MemoryObject &Region, + uint64_t Address, + raw_ostream &vStream, + raw_ostream &cStream) const { + uint32_t Insn; + + DecodeStatus Result = readInstruction32(Region, Address, Size, + Insn, isBigEndian); + if (Result == MCDisassembler::Fail) + return MCDisassembler::Fail; + + // Calling the auto-generated decoder function.
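+ // decodeMips64Instruction32 is emitted by TableGen from the instruction
+ // records tagged with DecoderNamespace = "Mips64"; the 32-bit tables are
+ // tried below as a fallback.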
+ Result = decodeMips64Instruction32(instr, Insn, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return Result; + } + // If we fail to decode in Mips64 decoder space we can try in Mips32 + Result = decodeMipsInstruction32(instr, Insn, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return Result; + } + + return MCDisassembler::Fail; +} + +static DecodeStatus DecodeCPU64RegsRegisterClass(MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder) { + + if (RegNo > 31) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::CreateReg(CPU64RegsTable[RegNo])); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeCPURegsRegisterClass(MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder) { + if (RegNo > 31) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::CreateReg(CPURegsTable[RegNo])); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeFGR64RegisterClass(MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder) { + if (RegNo > 31) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::CreateReg(FGR64RegsTable[RegNo])); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeFGR32RegisterClass(MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder) { + if (RegNo > 31) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::CreateReg(FGR32RegsTable[RegNo])); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeCCRRegisterClass(MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder) { + Inst.addOperand(MCOperand::CreateReg(RegNo)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeMem(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + int Offset = SignExtend32<16>(Insn & 0xffff); + int Reg = (int)fieldFromInstruction32(Insn, 16, 5); + int Base = (int)fieldFromInstruction32(Insn, 21, 5); + + if(Inst.getOpcode() == Mips::SC){ + Inst.addOperand(MCOperand::CreateReg(CPURegsTable[Reg])); + } + + Inst.addOperand(MCOperand::CreateReg(CPURegsTable[Reg])); + Inst.addOperand(MCOperand::CreateReg(CPURegsTable[Base])); + Inst.addOperand(MCOperand::CreateImm(Offset)); + + return MCDisassembler::Success; +} + +static DecodeStatus DecodeFMem(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + int Offset = SignExtend32<16>(Insn & 0xffff); + int Reg = (int)fieldFromInstruction32(Insn, 16, 5); + int Base = (int)fieldFromInstruction32(Insn, 21, 5); + + Inst.addOperand(MCOperand::CreateReg(FGR64RegsTable[Reg])); + Inst.addOperand(MCOperand::CreateReg(CPURegsTable[Base])); + Inst.addOperand(MCOperand::CreateImm(Offset)); + + return MCDisassembler::Success; +} + + +static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder) { + // Currently only hardware register 29 is supported. 
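+ // (Hardware register 29 is the MIPS UserLocal register read via rdhwr,
+ // commonly holding the TLS pointer.)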
+ if (RegNo != 29) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateReg(Mips::HWR29)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeCondCode(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + int CondCode = Insn & 0xf; + Inst.addOperand(MCOperand::CreateImm(CondCode)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder) { + if (RegNo > 31) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::CreateReg(AFGR64RegsTable[RegNo])); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeHWRegs64RegisterClass(MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder) { + //Currently only hardware register 29 is supported + if (RegNo != 29) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateReg(Mips::HWR29)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeBranchTarget(MCInst &Inst, + unsigned Offset, + uint64_t Address, + const void *Decoder) { + unsigned BranchOffset = Offset & 0xffff; + BranchOffset = SignExtend32<18>(BranchOffset << 2) + 4; + Inst.addOperand(MCOperand::CreateImm(BranchOffset)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeBC1(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + unsigned BranchOffset = Insn & 0xffff; + BranchOffset = SignExtend32<18>(BranchOffset << 2) + 4; + Inst.addOperand(MCOperand::CreateImm(BranchOffset)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeJumpTarget(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + + unsigned JumpOffset = fieldFromInstruction32(Insn, 0, 26) << 2; + Inst.addOperand(MCOperand::CreateImm(JumpOffset)); + return MCDisassembler::Success; +} + + +static DecodeStatus DecodeSimm16(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + Inst.addOperand(MCOperand::CreateImm(SignExtend32<16>(Insn))); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeInsSize(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + // First we need to grab the pos(lsb) from MCInst. 
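+ // The raw field value is the msb of the inserted bitfield, so the size
+ // operand computed below is msb - lsb + 1.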
+ int Pos = Inst.getOperand(2).getImm(); + int Size = (int) Insn - Pos + 1; + Inst.addOperand(MCOperand::CreateImm(SignExtend32<16>(Size))); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeExtSize(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + int Size = (int) Insn + 1; + Inst.addOperand(MCOperand::CreateImm(SignExtend32<16>(Size))); + return MCDisassembler::Success; +} diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp index 2917a89..6886b17 100644 --- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp +++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp @@ -16,12 +16,12 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; -#define GET_INSTRUCTION_NAME #include "MipsGenAsmWriter.inc" const char* Mips::MipsFCCToString(Mips::CondCode CC) { @@ -62,10 +62,6 @@ const char* Mips::MipsFCCToString(Mips::CondCode CC) { llvm_unreachable("Impossible condition code!"); } -StringRef MipsInstPrinter::getOpcodeName(unsigned Opcode) const { - return getInstructionName(Opcode); -} - void MipsInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { OS << '$' << StringRef(getRegisterName(RegNo)).lower(); } diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h index acd761d..76b839b 100644 --- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h +++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h @@ -77,15 +77,14 @@ class TargetMachine; class MipsInstPrinter : public MCInstPrinter { public: - MipsInstPrinter(const MCAsmInfo &MAI, const MCRegisterInfo &MRI) : - MCInstPrinter(MAI, MRI) {} + MipsInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} // Autogenerated by tblgen. 
void printInstruction(const MCInst *MI, raw_ostream &O); - static const char *getInstructionName(unsigned Opcode); static const char *getRegisterName(unsigned RegNo); - virtual StringRef getOpcodeName(unsigned Opcode) const; virtual void printRegName(raw_ostream &OS, unsigned RegNo) const; virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot); diff --git a/lib/Target/Mips/LLVMBuild.txt b/lib/Target/Mips/LLVMBuild.txt index abbed8c..a95d6bc 100644 --- a/lib/Target/Mips/LLVMBuild.txt +++ b/lib/Target/Mips/LLVMBuild.txt @@ -16,7 +16,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = AsmParser InstPrinter MCTargetDesc TargetInfo +subdirectories = AsmParser Disassembler InstPrinter MCTargetDesc TargetInfo [component_0] type = TargetGroup @@ -24,6 +24,7 @@ name = Mips parent = Target has_asmparser = 1 has_asmprinter = 1 +has_disassembler = 1 has_jit = 1 [component_1] diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp index 9d5a2f1..9b4caf6 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp @@ -14,19 +14,13 @@ #include "MipsFixupKinds.h" #include "MCTargetDesc/MipsMCTargetDesc.h" -#include "llvm/ADT/Twine.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCDirectives.h" #include "llvm/MC/MCELFObjectWriter.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCMachObjectWriter.h" +#include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCObjectWriter.h" -#include "llvm/MC/MCSectionELF.h" -#include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Object/MachOFormat.h" -#include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -61,7 +55,7 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { case Mips::fixup_Mips_HI16: case Mips::fixup_Mips_GOT_Local: // Get the higher 16-bits. Also add 1 if bit 15 is 1. - Value = (Value >> 16) + ((Value & 0x8000) != 0); + Value = ((Value + 0x8000) >> 16) & 0xffff; break; } @@ -72,13 +66,15 @@ namespace { class MipsAsmBackend : public MCAsmBackend { Triple::OSType OSType; bool IsLittle; // Big or little endian + bool Is64Bit; // 32 or 64 bit words public: - MipsAsmBackend(const Target &T, Triple::OSType _OSType, bool _isLittle) : - MCAsmBackend(), OSType(_OSType), IsLittle(_isLittle) {} + MipsAsmBackend(const Target &T, Triple::OSType _OSType, + bool _isLittle, bool _is64Bit) + :MCAsmBackend(), OSType(_OSType), IsLittle(_isLittle), Is64Bit(_is64Bit) {} MCObjectWriter *createObjectWriter(raw_ostream &OS) const { - return createMipsELFObjectWriter(OS, OSType, IsLittle); + return createMipsELFObjectWriter(OS, OSType, IsLittle, Is64Bit); } /// ApplyFixup - Apply the \arg Value for given \arg Fixup into the provided @@ -120,7 +116,7 @@ public: } uint64_t Mask = ((uint64_t)(-1) >> (64 - getFixupKindInfo(Kind).TargetSize)); - CurVal = (CurVal & ~Mask) | ((CurVal + Value) & Mask); + CurVal |= Value & Mask; // Write out the fixed up bytes back to the code/data bits. 
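+ // (The code emitter leaves the fixup's target bits zero in the encoded
+ // instruction, which is why the plain OR-merge above is sufficient.)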
for (unsigned i = 0; i != NumBytes; ++i) { @@ -212,17 +208,28 @@ public: bool writeNopData(uint64_t Count, MCObjectWriter *OW) const { return true; } -}; +}; // class MipsAsmBackend } // namespace // MCAsmBackend -MCAsmBackend *llvm::createMipsAsmBackendEL(const Target &T, StringRef TT) { +MCAsmBackend *llvm::createMipsAsmBackendEL32(const Target &T, StringRef TT) { return new MipsAsmBackend(T, Triple(TT).getOS(), - /*IsLittle*/true); + /*IsLittle*/true, /*Is64Bit*/false); } -MCAsmBackend *llvm::createMipsAsmBackendEB(const Target &T, StringRef TT) { +MCAsmBackend *llvm::createMipsAsmBackendEB32(const Target &T, StringRef TT) { return new MipsAsmBackend(T, Triple(TT).getOS(), - /*IsLittle*/false); + /*IsLittle*/false, /*Is64Bit*/false); } + +MCAsmBackend *llvm::createMipsAsmBackendEL64(const Target &T, StringRef TT) { + return new MipsAsmBackend(T, Triple(TT).getOS(), + /*IsLittle*/true, /*Is64Bit*/true); +} + +MCAsmBackend *llvm::createMipsAsmBackendEB64(const Target &T, StringRef TT) { + return new MipsAsmBackend(T, Triple(TT).getOS(), + /*IsLittle*/false, /*Is64Bit*/true); +} + diff --git a/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h index 34e3a6e..fb1c5ce 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h +++ b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h @@ -14,7 +14,9 @@ #ifndef MIPSBASEINFO_H #define MIPSBASEINFO_H +#include "MipsFixupKinds.h" #include "MipsMCTargetDesc.h" +#include "llvm/MC/MCExpr.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/ErrorHandling.h" @@ -198,6 +200,34 @@ inline static unsigned getMipsRegisterNumbering(unsigned RegEnum) default: llvm_unreachable("Unknown register number!"); } } + +inline static std::pair<const MCSymbolRefExpr*, int64_t> +MipsGetSymAndOffset(const MCFixup &Fixup) { + MCFixupKind FixupKind = Fixup.getKind(); + + if ((FixupKind < FirstTargetFixupKind) || + (FixupKind >= MCFixupKind(Mips::LastTargetFixupKind))) + return std::make_pair((const MCSymbolRefExpr*)0, (int64_t)0); + + const MCExpr *Expr = Fixup.getValue(); + MCExpr::ExprKind Kind = Expr->getKind(); + + if (Kind == MCExpr::Binary) { + const MCBinaryExpr *BE = static_cast<const MCBinaryExpr*>(Expr); + const MCExpr *LHS = BE->getLHS(); + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(BE->getRHS()); + + if ((LHS->getKind() != MCExpr::SymbolRef) || !CE) + return std::make_pair((const MCSymbolRefExpr*)0, (int64_t)0); + + return std::make_pair(cast<MCSymbolRefExpr>(LHS), CE->getValue()); + } + + if (Kind != MCExpr::SymbolRef) + return std::make_pair((const MCSymbolRefExpr*)0, (int64_t)0); + + return std::make_pair(cast<MCSymbolRefExpr>(Expr), 0); +} } #endif diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index 076a6a8..2091bec 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -7,20 +7,34 @@ // //===----------------------------------------------------------------------===// +#include "MCTargetDesc/MipsBaseInfo.h" #include "MCTargetDesc/MipsFixupKinds.h" #include "MCTargetDesc/MipsMCTargetDesc.h" +#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCValue.h" #include "llvm/Support/ErrorHandling.h" +#include <list> using namespace llvm; namespace { + struct RelEntry { + RelEntry(const ELFRelocationEntry &R, const MCSymbol *S, int64_t O) : + Reloc(R), Sym(S), Offset(O) {} + 
ELFRelocationEntry Reloc; + const MCSymbol *Sym; + int64_t Offset; + }; + + typedef std::list<RelEntry> RelLs; + typedef RelLs::iterator RelLsIter; + + class MipsELFObjectWriter : public MCELFObjectTargetWriter { public: - MipsELFObjectWriter(uint8_t OSABI); + MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI); virtual ~MipsELFObjectWriter(); @@ -33,18 +47,28 @@ namespace { const MCFragment &F, const MCFixup &Fixup, bool IsPCRel) const; + virtual void sortRelocs(const MCAssembler &Asm, + std::vector<ELFRelocationEntry> &Relocs); }; } -MipsELFObjectWriter::MipsELFObjectWriter(uint8_t OSABI) - : MCELFObjectTargetWriter(/*Is64Bit*/ false, OSABI, ELF::EM_MIPS, +MipsELFObjectWriter::MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI) + : MCELFObjectTargetWriter(_is64Bit, OSABI, ELF::EM_MIPS, /*HasRelocationAddend*/ false) {} MipsELFObjectWriter::~MipsELFObjectWriter() {} -// FIXME: get the real EABI Version from the Triple. +// FIXME: get the real EABI Version from the Subtarget class. unsigned MipsELFObjectWriter::getEFlags() const { - return ELF::EF_MIPS_NOREORDER | ELF::EF_MIPS_ARCH_32R2; + + // FIXME: We can't tell if we are PIC (dynamic) or CPIC (static) + unsigned Flag = ELF::EF_MIPS_NOREORDER; + + if (is64Bit()) + Flag |= ELF::EF_MIPS_ARCH_64R2; + else + Flag |= ELF::EF_MIPS_ARCH_32R2; + return Flag; } const MCSymbol *MipsELFObjectWriter::ExplicitRelSym(const MCAssembler &Asm, @@ -129,8 +153,97 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target, return Type; } -MCObjectWriter *llvm::createMipsELFObjectWriter(raw_ostream &OS, uint8_t OSABI, - bool IsLittleEndian) { - MCELFObjectTargetWriter *MOTW = new MipsELFObjectWriter(OSABI); +// Return true if R is either a GOT16 against a local symbol or HI16. +static bool NeedsMatchingLo(const MCAssembler &Asm, const RelEntry &R) { + if (!R.Sym) + return false; + + MCSymbolData &SD = Asm.getSymbolData(R.Sym->AliasedSymbol()); + + return ((R.Reloc.Type == ELF::R_MIPS_GOT16) && !SD.isExternal()) || + (R.Reloc.Type == ELF::R_MIPS_HI16); +} + +static bool HasMatchingLo(const MCAssembler &Asm, RelLsIter I, RelLsIter Last) { + if (I == Last) + return false; + + RelLsIter Hi = I++; + + return (I->Reloc.Type == ELF::R_MIPS_LO16) && (Hi->Sym == I->Sym) && + (Hi->Offset == I->Offset); +} + +static bool HasSameSymbol(const RelEntry &R0, const RelEntry &R1) { + return R0.Sym == R1.Sym; +} + +static int CompareOffset(const RelEntry &R0, const RelEntry &R1) { + return (R0.Offset > R1.Offset) ? 1 : ((R0.Offset == R1.Offset) ? 0 : -1); +} + +void MipsELFObjectWriter::sortRelocs(const MCAssembler &Asm, + std::vector<ELFRelocationEntry> &Relocs) { + // Call the default function first. Relocations are sorted in descending + // order of r_offset. + MCELFObjectTargetWriter::sortRelocs(Asm, Relocs); + + RelLs RelocLs; + std::vector<RelLsIter> Unmatched; + + // Fill RelocLs. Traverse Relocs backwards so that relocations in RelocLs + // are in ascending order of r_offset. + for (std::vector<ELFRelocationEntry>::reverse_iterator R = Relocs.rbegin(); + R != Relocs.rend(); ++R) { + std::pair<const MCSymbolRefExpr*, int64_t> P = + MipsGetSymAndOffset(*R->Fixup); + RelocLs.push_back(RelEntry(*R, P.first ? &P.first->getSymbol() : 0, + P.second)); + } + + // Get list of unmatched HI16 and GOT16. + for (RelLsIter R = RelocLs.begin(); R != RelocLs.end(); ++R) + if (NeedsMatchingLo(Asm, *R) && !HasMatchingLo(Asm, R, --RelocLs.end())) + Unmatched.push_back(R); + + // Insert unmatched HI16 and GOT16 immediately before their matching LO16.
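+ // The MIPS psABI computes the addend of an R_MIPS_HI16 (and of an
+ // R_MIPS_GOT16 against a local symbol) from the R_MIPS_LO16 entry that
+ // follows it, so each pair has to end up adjacent in the relocation table.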
+ for (std::vector<RelLsIter>::iterator U = Unmatched.begin(); + U != Unmatched.end(); ++U) { + RelLsIter LoPos = RelocLs.end(), HiPos = *U; + bool MatchedLo = false; + + for (RelLsIter R = RelocLs.begin(); R != RelocLs.end(); ++R) { + if ((R->Reloc.Type == ELF::R_MIPS_LO16) && HasSameSymbol(*HiPos, *R) && + (CompareOffset(*R, *HiPos) >= 0) && + ((LoPos == RelocLs.end()) || ((CompareOffset(*R, *LoPos) < 0)) || + (!MatchedLo && !CompareOffset(*R, *LoPos)))) + LoPos = R; + + MatchedLo = NeedsMatchingLo(Asm, *R) && + HasMatchingLo(Asm, R, --RelocLs.end()); + } + + // If a matching LoPos was found, move HiPos and insert it before LoPos. + // Make the offsets of HiPos and LoPos match. + if (LoPos != RelocLs.end()) { + HiPos->Offset = LoPos->Offset; + RelocLs.insert(LoPos, *HiPos); + RelocLs.erase(HiPos); + } + } + + // Put the sorted list back in reverse order. + assert(Relocs.size() == RelocLs.size()); + unsigned I = RelocLs.size(); + + for (RelLsIter R = RelocLs.begin(); R != RelocLs.end(); ++R) + Relocs[--I] = R->Reloc; +} + +MCObjectWriter *llvm::createMipsELFObjectWriter(raw_ostream &OS, + uint8_t OSABI, + bool IsLittleEndian, + bool Is64Bit) { + MCELFObjectTargetWriter *MOTW = new MipsELFObjectWriter(Is64Bit, OSABI); return createELFObjectWriter(MOTW, OS, IsLittleEndian); } diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h index ef4c6e2..e1d8789 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h +++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h @@ -14,10 +14,10 @@ #ifndef MIPSTARGETASMINFO_H #define MIPSTARGETASMINFO_H -#include "llvm/ADT/StringRef.h" #include "llvm/MC/MCAsmInfo.h" namespace llvm { + class StringRef; class Target; class MipsMCAsmInfo : public MCAsmInfo { diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp index 9ebb6d2..27954b1 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp @@ -179,73 +179,71 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO, } else if (MO.isFPImm()) { return static_cast<unsigned>(APFloat(MO.getFPImm()) .bitcastToAPInt().getHiBits(32).getLimitedValue()); - } else if (MO.isExpr()) { - const MCExpr *Expr = MO.getExpr(); - MCExpr::ExprKind Kind = Expr->getKind(); - unsigned Ret = 0; - - if (Kind == MCExpr::Binary) { - const MCBinaryExpr *BE = static_cast<const MCBinaryExpr*>(Expr); - Expr = BE->getLHS(); - Kind = Expr->getKind(); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(BE->getRHS()); - assert((Kind == MCExpr::SymbolRef) && CE && - "Binary expression must be sym+const."); - Ret = CE->getValue(); - } + } + + // MO must be an Expr. 
+ assert(MO.isExpr()); + + const MCExpr *Expr = MO.getExpr(); + MCExpr::ExprKind Kind = Expr->getKind(); - if (Kind == MCExpr::SymbolRef) { - Mips::Fixups FixupKind; - - switch(cast<MCSymbolRefExpr>(Expr)->getKind()) { - case MCSymbolRefExpr::VK_Mips_GPREL: - FixupKind = Mips::fixup_Mips_GPREL16; - break; - case MCSymbolRefExpr::VK_Mips_GOT_CALL: - FixupKind = Mips::fixup_Mips_CALL16; - break; - case MCSymbolRefExpr::VK_Mips_GOT16: - FixupKind = Mips::fixup_Mips_GOT_Global; - break; - case MCSymbolRefExpr::VK_Mips_GOT: - FixupKind = Mips::fixup_Mips_GOT_Local; - break; - case MCSymbolRefExpr::VK_Mips_ABS_HI: - FixupKind = Mips::fixup_Mips_HI16; - break; - case MCSymbolRefExpr::VK_Mips_ABS_LO: - FixupKind = Mips::fixup_Mips_LO16; - break; - case MCSymbolRefExpr::VK_Mips_TLSGD: - FixupKind = Mips::fixup_Mips_TLSGD; - break; - case MCSymbolRefExpr::VK_Mips_TLSLDM: - FixupKind = Mips::fixup_Mips_TLSLDM; - break; - case MCSymbolRefExpr::VK_Mips_DTPREL_HI: - FixupKind = Mips::fixup_Mips_DTPREL_HI; - break; - case MCSymbolRefExpr::VK_Mips_DTPREL_LO: - FixupKind = Mips::fixup_Mips_DTPREL_LO; - break; - case MCSymbolRefExpr::VK_Mips_GOTTPREL: - FixupKind = Mips::fixup_Mips_GOTTPREL; - break; - case MCSymbolRefExpr::VK_Mips_TPREL_HI: - FixupKind = Mips::fixup_Mips_TPREL_HI; - break; - case MCSymbolRefExpr::VK_Mips_TPREL_LO: - FixupKind = Mips::fixup_Mips_TPREL_LO; - break; - default: - return Ret; - } // switch - Fixups.push_back(MCFixup::Create(0, Expr, MCFixupKind(FixupKind))); - } // if SymbolRef - // All of the information is in the fixup. - return Ret; + if (Kind == MCExpr::Binary) { + Expr = static_cast<const MCBinaryExpr*>(Expr)->getLHS(); + Kind = Expr->getKind(); } - llvm_unreachable("Unable to encode MCOperand!"); + + assert (Kind == MCExpr::SymbolRef); + + Mips::Fixups FixupKind; + + switch(cast<MCSymbolRefExpr>(Expr)->getKind()) { + case MCSymbolRefExpr::VK_Mips_GPREL: + FixupKind = Mips::fixup_Mips_GPREL16; + break; + case MCSymbolRefExpr::VK_Mips_GOT_CALL: + FixupKind = Mips::fixup_Mips_CALL16; + break; + case MCSymbolRefExpr::VK_Mips_GOT16: + FixupKind = Mips::fixup_Mips_GOT_Global; + break; + case MCSymbolRefExpr::VK_Mips_GOT: + FixupKind = Mips::fixup_Mips_GOT_Local; + break; + case MCSymbolRefExpr::VK_Mips_ABS_HI: + FixupKind = Mips::fixup_Mips_HI16; + break; + case MCSymbolRefExpr::VK_Mips_ABS_LO: + FixupKind = Mips::fixup_Mips_LO16; + break; + case MCSymbolRefExpr::VK_Mips_TLSGD: + FixupKind = Mips::fixup_Mips_TLSGD; + break; + case MCSymbolRefExpr::VK_Mips_TLSLDM: + FixupKind = Mips::fixup_Mips_TLSLDM; + break; + case MCSymbolRefExpr::VK_Mips_DTPREL_HI: + FixupKind = Mips::fixup_Mips_DTPREL_HI; + break; + case MCSymbolRefExpr::VK_Mips_DTPREL_LO: + FixupKind = Mips::fixup_Mips_DTPREL_LO; + break; + case MCSymbolRefExpr::VK_Mips_GOTTPREL: + FixupKind = Mips::fixup_Mips_GOTTPREL; + break; + case MCSymbolRefExpr::VK_Mips_TPREL_HI: + FixupKind = Mips::fixup_Mips_TPREL_HI; + break; + case MCSymbolRefExpr::VK_Mips_TPREL_LO: + FixupKind = Mips::fixup_Mips_TPREL_LO; + break; + default: + break; + } // switch + + Fixups.push_back(MCFixup::Create(0, MO.getExpr(), MCFixupKind(FixupKind))); + + // All of the information is in the fixup. + return 0; } /// getMemEncoding - Return binary encoding of memory related operand. 
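The two Mips hunks above cooperate: getMachineOpValue encodes a symbolic operand as zero and records a fixup kind such as fixup_Mips_HI16, and MipsAsmBackend::adjustFixupValue (earlier in this patch) later folds the resolved address into the instruction, rounding the high half with ((Value + 0x8000) >> 16). The standalone C++ sketch below is an illustration of why that rounding is needed, not part of the patch: the paired %lo immediate is consumed sign-extended, so %hi has to absorb the carry.

#include <cassert>
#include <cstdint>

// hi16/lo16 mirror the fixup_Mips_HI16 / fixup_Mips_LO16 math above.
static uint32_t hi16(uint32_t V) { return ((V + 0x8000) >> 16) & 0xffff; }
static uint32_t lo16(uint32_t V) { return V & 0xffff; }

int main() {
  uint32_t Addr = 0x1000ffff;
  // The low half is used as a sign-extended 16-bit immediate (-1 here), so
  // the high half must round up to 0x1001 for the sum to rebuild Addr.
  int32_t Lo = static_cast<int16_t>(lo16(Addr));
  assert((hi16(Addr) << 16) + Lo == Addr);
  return 0;
}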
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp index 7652675..f634f08 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp @@ -34,6 +34,38 @@ using namespace llvm; +static std::string ParseMipsTriple(StringRef TT, StringRef CPU) { + std::string MipsArchFeature; + size_t DashPosition = 0; + StringRef TheTriple; + + // Let's see if there is a dash, like mips-unknown-linux. + DashPosition = TT.find('-'); + + if (DashPosition == StringRef::npos) { + // No dash, we check the string size. + TheTriple = TT.substr(0); + } else { + // We are only interested in substring before dash. + TheTriple = TT.substr(0,DashPosition); + } + + if (TheTriple == "mips" || TheTriple == "mipsel") { + if (CPU.empty() || CPU == "mips32") { + MipsArchFeature = "+mips32"; + } else if (CPU == "mips32r2") { + MipsArchFeature = "+mips32r2"; + } + } else { + if (CPU.empty() || CPU == "mips64") { + MipsArchFeature = "+mips64"; + } else if (CPU == "mips64r2") { + MipsArchFeature = "+mips64r2"; + } + } + return MipsArchFeature; +} + static MCInstrInfo *createMipsMCInstrInfo() { MCInstrInfo *X = new MCInstrInfo(); InitMipsMCInstrInfo(X); @@ -48,8 +80,15 @@ static MCRegisterInfo *createMipsMCRegisterInfo(StringRef TT) { static MCSubtargetInfo *createMipsMCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS) { + std::string ArchFS = ParseMipsTriple(TT,CPU); + if (!FS.empty()) { + if (!ArchFS.empty()) + ArchFS = ArchFS + "," + FS.str(); + else + ArchFS = FS; + } MCSubtargetInfo *X = new MCSubtargetInfo(); - InitMipsMCSubtargetInfo(X, TT, CPU, FS); + InitMipsMCSubtargetInfo(X, TT, CPU, ArchFS); return X; } @@ -67,7 +106,9 @@ static MCCodeGenInfo *createMipsMCCodeGenInfo(StringRef TT, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); - if (RM == Reloc::Default) + if (CM == CodeModel::JITDefault) + RM = Reloc::Static; + else if (RM == Reloc::Default) RM = Reloc::PIC_; X->InitMCCodeGenInfo(RM, CM, OL); return X; @@ -76,9 +117,10 @@ static MCCodeGenInfo *createMipsMCCodeGenInfo(StringRef TT, Reloc::Model RM, static MCInstPrinter *createMipsMCInstPrinter(const Target &T, unsigned SyntaxVariant, const MCAsmInfo &MAI, + const MCInstrInfo &MII, const MCRegisterInfo &MRI, const MCSubtargetInfo &STI) { - return new MipsInstPrinter(MAI, MRI); + return new MipsInstPrinter(MAI, MII, MRI); } static MCStreamer *createMCStreamer(const Target &T, StringRef TT, @@ -142,13 +184,13 @@ extern "C" void LLVMInitializeMipsTargetMC() { // Register the asm backend. TargetRegistry::RegisterMCAsmBackend(TheMipsTarget, - createMipsAsmBackendEB); + createMipsAsmBackendEB32); TargetRegistry::RegisterMCAsmBackend(TheMipselTarget, - createMipsAsmBackendEL); + createMipsAsmBackendEL32); TargetRegistry::RegisterMCAsmBackend(TheMips64Target, - createMipsAsmBackendEB); + createMipsAsmBackendEB64); TargetRegistry::RegisterMCAsmBackend(TheMips64elTarget, - createMipsAsmBackendEL); + createMipsAsmBackendEL64); // Register the MC subtarget info. 
TargetRegistry::RegisterMCSubtargetInfo(TheMipsTarget, diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h index 2e58f9d..547ccdd 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h +++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h @@ -39,12 +39,15 @@ MCCodeEmitter *createMipsMCCodeEmitterEL(const MCInstrInfo &MCII, const MCSubtargetInfo &STI, MCContext &Ctx); -MCAsmBackend *createMipsAsmBackendEB(const Target &T, StringRef TT); -MCAsmBackend *createMipsAsmBackendEL(const Target &T, StringRef TT); +MCAsmBackend *createMipsAsmBackendEB32(const Target &T, StringRef TT); +MCAsmBackend *createMipsAsmBackendEL32(const Target &T, StringRef TT); +MCAsmBackend *createMipsAsmBackendEB64(const Target &T, StringRef TT); +MCAsmBackend *createMipsAsmBackendEL64(const Target &T, StringRef TT); MCObjectWriter *createMipsELFObjectWriter(raw_ostream &OS, uint8_t OSABI, - bool IsLittleEndian); + bool IsLittleEndian, + bool Is64Bit); } // End llvm namespace // Defines symbolic names for Mips registers. This defines a mapping from diff --git a/lib/Target/Mips/Makefile b/lib/Target/Mips/Makefile index 168635c..596f071 100644 --- a/lib/Target/Mips/Makefile +++ b/lib/Target/Mips/Makefile @@ -15,9 +15,9 @@ TARGET = Mips BUILT_SOURCES = MipsGenRegisterInfo.inc MipsGenInstrInfo.inc \ MipsGenAsmWriter.inc MipsGenCodeEmitter.inc \ MipsGenDAGISel.inc MipsGenCallingConv.inc \ - MipsGenSubtargetInfo.inc MipsGenMCCodeEmitter.inc - -DIRS = InstPrinter AsmParser TargetInfo MCTargetDesc + MipsGenSubtargetInfo.inc MipsGenMCCodeEmitter.inc \ + MipsGenEDInfo.inc MipsGenDisassemblerTables.inc +DIRS = InstPrinter Disassembler AsmParser TargetInfo MCTargetDesc include $(LEVEL)/Makefile.common diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td index 427e8d9..0382869 100644 --- a/lib/Target/Mips/Mips64InstrInfo.td +++ b/lib/Target/Mips/Mips64InstrInfo.td @@ -36,6 +36,7 @@ def immZExt6 : ImmLeaf<i32, [{return Imm == (Imm & 0x3f);}]>; //===----------------------------------------------------------------------===// // Shifts // 64-bit shift instructions. 
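+// DecoderNamespace = "Mips64" routes the encodings that follow into a separate
+// TableGen decoder table; Mips64Disassembler tries that table first and falls
+// back to the 32-bit table (see MipsDisassembler.cpp earlier in this patch).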
+let DecoderNamespace = "Mips64" in { class shift_rotate_imm64<bits<6> func, bits<5> isRotate, string instr_asm, SDNode OpNode>: shift_rotate_imm<func, isRotate, instr_asm, OpNode, immZExt6, shamt, @@ -49,16 +50,21 @@ class Div64<SDNode op, bits<6> func, string instr_asm, InstrItinClass itin>: multiclass Atomic2Ops64<PatFrag Op, string Opstr> { def #NAME# : Atomic2Ops<Op, Opstr, CPU64Regs, CPURegs>, Requires<[NotN64]>; - def _P8 : Atomic2Ops<Op, Opstr, CPU64Regs, CPU64Regs>, Requires<[IsN64]>; + def _P8 : Atomic2Ops<Op, Opstr, CPU64Regs, CPU64Regs>, Requires<[IsN64]> { + let isCodeGenOnly = 1; + } } multiclass AtomicCmpSwap64<PatFrag Op, string Width> { def #NAME# : AtomicCmpSwap<Op, Width, CPU64Regs, CPURegs>, Requires<[NotN64]>; def _P8 : AtomicCmpSwap<Op, Width, CPU64Regs, CPU64Regs>, - Requires<[IsN64]>; + Requires<[IsN64]> { + let isCodeGenOnly = 1; + } } - -let usesCustomInserter = 1, Predicates = [HasMips64] in { +} +let usesCustomInserter = 1, Predicates = [HasMips64], + DecoderNamespace = "Mips64" in { defm ATOMIC_LOAD_ADD_I64 : Atomic2Ops64<atomic_load_add_64, "load_add_64">; defm ATOMIC_LOAD_SUB_I64 : Atomic2Ops64<atomic_load_sub_64, "load_sub_64">; defm ATOMIC_LOAD_AND_I64 : Atomic2Ops64<atomic_load_and_64, "load_and_64">; @@ -72,7 +78,7 @@ let usesCustomInserter = 1, Predicates = [HasMips64] in { //===----------------------------------------------------------------------===// // Instruction definition //===----------------------------------------------------------------------===// - +let DecoderNamespace = "Mips64" in { /// Arithmetic Instructions (ALU Immediate) def DADDiu : ArithLogicI<0x19, "daddiu", add, simm16_64, immSExt16, CPU64Regs>; @@ -97,16 +103,17 @@ def NOR64 : LogicNOR<0x00, 0x27, "nor", CPU64Regs>; def DSLL : shift_rotate_imm64<0x38, 0x00, "dsll", shl>; def DSRL : shift_rotate_imm64<0x3a, 0x00, "dsrl", srl>; def DSRA : shift_rotate_imm64<0x3b, 0x00, "dsra", sra>; -def DSLLV : shift_rotate_reg<0x24, 0x00, "dsllv", shl, CPU64Regs>; -def DSRLV : shift_rotate_reg<0x26, 0x00, "dsrlv", srl, CPU64Regs>; -def DSRAV : shift_rotate_reg<0x27, 0x00, "dsrav", sra, CPU64Regs>; - +def DSLLV : shift_rotate_reg<0x14, 0x00, "dsllv", shl, CPU64Regs>; +def DSRLV : shift_rotate_reg<0x16, 0x00, "dsrlv", srl, CPU64Regs>; +def DSRAV : shift_rotate_reg<0x17, 0x00, "dsrav", sra, CPU64Regs>; +} // Rotate Instructions -let Predicates = [HasMips64r2] in { +let Predicates = [HasMips64r2], DecoderNamespace = "Mips64" in { def DROTR : shift_rotate_imm64<0x3a, 0x01, "drotr", rotr>; def DROTRV : shift_rotate_reg<0x16, 0x01, "drotrv", rotr, CPU64Regs>; } +let DecoderNamespace = "Mips64" in { /// Load and Store Instructions /// aligned defm LB64 : LoadM64<0x20, "lb", sextloadi8>; @@ -132,9 +139,13 @@ defm USD : StoreM64<0x3f, "usd", store_u, 1>; /// Load-linked, Store-conditional def LLD : LLBase<0x34, "lld", CPU64Regs, mem>, Requires<[NotN64]>; -def LLD_P8 : LLBase<0x34, "lld", CPU64Regs, mem64>, Requires<[IsN64]>; +def LLD_P8 : LLBase<0x34, "lld", CPU64Regs, mem64>, Requires<[IsN64]> { + let isCodeGenOnly = 1; +} def SCD : SCBase<0x3c, "scd", CPU64Regs, mem>, Requires<[NotN64]>; -def SCD_P8 : SCBase<0x3c, "scd", CPU64Regs, mem64>, Requires<[IsN64]>; +def SCD_P8 : SCBase<0x3c, "scd", CPU64Regs, mem64>, Requires<[IsN64]> { + let isCodeGenOnly = 1; +} /// Jump and Branch Instructions def JR64 : JumpFR<0x00, 0x08, "jr", CPU64Regs>; @@ -142,11 +153,13 @@ def BEQ64 : CBranch<0x04, "beq", seteq, CPU64Regs>; def BNE64 : CBranch<0x05, "bne", setne, CPU64Regs>; def BGEZ64 : CBranchZero<0x01, 1, "bgez", 
setge, CPU64Regs>; def BGTZ64 : CBranchZero<0x07, 0, "bgtz", setgt, CPU64Regs>; -def BLEZ64 : CBranchZero<0x07, 0, "blez", setle, CPU64Regs>; +def BLEZ64 : CBranchZero<0x06, 0, "blez", setle, CPU64Regs>; def BLTZ64 : CBranchZero<0x01, 0, "bltz", setlt, CPU64Regs>; - +} +let DecoderNamespace = "Mips64" in def JALR64 : JumpLinkReg<0x00, 0x09, "jalr", CPU64Regs>; +let DecoderNamespace = "Mips64" in { /// Multiply and Divide Instructions. def DMULT : Mult64<0x1c, "dmult", IIImul>; def DMULTu : Mult64<0x1d, "dmultu", IIImul>; @@ -171,11 +184,13 @@ def DSBH : SubwordSwap<0x24, 0x2, "dsbh", CPU64Regs>; def DSHD : SubwordSwap<0x24, 0x5, "dshd", CPU64Regs>; def LEA_ADDiu64 : EffectiveAddress<"daddiu\t$rt, $addr", CPU64Regs, mem_ea_64>; - -let Uses = [SP_64] in +} +let Uses = [SP_64], DecoderNamespace = "Mips64" in def DynAlloc64 : EffectiveAddress<"daddiu\t$rt, $addr", CPU64Regs, mem_ea_64>, - Requires<[IsN64]>; - + Requires<[IsN64]> { + let isCodeGenOnly = 1; +} +let DecoderNamespace = "Mips64" in { def RDHWR64 : ReadHardware<CPU64Regs, HWRegs64>; def DEXT : ExtBase<3, "dext", CPU64Regs>; @@ -183,12 +198,12 @@ def DINS : InsBase<7, "dins", CPU64Regs>; def DSLL64_32 : FR<0x3c, 0x00, (outs CPU64Regs:$rd), (ins CPURegs:$rt), "dsll\t$rd, $rt, 32", [], IIAlu>; - def SLL64_32 : FR<0x0, 0x00, (outs CPU64Regs:$rd), (ins CPURegs:$rt), "sll\t$rd, $rt, 0", [], IIAlu>; +let isCodeGenOnly = 1 in def SLL64_64 : FR<0x0, 0x00, (outs CPU64Regs:$rd), (ins CPU64Regs:$rt), "sll\t$rd, $rt, 0", [], IIAlu>; - +} //===----------------------------------------------------------------------===// // Arbitrary patterns that map to one or more instructions //===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp index f2b842a..8206cfc 100644 --- a/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/lib/Target/Mips/MipsAsmPrinter.cpp @@ -16,8 +16,6 @@ #include "MipsAsmPrinter.h" #include "Mips.h" #include "MipsInstrInfo.h" -#include "MipsMachineFunction.h" -#include "MipsMCInstLower.h" #include "InstPrinter/MipsInstPrinter.h" #include "MCTargetDesc/MipsBaseInfo.h" #include "llvm/ADT/SmallString.h" @@ -45,24 +43,23 @@ using namespace llvm; -static bool isUnalignedLoadStore(unsigned Opc) { - return Opc == Mips::ULW || Opc == Mips::ULH || Opc == Mips::ULHu || - Opc == Mips::USW || Opc == Mips::USH || - Opc == Mips::ULW_P8 || Opc == Mips::ULH_P8 || Opc == Mips::ULHu_P8 || - Opc == Mips::USW_P8 || Opc == Mips::USH_P8 || - Opc == Mips::ULD || Opc == Mips::ULW64 || Opc == Mips::ULH64 || - Opc == Mips::ULHu64 || Opc == Mips::USD || Opc == Mips::USW64 || - Opc == Mips::USH64 || - Opc == Mips::ULD_P8 || Opc == Mips::ULW64_P8 || - Opc == Mips::ULH64_P8 || Opc == Mips::ULHu64_P8 || - Opc == Mips::USD_P8 || Opc == Mips::USW64_P8 || - Opc == Mips::USH64_P8; +void MipsAsmPrinter::EmitInstrWithMacroNoAT(const MachineInstr *MI) { + MCInst TmpInst; + + MCInstLowering.Lower(MI, TmpInst); + OutStreamer.EmitRawText(StringRef("\t.set\tmacro")); + if (MipsFI->getEmitNOAT()) + OutStreamer.EmitRawText(StringRef("\t.set\tat")); + OutStreamer.EmitInstruction(TmpInst); + if (MipsFI->getEmitNOAT()) + OutStreamer.EmitRawText(StringRef("\t.set\tnoat")); + OutStreamer.EmitRawText(StringRef("\t.set\tnomacro")); } -static bool isDirective(unsigned Opc) { - return Opc == Mips::MACRO || Opc == Mips::NOMACRO || - Opc == Mips::REORDER || Opc == Mips::NOREORDER || - Opc == Mips::ATMACRO || Opc == Mips::NOAT; +bool MipsAsmPrinter::runOnMachineFunction(MachineFunction 
&MF) { + MipsFI = MF.getInfo<MipsFunctionInfo>(); + AsmPrinter::runOnMachineFunction(MF); + return true; } void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) { @@ -74,49 +71,70 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } - MipsMCInstLower MCInstLowering(Mang, *MF, *this); unsigned Opc = MI->getOpcode(); MCInst TmpInst0; SmallVector<MCInst, 4> MCInsts; - MCInstLowering.Lower(MI, TmpInst0); - - if (!OutStreamer.hasRawTextSupport() && isDirective(Opc)) - return; - // Enclose unaligned load or store with .macro & .nomacro directives. - if (isUnalignedLoadStore(Opc)) { + switch (Opc) { + case Mips::ULW: + case Mips::ULH: + case Mips::ULHu: + case Mips::USW: + case Mips::USH: + case Mips::ULW_P8: + case Mips::ULH_P8: + case Mips::ULHu_P8: + case Mips::USW_P8: + case Mips::USH_P8: + case Mips::ULD: + case Mips::ULW64: + case Mips::ULH64: + case Mips::ULHu64: + case Mips::USD: + case Mips::USW64: + case Mips::USH64: + case Mips::ULD_P8: + case Mips::ULW64_P8: + case Mips::ULH64_P8: + case Mips::ULHu64_P8: + case Mips::USD_P8: + case Mips::USW64_P8: + case Mips::USH64_P8: { if (OutStreamer.hasRawTextSupport()) { - MCInst Directive; - Directive.setOpcode(Mips::MACRO); - OutStreamer.EmitInstruction(Directive); - OutStreamer.EmitInstruction(TmpInst0); - Directive.setOpcode(Mips::NOMACRO); - OutStreamer.EmitInstruction(Directive); - } else { - MCInstLowering.LowerUnalignedLoadStore(MI, MCInsts); - for (SmallVector<MCInst, 4>::iterator I = MCInsts.begin(); I - != MCInsts.end(); ++I) - OutStreamer.EmitInstruction(*I); + EmitInstrWithMacroNoAT(MI); + return; } + + MCInstLowering.LowerUnalignedLoadStore(MI, MCInsts); + for (SmallVector<MCInst, 4>::iterator I = MCInsts.begin(); I + != MCInsts.end(); ++I) + OutStreamer.EmitInstruction(*I); + return; } + case Mips::CPRESTORE: { + const MachineOperand &MO = MI->getOperand(0); + assert(MO.isImm() && "CPRESTORE's operand must be an immediate."); + int64_t Offset = MO.getImm(); - if (!OutStreamer.hasRawTextSupport()) { - // Lower CPLOAD and CPRESTORE - if (Opc == Mips::CPLOAD) - MCInstLowering.LowerCPLOAD(MI, MCInsts); - else if (Opc == Mips::CPRESTORE) - MCInstLowering.LowerCPRESTORE(MI, MCInsts); + if (OutStreamer.hasRawTextSupport()) { + if (!isInt<16>(Offset)) { + EmitInstrWithMacroNoAT(MI); + return; + } + } else { + MCInstLowering.LowerCPRESTORE(Offset, MCInsts); - if (!MCInsts.empty()) { for (SmallVector<MCInst, 4>::iterator I = MCInsts.begin(); I != MCInsts.end(); ++I) OutStreamer.EmitInstruction(*I); + return; } - } - if (Opc == Mips::SETGP01) { + break; + } + case Mips::SETGP01: { MCInstLowering.LowerSETGP01(MI, MCInsts); for (SmallVector<MCInst, 4>::iterator I = MCInsts.begin(); @@ -125,7 +143,11 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } + default: + break; + } + MCInstLowering.Lower(MI, TmpInst0); OutStreamer.EmitInstruction(TmpInst0); } @@ -269,13 +291,35 @@ void MipsAsmPrinter::EmitFunctionEntryLabel() { /// EmitFunctionBodyStart - Targets can override this to emit stuff before /// the first basic block in the function. 
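The Mips::CPRESTORE case above falls back to the .set macro/.set at wrapper exactly when the offset fails isInt<16>, i.e. when it cannot ride in the signed 16-bit immediate of sw. LLVM's isInt<N> lives in Support/MathExtras.h; a minimal standalone sketch of the same range test (isIntN here is an illustrative reimplementation, not the LLVM symbol):

#include <cassert>
#include <cstdint>

// True iff x is representable as an N-bit two's-complement signed integer.
template <unsigned N>
bool isIntN(int64_t x) {
  return x >= -(INT64_C(1) << (N - 1)) && x < (INT64_C(1) << (N - 1));
}

int main() {
  assert(isIntN<16>(32767));   // sw $gp, 32767($sp) encodes directly
  assert(!isIntN<16>(32768));  // 0x8000 does not fit; needs the lui/addu expansion
  assert(isIntN<16>(-32768));
  return 0;
}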
void MipsAsmPrinter::EmitFunctionBodyStart() { + MCInstLowering.Initialize(Mang, &MF->getContext()); + emitFrameDirective(); + bool EmitCPLoad = (MF->getTarget().getRelocationModel() == Reloc::PIC_) && + Subtarget->isABI_O32() && MipsFI->globalBaseRegSet() && + MipsFI->globalBaseRegFixed(); + if (OutStreamer.hasRawTextSupport()) { SmallString<128> Str; raw_svector_ostream OS(Str); printSavedRegsBitmask(OS); OutStreamer.EmitRawText(OS.str()); + + OutStreamer.EmitRawText(StringRef("\t.set\tnoreorder")); + + // Emit .cpload directive if needed. + if (EmitCPLoad) + OutStreamer.EmitRawText(StringRef("\t.cpload\t$25")); + + OutStreamer.EmitRawText(StringRef("\t.set\tnomacro")); + if (MipsFI->getEmitNOAT()) + OutStreamer.EmitRawText(StringRef("\t.set\tnoat")); + } else if (EmitCPLoad) { + SmallVector<MCInst, 4> MCInsts; + MCInstLowering.LowerCPLOAD(MCInsts); + for (SmallVector<MCInst, 4>::iterator I = MCInsts.begin(); + I != MCInsts.end(); ++I) + OutStreamer.EmitInstruction(*I); } } @@ -286,6 +330,9 @@ void MipsAsmPrinter::EmitFunctionBodyEnd() { // always be at the function end, and we can't emit and // break with BB logic. if (OutStreamer.hasRawTextSupport()) { + if (MipsFI->getEmitNOAT()) + OutStreamer.EmitRawText(StringRef("\t.set\tat")); + OutStreamer.EmitRawText(StringRef("\t.set\tmacro")); OutStreamer.EmitRawText(StringRef("\t.set\treorder")); OutStreamer.EmitRawText("\t.end\t" + Twine(CurrentFnSym->getName())); diff --git a/lib/Target/Mips/MipsAsmPrinter.h b/lib/Target/Mips/MipsAsmPrinter.h index 473da7e..562bf9c 100644 --- a/lib/Target/Mips/MipsAsmPrinter.h +++ b/lib/Target/Mips/MipsAsmPrinter.h @@ -14,6 +14,8 @@ #ifndef MIPSASMPRINTER_H #define MIPSASMPRINTER_H +#include "MipsMachineFunction.h" +#include "MipsMCInstLower.h" #include "MipsSubtarget.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/Support/Compiler.h" @@ -28,12 +30,16 @@ class raw_ostream; class LLVM_LIBRARY_VISIBILITY MipsAsmPrinter : public AsmPrinter { + void EmitInstrWithMacroNoAT(const MachineInstr *MI); + public: const MipsSubtarget *Subtarget; + const MipsFunctionInfo *MipsFI; + MipsMCInstLower MCInstLowering; explicit MipsAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) - : AsmPrinter(TM, Streamer) { + : AsmPrinter(TM, Streamer), MCInstLowering(*this) { Subtarget = &TM.getSubtarget<MipsSubtarget>(); } @@ -41,6 +47,8 @@ public: return "Mips Assembly Printer"; } + virtual bool runOnMachineFunction(MachineFunction &MF); + void EmitInstruction(const MachineInstr *MI); void printSavedRegsBitmask(raw_ostream &O); void printHex32(unsigned int Value, raw_ostream &O); diff --git a/lib/Target/Mips/MipsCondMov.td b/lib/Target/Mips/MipsCondMov.td index 075a3e8..da33680 100644 --- a/lib/Target/Mips/MipsCondMov.td +++ b/lib/Target/Mips/MipsCondMov.td @@ -95,45 +95,65 @@ multiclass MovnPats<RegisterClass CRC, RegisterClass DRC, Instruction MOVNInst, // Instantiation of instructions. 
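The instantiations that follow all reduce to two MIPS primitives: movz writes rs into rd when rt is zero, movn when rt is nonzero; the CondMovIntInt/CondMovIntFP/CondMovFPInt/CondMovFPFP variants below only change the register classes and where the condition comes from. A small C++ model of the core semantics (illustrative only; real operand widths track CPURegs/CPU64Regs/FGR32/FGR64):

#include <cassert>
#include <cstdint>

// movz rd, rs, rt : rd = (rt == 0) ? rs : rd
uint64_t movz(uint64_t rd, uint64_t rs, uint64_t rt) { return rt == 0 ? rs : rd; }
// movn rd, rs, rt : rd = (rt != 0) ? rs : rd
uint64_t movn(uint64_t rd, uint64_t rs, uint64_t rt) { return rt != 0 ? rs : rd; }

int main() {
  assert(movz(1, 2, 0) == 2 && movz(1, 2, 9) == 1);
  assert(movn(1, 2, 0) == 1 && movn(1, 2, 9) == 2);
  return 0;
}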
def MOVZ_I_I : CondMovIntInt<CPURegs, CPURegs, 0x0a, "movz">; -let Predicates = [HasMips64] in { +let Predicates = [HasMips64],DecoderNamespace = "Mips64" in { def MOVZ_I_I64 : CondMovIntInt<CPURegs, CPU64Regs, 0x0a, "movz">; - def MOVZ_I64_I : CondMovIntInt<CPU64Regs, CPURegs, 0x0a, "movz">; - def MOVZ_I64_I64 : CondMovIntInt<CPU64Regs, CPU64Regs, 0x0a, "movz">; + def MOVZ_I64_I : CondMovIntInt<CPU64Regs, CPURegs, 0x0a, "movz"> { + let isCodeGenOnly = 1; + } + def MOVZ_I64_I64 : CondMovIntInt<CPU64Regs, CPU64Regs, 0x0a, "movz"> { + let isCodeGenOnly = 1; + } } def MOVN_I_I : CondMovIntInt<CPURegs, CPURegs, 0x0b, "movn">; -let Predicates = [HasMips64] in { +let Predicates = [HasMips64],DecoderNamespace = "Mips64" in { def MOVN_I_I64 : CondMovIntInt<CPURegs, CPU64Regs, 0x0b, "movn">; - def MOVN_I64_I : CondMovIntInt<CPU64Regs, CPURegs, 0x0b, "movn">; - def MOVN_I64_I64 : CondMovIntInt<CPU64Regs, CPU64Regs, 0x0b, "movn">; + def MOVN_I64_I : CondMovIntInt<CPU64Regs, CPURegs, 0x0b, "movn"> { + let isCodeGenOnly = 1; + } + def MOVN_I64_I64 : CondMovIntInt<CPU64Regs, CPU64Regs, 0x0b, "movn"> { + let isCodeGenOnly = 1; + } } def MOVZ_I_S : CondMovIntFP<CPURegs, FGR32, 16, 18, "movz.s">; def MOVZ_I64_S : CondMovIntFP<CPU64Regs, FGR32, 16, 18, "movz.s">, - Requires<[HasMips64]>; + Requires<[HasMips64]> { + let DecoderNamespace = "Mips64"; +} def MOVN_I_S : CondMovIntFP<CPURegs, FGR32, 16, 19, "movn.s">; def MOVN_I64_S : CondMovIntFP<CPU64Regs, FGR32, 16, 19, "movn.s">, - Requires<[HasMips64]>; + Requires<[HasMips64]> { + let DecoderNamespace = "Mips64"; +} let Predicates = [NotFP64bit] in { def MOVZ_I_D32 : CondMovIntFP<CPURegs, AFGR64, 17, 18, "movz.d">; def MOVN_I_D32 : CondMovIntFP<CPURegs, AFGR64, 17, 19, "movn.d">; } -let Predicates = [IsFP64bit] in { +let Predicates = [IsFP64bit],DecoderNamespace = "Mips64" in { def MOVZ_I_D64 : CondMovIntFP<CPURegs, FGR64, 17, 18, "movz.d">; - def MOVZ_I64_D64 : CondMovIntFP<CPU64Regs, FGR64, 17, 18, "movz.d">; + def MOVZ_I64_D64 : CondMovIntFP<CPU64Regs, FGR64, 17, 18, "movz.d"> { + let isCodeGenOnly = 1; + } def MOVN_I_D64 : CondMovIntFP<CPURegs, FGR64, 17, 19, "movn.d">; - def MOVN_I64_D64 : CondMovIntFP<CPU64Regs, FGR64, 17, 19, "movn.d">; + def MOVN_I64_D64 : CondMovIntFP<CPU64Regs, FGR64, 17, 19, "movn.d"> { + let isCodeGenOnly = 1; + } } def MOVT_I : CondMovFPInt<CPURegs, MipsCMovFP_T, 1, "movt">; def MOVT_I64 : CondMovFPInt<CPU64Regs, MipsCMovFP_T, 1, "movt">, - Requires<[HasMips64]>; + Requires<[HasMips64]> { + let DecoderNamespace = "Mips64"; +} def MOVF_I : CondMovFPInt<CPURegs, MipsCMovFP_F, 0, "movf">; def MOVF_I64 : CondMovFPInt<CPU64Regs, MipsCMovFP_F, 0, "movf">, - Requires<[HasMips64]>; + Requires<[HasMips64]> { + let DecoderNamespace = "Mips64"; +} def MOVT_S : CondMovFPFP<FGR32, MipsCMovFP_T, 16, 1, "movt.s">; def MOVF_S : CondMovFPFP<FGR32, MipsCMovFP_F, 16, 0, "movf.s">; @@ -142,7 +162,7 @@ let Predicates = [NotFP64bit] in { def MOVT_D32 : CondMovFPFP<AFGR64, MipsCMovFP_T, 17, 1, "movt.d">; def MOVF_D32 : CondMovFPFP<AFGR64, MipsCMovFP_F, 17, 0, "movf.d">; } -let Predicates = [IsFP64bit] in { +let Predicates = [IsFP64bit], DecoderNamespace = "Mips64" in { def MOVT_D64 : CondMovFPFP<FGR64, MipsCMovFP_T, 17, 1, "movt.d">; def MOVF_D64 : CondMovFPFP<FGR64, MipsCMovFP_F, 17, 0, "movf.d">; } diff --git a/lib/Target/Mips/MipsFrameLowering.cpp b/lib/Target/Mips/MipsFrameLowering.cpp index ebfbb4a..f8ea3d0 100644 --- a/lib/Target/Mips/MipsFrameLowering.cpp +++ b/lib/Target/Mips/MipsFrameLowering.cpp @@ -108,9 +108,6 @@ static void 
expandLargeImm(unsigned Reg, int64_t Imm, bool IsN64, AnalyzeImm.Analyze(Imm, IsN64 ? 64 : 32, false /* LastInstrIsADDiu */); MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin(); - // FIXME: change this when mips goes MC". - BuildMI(MBB, II, DL, TII.get(Mips::NOAT)); - // The first instruction can be a LUi, which is different from other // instructions (ADDiu, ORI and SLL) in that it does not have a register // operand. @@ -127,7 +124,6 @@ static void expandLargeImm(unsigned Reg, int64_t Imm, bool IsN64, .addImm(SignExtend64<16>(Inst->ImmOpnd)); BuildMI(MBB, II, DL, TII.get(ADDu), Reg).addReg(Reg).addReg(ATReg); - BuildMI(MBB, II, DL, TII.get(Mips::ATMACRO)); } void MipsFrameLowering::emitPrologue(MachineFunction &MF) const { @@ -159,18 +155,22 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const { // Update stack size MFI->setStackSize(StackSize); - BuildMI(MBB, MBBI, dl, TII.get(Mips::NOREORDER)); - BuildMI(MBB, MBBI, dl, TII.get(Mips::NOMACRO)); - // Emit instructions that set the global base register if the target ABI is // O32. - if (isPIC && MipsFI->globalBaseRegSet() && STI.isABI_O32()) { - if (MipsFI->globalBaseRegFixed()) - BuildMI(MBB, llvm::prior(MBBI), dl, TII.get(Mips::CPLOAD)) - .addReg(RegInfo->getPICCallReg()); - else + if (isPIC && MipsFI->globalBaseRegSet() && STI.isABI_O32() && + !MipsFI->globalBaseRegFixed()) { // See MipsInstrInfo.td for explanation. - BuildMI(MBB, MBBI, dl, TII.get(Mips:: SETGP01), Mips::V0); + MachineBasicBlock *NewEntry = MF.CreateMachineBasicBlock(); + MF.insert(&MBB, NewEntry); + NewEntry->addSuccessor(&MBB); + + // Copy live in registers. + for (MachineBasicBlock::livein_iterator R = MBB.livein_begin(); + R != MBB.livein_end(); ++R) + NewEntry->addLiveIn(*R); + + BuildMI(*NewEntry, NewEntry->begin(), dl, TII.get(Mips:: SETGP01), + Mips::V0); } // No need to allocate space on the stack. @@ -183,8 +183,10 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const { // Adjust stack. if (isInt<16>(-StackSize)) // addi sp, sp, (-stacksize) BuildMI(MBB, MBBI, dl, TII.get(ADDiu), SP).addReg(SP).addImm(-StackSize); - else // Expand immediate that doesn't fit in 16-bit. + else { // Expand immediate that doesn't fit in 16-bit. 
+ MipsFI->setEmitNOAT(); expandLargeImm(SP, -StackSize, STI.isABI_N64(), TII, MBB, MBBI, dl); + } // emit ".cfi_def_cfa_offset StackSize" MCSymbol *AdjustSPLabel = MMI.getContext().CreateTempSymbol(); @@ -254,12 +256,8 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const { // Restore GP from the saved stack location if (MipsFI->needGPSaveRestore()) { unsigned Offset = MFI->getObjectOffset(MipsFI->getGPFI()); - BuildMI(MBB, MBBI, dl, TII.get(Mips::CPRESTORE)).addImm(Offset); - - if (Offset >= 0x8000) { - BuildMI(MBB, llvm::prior(MBBI), dl, TII.get(Mips::MACRO)); - BuildMI(MBB, MBBI, dl, TII.get(Mips::NOMACRO)); - } + BuildMI(MBB, MBBI, dl, TII.get(Mips::CPRESTORE)).addImm(Offset) + .addReg(Mips::GP); } } diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp index 536879e..f0651c6 100644 --- a/lib/Target/Mips/MipsISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp @@ -142,6 +142,7 @@ void MipsDAGToDAGISel::InitGlobalBaseReg(MachineFunction &MF) { if (Subtarget.isABI_N64()) { MF.getRegInfo().addLiveIn(Mips::T9_64); + MBB.addLiveIn(Mips::T9_64); // lui $v0, %hi(%neg(%gp_rel(fname))) // daddu $v1, $v0, $t9 @@ -163,6 +164,7 @@ void MipsDAGToDAGISel::InitGlobalBaseReg(MachineFunction &MF) { .addExternalSymbol("__gnu_local_gp", MipsII::MO_ABS_LO); } else { MF.getRegInfo().addLiveIn(Mips::T9); + MBB.addLiveIn(Mips::T9); if (Subtarget.isABI_N32()) { // lui $v0, %hi(%neg(%gp_rel(fname))) diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index ecde5b6..6a23bc3 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -147,6 +147,11 @@ MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom); setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); + if (!TM.Options.NoNaNsFPMath) { + setOperationAction(ISD::FABS, MVT::f32, Custom); + setOperationAction(ISD::FABS, MVT::f64, Custom); + } + if (HasMips64) { setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); setOperationAction(ISD::BlockAddress, MVT::i64, Custom); @@ -208,6 +213,13 @@ MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::FEXP, MVT::f32, Expand); setOperationAction(ISD::FMA, MVT::f32, Expand); setOperationAction(ISD::FMA, MVT::f64, Expand); + setOperationAction(ISD::FREM, MVT::f32, Expand); + setOperationAction(ISD::FREM, MVT::f64, Expand); + + if (!TM.Options.NoNaNsFPMath) { + setOperationAction(ISD::FNEG, MVT::f32, Expand); + setOperationAction(ISD::FNEG, MVT::f64, Expand); + } setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand); @@ -732,6 +744,7 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); + case ISD::FABS: return LowerFABS(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::MEMBARRIER: return LowerMEMBARRIER(Op, DAG); case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG); @@ -1541,7 +1554,7 @@ SDValue MipsTargetLowering::LowerGlobalAddress(SDValue Op, EVT ValTy = Op.getValueType(); bool HasGotOfst = (GV->hasInternalLinkage() || (GV->hasLocalLinkage() && !isa<Function>(GV))); - unsigned GotFlag = IsN64 ? + unsigned GotFlag = HasMips64 ? (HasGotOfst ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT_DISP) : (HasGotOfst ? 
MipsII::MO_GOT : MipsII::MO_GOT16); SDValue GA = DAG.getTargetGlobalAddress(GV, dl, ValTy, 0, GotFlag); @@ -1553,8 +1566,8 @@ SDValue MipsTargetLowering::LowerGlobalAddress(SDValue Op, if (!HasGotOfst) return ResNode; SDValue GALo = DAG.getTargetGlobalAddress(GV, dl, ValTy, 0, - IsN64 ? MipsII::MO_GOT_OFST : - MipsII::MO_ABS_LO); + HasMips64 ? MipsII::MO_GOT_OFST : + MipsII::MO_ABS_LO); SDValue Lo = DAG.getNode(MipsISD::Lo, dl, ValTy, GALo); return DAG.getNode(ISD::ADD, dl, ValTy, ResNode, Lo); } @@ -1575,8 +1588,8 @@ SDValue MipsTargetLowering::LowerBlockAddress(SDValue Op, } EVT ValTy = Op.getValueType(); - unsigned GOTFlag = IsN64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT; - unsigned OFSTFlag = IsN64 ? MipsII::MO_GOT_OFST : MipsII::MO_ABS_LO; + unsigned GOTFlag = HasMips64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT; + unsigned OFSTFlag = HasMips64 ? MipsII::MO_GOT_OFST : MipsII::MO_ABS_LO; SDValue BAGOTOffset = DAG.getBlockAddress(BA, ValTy, true, GOTFlag); BAGOTOffset = DAG.getNode(MipsISD::Wrapper, dl, ValTy, GetGlobalReg(DAG, ValTy), BAGOTOffset); @@ -1679,8 +1692,8 @@ LowerJumpTable(SDValue Op, SelectionDAG &DAG) const HiPart = DAG.getNode(MipsISD::Hi, dl, PtrVT, JTI); JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MipsII::MO_ABS_LO); } else {// Emit Load from Global Pointer - unsigned GOTFlag = IsN64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT; - unsigned OfstFlag = IsN64 ? MipsII::MO_GOT_OFST : MipsII::MO_ABS_LO; + unsigned GOTFlag = HasMips64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT; + unsigned OfstFlag = HasMips64 ? MipsII::MO_GOT_OFST : MipsII::MO_ABS_LO; JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, GOTFlag); JTI = DAG.getNode(MipsISD::Wrapper, dl, PtrVT, GetGlobalReg(DAG, PtrVT), JTI); @@ -1712,7 +1725,7 @@ LowerConstantPool(SDValue Op, SelectionDAG &DAG) const // SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(MVT::i32); // ResNode = DAG.getNode(ISD::ADD, MVT::i32, GOT, GPRelNode); - if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { + if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64) { SDValue CPHi = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment(), N->getOffset(), MipsII::MO_ABS_HI); SDValue CPLo = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment(), @@ -1722,8 +1735,8 @@ LowerConstantPool(SDValue Op, SelectionDAG &DAG) const ResNode = DAG.getNode(ISD::ADD, dl, MVT::i32, HiPart, Lo); } else { EVT ValTy = Op.getValueType(); - unsigned GOTFlag = IsN64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT; - unsigned OFSTFlag = IsN64 ? MipsII::MO_GOT_OFST : MipsII::MO_ABS_LO; + unsigned GOTFlag = HasMips64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT; + unsigned OFSTFlag = HasMips64 ? MipsII::MO_GOT_OFST : MipsII::MO_ABS_LO; SDValue CP = DAG.getTargetConstantPool(C, ValTy, N->getAlignment(), N->getOffset(), GOTFlag); CP = DAG.getNode(MipsISD::Wrapper, dl, ValTy, GetGlobalReg(DAG, ValTy), CP); @@ -1754,66 +1767,162 @@ SDValue MipsTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachinePointerInfo(SV), false, false, 0); } -// Called if the size of integer registers is large enough to hold the whole -// floating point number. -static SDValue LowerFCOPYSIGNLargeIntReg(SDValue Op, SelectionDAG &DAG) { - // FIXME: Use ext/ins instructions if target architecture is Mips32r2. 
- EVT ValTy = Op.getValueType(); - EVT IntValTy = MVT::getIntegerVT(ValTy.getSizeInBits()); - uint64_t Mask = (uint64_t)1 << (ValTy.getSizeInBits() - 1); - DebugLoc dl = Op.getDebugLoc(); - SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, IntValTy, Op.getOperand(0)); - SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, IntValTy, Op.getOperand(1)); - SDValue And0 = DAG.getNode(ISD::AND, dl, IntValTy, Op0, - DAG.getConstant(Mask - 1, IntValTy)); - SDValue And1 = DAG.getNode(ISD::AND, dl, IntValTy, Op1, - DAG.getConstant(Mask, IntValTy)); - SDValue Result = DAG.getNode(ISD::OR, dl, IntValTy, And0, And1); - return DAG.getNode(ISD::BITCAST, dl, ValTy, Result); -} - -// Called if the size of integer registers is not large enough to hold the whole -// floating point number (e.g. f64 & 32-bit integer register). -static SDValue -LowerFCOPYSIGNSmallIntReg(SDValue Op, SelectionDAG &DAG, bool isLittle) { - // FIXME: - // Use ext/ins instructions if target architecture is Mips32r2. - // Eliminate redundant mfc1 and mtc1 instructions. - unsigned LoIdx = 0, HiIdx = 1; - - if (!isLittle) - std::swap(LoIdx, HiIdx); +static SDValue LowerFCOPYSIGN32(SDValue Op, SelectionDAG &DAG, bool HasR2) { + EVT TyX = Op.getOperand(0).getValueType(); + EVT TyY = Op.getOperand(1).getValueType(); + SDValue Const1 = DAG.getConstant(1, MVT::i32); + SDValue Const31 = DAG.getConstant(31, MVT::i32); + DebugLoc DL = Op.getDebugLoc(); + SDValue Res; + + // If operand is of type f64, extract the upper 32-bit. Otherwise, bitcast it + // to i32. + SDValue X = (TyX == MVT::f32) ? + DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op.getOperand(0)) : + DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32, Op.getOperand(0), + Const1); + SDValue Y = (TyY == MVT::f32) ? + DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op.getOperand(1)) : + DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32, Op.getOperand(1), + Const1); + + if (HasR2) { + // ext E, Y, 31, 1 ; extract bit31 of Y + // ins X, E, 31, 1 ; insert extracted bit at bit31 of X + SDValue E = DAG.getNode(MipsISD::Ext, DL, MVT::i32, Y, Const31, Const1); + Res = DAG.getNode(MipsISD::Ins, DL, MVT::i32, E, Const31, Const1, X); + } else { + // sll SllX, X, 1 + // srl SrlX, SllX, 1 + // srl SrlY, Y, 31 + // sll SllY, SrlX, 31 + // or Or, SrlX, SllY + SDValue SllX = DAG.getNode(ISD::SHL, DL, MVT::i32, X, Const1); + SDValue SrlX = DAG.getNode(ISD::SRL, DL, MVT::i32, SllX, Const1); + SDValue SrlY = DAG.getNode(ISD::SRL, DL, MVT::i32, Y, Const31); + SDValue SllY = DAG.getNode(ISD::SHL, DL, MVT::i32, SrlY, Const31); + Res = DAG.getNode(ISD::OR, DL, MVT::i32, SrlX, SllY); + } - DebugLoc dl = Op.getDebugLoc(); - SDValue Word0 = DAG.getNode(MipsISD::ExtractElementF64, dl, MVT::i32, - Op.getOperand(0), - DAG.getConstant(LoIdx, MVT::i32)); - SDValue Hi0 = DAG.getNode(MipsISD::ExtractElementF64, dl, MVT::i32, - Op.getOperand(0), DAG.getConstant(HiIdx, MVT::i32)); - SDValue Hi1 = DAG.getNode(MipsISD::ExtractElementF64, dl, MVT::i32, - Op.getOperand(1), DAG.getConstant(HiIdx, MVT::i32)); - SDValue And0 = DAG.getNode(ISD::AND, dl, MVT::i32, Hi0, - DAG.getConstant(0x7fffffff, MVT::i32)); - SDValue And1 = DAG.getNode(ISD::AND, dl, MVT::i32, Hi1, - DAG.getConstant(0x80000000, MVT::i32)); - SDValue Word1 = DAG.getNode(ISD::OR, dl, MVT::i32, And0, And1); + if (TyX == MVT::f32) + return DAG.getNode(ISD::BITCAST, DL, Op.getOperand(0).getValueType(), Res); - if (!isLittle) - std::swap(Word0, Word1); + SDValue LowX = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32, + Op.getOperand(0), DAG.getConstant(0, MVT::i32)); + return 
DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, LowX, Res); +} - return DAG.getNode(MipsISD::BuildPairF64, dl, MVT::f64, Word0, Word1); +static SDValue LowerFCOPYSIGN64(SDValue Op, SelectionDAG &DAG, bool HasR2) { + unsigned WidthX = Op.getOperand(0).getValueSizeInBits(); + unsigned WidthY = Op.getOperand(1).getValueSizeInBits(); + EVT TyX = MVT::getIntegerVT(WidthX), TyY = MVT::getIntegerVT(WidthY); + SDValue Const1 = DAG.getConstant(1, MVT::i32); + DebugLoc DL = Op.getDebugLoc(); + + // Bitcast to integer nodes. + SDValue X = DAG.getNode(ISD::BITCAST, DL, TyX, Op.getOperand(0)); + SDValue Y = DAG.getNode(ISD::BITCAST, DL, TyY, Op.getOperand(1)); + + if (HasR2) { + // ext E, Y, width(Y) - 1, 1 ; extract bit width(Y)-1 of Y + // ins X, E, width(X) - 1, 1 ; insert extracted bit at bit width(X)-1 of X + SDValue E = DAG.getNode(MipsISD::Ext, DL, TyY, Y, + DAG.getConstant(WidthY - 1, MVT::i32), Const1); + + if (WidthX > WidthY) + E = DAG.getNode(ISD::ZERO_EXTEND, DL, TyX, E); + else if (WidthY > WidthX) + E = DAG.getNode(ISD::TRUNCATE, DL, TyX, E); + + SDValue I = DAG.getNode(MipsISD::Ins, DL, TyX, E, + DAG.getConstant(WidthX - 1, MVT::i32), Const1, X); + return DAG.getNode(ISD::BITCAST, DL, Op.getOperand(0).getValueType(), I); + } + + // (d)sll SllX, X, 1 + // (d)srl SrlX, SllX, 1 + // (d)srl SrlY, Y, width(Y)-1 + // (d)sll SllY, SrlX, width(Y)-1 + // or Or, SrlX, SllY + SDValue SllX = DAG.getNode(ISD::SHL, DL, TyX, X, Const1); + SDValue SrlX = DAG.getNode(ISD::SRL, DL, TyX, SllX, Const1); + SDValue SrlY = DAG.getNode(ISD::SRL, DL, TyY, Y, + DAG.getConstant(WidthY - 1, MVT::i32)); + + if (WidthX > WidthY) + SrlY = DAG.getNode(ISD::ZERO_EXTEND, DL, TyX, SrlY); + else if (WidthY > WidthX) + SrlY = DAG.getNode(ISD::TRUNCATE, DL, TyX, SrlY); + + SDValue SllY = DAG.getNode(ISD::SHL, DL, TyX, SrlY, + DAG.getConstant(WidthX - 1, MVT::i32)); + SDValue Or = DAG.getNode(ISD::OR, DL, TyX, SrlX, SllY); + return DAG.getNode(ISD::BITCAST, DL, Op.getOperand(0).getValueType(), Or); } SDValue MipsTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { - EVT Ty = Op.getValueType(); + if (Subtarget->hasMips64()) + return LowerFCOPYSIGN64(Op, DAG, Subtarget->hasMips32r2()); - assert(Ty == MVT::f32 || Ty == MVT::f64); + return LowerFCOPYSIGN32(Op, DAG, Subtarget->hasMips32r2()); +} - if (Ty == MVT::f32 || HasMips64) - return LowerFCOPYSIGNLargeIntReg(Op, DAG); +static SDValue LowerFABS32(SDValue Op, SelectionDAG &DAG, bool HasR2) { + SDValue Res, Const1 = DAG.getConstant(1, MVT::i32); + DebugLoc DL = Op.getDebugLoc(); + + // If operand is of type f64, extract the upper 32-bit. Otherwise, bitcast it + // to i32. + SDValue X = (Op.getValueType() == MVT::f32) ? + DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op.getOperand(0)) : + DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32, Op.getOperand(0), + Const1); + + // Clear MSB. 
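Both LowerFCOPYSIGN32 above and the LowerFABS32 body that resumes below lean on the same integer identities: a left shift by one followed by a logical right shift by one clears bit 31 (the sign), and or-ing in the isolated top bit of Y transplants Y's sign onto X; on r2 cores a single ins (of an ext'ed bit, or from $zero for fabs) replaces the shift pair. A self-contained C++ check of the shift-pair arithmetic on a 32-bit float (a sketch of the math only, not of the SelectionDAG code):

#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t bits(float f) { uint32_t u; std::memcpy(&u, &f, 4); return u; }
static float flt(uint32_t u) { float f; std::memcpy(&f, &u, 4); return f; }

// fabs: sll by 1 then srl by 1 clears the sign bit.
float fabs32(float x) { return flt((bits(x) << 1) >> 1); }

// copysign: magnitude of x (sll/srl), sign of y (srl 31 / sll 31), or'ed together.
float copysign32(float x, float y) {
  return flt(((bits(x) << 1) >> 1) | ((bits(y) >> 31) << 31));
}

int main() {
  assert(fabs32(-3.5f) == 3.5f && fabs32(3.5f) == 3.5f);
  assert(copysign32(2.0f, -1.0f) == -2.0f);
  assert(copysign32(-2.0f, 1.0f) == 2.0f);
  return 0;
}

This also explains the NoNaNsFPMath guards elsewhere in the patch: the bit-level route touches only the sign bit, whereas abs.s/neg.s are arithmetic FPU instructions on MIPS and are not guaranteed to be mere sign-bit operations on NaN inputs.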
+ if (HasR2) + Res = DAG.getNode(MipsISD::Ins, DL, MVT::i32, + DAG.getRegister(Mips::ZERO, MVT::i32), + DAG.getConstant(31, MVT::i32), Const1, X); + else { + SDValue SllX = DAG.getNode(ISD::SHL, DL, MVT::i32, X, Const1); + Res = DAG.getNode(ISD::SRL, DL, MVT::i32, SllX, Const1); + } + + if (Op.getValueType() == MVT::f32) + return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Res); + + SDValue LowX = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32, + Op.getOperand(0), DAG.getConstant(0, MVT::i32)); + return DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, LowX, Res); +} + +static SDValue LowerFABS64(SDValue Op, SelectionDAG &DAG, bool HasR2) { + SDValue Res, Const1 = DAG.getConstant(1, MVT::i32); + DebugLoc DL = Op.getDebugLoc(); + + // Bitcast to integer node. + SDValue X = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Op.getOperand(0)); + + // Clear MSB. + if (HasR2) + Res = DAG.getNode(MipsISD::Ins, DL, MVT::i64, + DAG.getRegister(Mips::ZERO_64, MVT::i64), + DAG.getConstant(63, MVT::i32), Const1, X); + else { + SDValue SllX = DAG.getNode(ISD::SHL, DL, MVT::i64, X, Const1); + Res = DAG.getNode(ISD::SRL, DL, MVT::i64, SllX, Const1); + } - return LowerFCOPYSIGNSmallIntReg(Op, DAG, Subtarget->isLittle()); + return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Res); +} + +SDValue +MipsTargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const { + if (Subtarget->hasMips64() && (Op.getValueType() == MVT::f64)) + return LowerFABS64(Op, DAG, Subtarget->hasMips32r2()); + + return LowerFABS32(Op, DAG, Subtarget->hasMips32r2()); } SDValue MipsTargetLowering:: @@ -2545,7 +2654,8 @@ MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, static void ReadByValArg(MachineFunction &MF, SDValue Chain, DebugLoc dl, std::vector<SDValue>& OutChains, SelectionDAG &DAG, unsigned NumWords, SDValue FIN, - const CCValAssign &VA, const ISD::ArgFlagsTy& Flags) { + const CCValAssign &VA, const ISD::ArgFlagsTy& Flags, + const Argument *FuncArg) { unsigned LocMem = VA.getLocMemOffset(); unsigned FirstWord = LocMem / 4; @@ -2560,8 +2670,8 @@ static void ReadByValArg(MachineFunction &MF, SDValue Chain, DebugLoc dl, SDValue StorePtr = DAG.getNode(ISD::ADD, dl, MVT::i32, FIN, DAG.getConstant(i * 4, MVT::i32)); SDValue Store = DAG.getStore(Chain, dl, DAG.getRegister(Reg, MVT::i32), - StorePtr, MachinePointerInfo(), false, - false, 0); + StorePtr, MachinePointerInfo(FuncArg, i * 4), + false, false, 0); OutChains.push_back(Store); } } @@ -2573,7 +2683,7 @@ CopyMips64ByValRegs(MachineFunction &MF, SDValue Chain, DebugLoc dl, const CCValAssign &VA, const ISD::ArgFlagsTy& Flags, MachineFrameInfo *MFI, bool IsRegLoc, SmallVectorImpl<SDValue> &InVals, MipsFunctionInfo *MipsFI, - EVT PtrTy) { + EVT PtrTy, const Argument *FuncArg) { const uint16_t *Reg = Mips64IntRegs + 8; int FOOffset; // Frame object offset from virtual frame pointer. @@ -2597,8 +2707,8 @@ CopyMips64ByValRegs(MachineFunction &MF, SDValue Chain, DebugLoc dl, SDValue StorePtr = DAG.getNode(ISD::ADD, dl, PtrTy, FIN, DAG.getConstant(I * 8, PtrTy)); SDValue Store = DAG.getStore(Chain, dl, DAG.getRegister(VReg, MVT::i64), - StorePtr, MachinePointerInfo(), false, - false, 0); + StorePtr, MachinePointerInfo(FuncArg, I * 8), + false, false, 0); OutChains.push_back(Store); } @@ -2634,9 +2744,11 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, else CCInfo.AnalyzeFormalArguments(Ins, CC_Mips); + Function::const_arg_iterator FuncArg = + DAG.getMachineFunction().getFunction()->arg_begin(); int LastFI = 0;// MipsFI->LastInArgFI is 0 at the entry of this function. 
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i, ++FuncArg) { CCValAssign &VA = ArgLocs[i]; EVT ValVT = VA.getValVT(); ISD::ArgFlagsTy Flags = Ins[i].Flags; @@ -2651,11 +2763,12 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, true); SDValue FIN = DAG.getFrameIndex(LastFI, getPointerTy()); InVals.push_back(FIN); - ReadByValArg(MF, Chain, dl, OutChains, DAG, NumWords, FIN, VA, Flags); + ReadByValArg(MF, Chain, dl, OutChains, DAG, NumWords, FIN, VA, Flags, + &*FuncArg); } else // N32/64 LastFI = CopyMips64ByValRegs(MF, Chain, dl, OutChains, DAG, VA, Flags, MFI, IsRegLoc, InVals, MipsFI, - getPointerTy()); + getPointerTy(), &*FuncArg); continue; } diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h index 66f45cd..c36f40f 100644 --- a/lib/Target/Mips/MipsISelLowering.h +++ b/lib/Target/Mips/MipsISelLowering.h @@ -131,6 +131,7 @@ namespace llvm { SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG& DAG) const; SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const; diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td index fe5eaec..14d8f1e 100644 --- a/lib/Target/Mips/MipsInstrFPU.td +++ b/lib/Target/Mips/MipsInstrFPU.td @@ -47,17 +47,17 @@ def MipsExtractElementF64 : SDNode<"MipsISD::ExtractElementF64", SDT_MipsExtractElementF64>; // Operand for printing out a condition code. -let PrintMethod = "printFCCOperand" in +let PrintMethod = "printFCCOperand", DecoderMethod = "DecodeCondCode" in def condcode : Operand<i32>; //===----------------------------------------------------------------------===// // Feature predicates. //===----------------------------------------------------------------------===// -def IsFP64bit : Predicate<"Subtarget.isFP64bit()">; -def NotFP64bit : Predicate<"!Subtarget.isFP64bit()">; -def IsSingleFloat : Predicate<"Subtarget.isSingleFloat()">; -def IsNotSingleFloat : Predicate<"!Subtarget.isSingleFloat()">; +def IsFP64bit : Predicate<"Subtarget.isFP64bit()">, AssemblerPredicate<"FeatureFP64Bit">; +def NotFP64bit : Predicate<"!Subtarget.isFP64bit()">, AssemblerPredicate<"!FeatureFP64Bit">; +def IsSingleFloat : Predicate<"Subtarget.isSingleFloat()">, AssemblerPredicate<"FeatureSingleFloat">; +def IsNotSingleFloat : Predicate<"!Subtarget.isSingleFloat()">, AssemblerPredicate<"!FeatureSingleFloat">; // FP immediate patterns. def fpimm0 : PatLeaf<(fpimm), [{ @@ -83,6 +83,7 @@ def fpimm0neg : PatLeaf<(fpimm), [{ //===----------------------------------------------------------------------===// // FP load. +let DecoderMethod = "DecodeFMem" in { class FPLoad<bits<6> op, string opstr, RegisterClass RC, Operand MemOpnd>: FMem<op, (outs RC:$ft), (ins MemOpnd:$addr), !strconcat(opstr, "\t$ft, $addr"), [(set RC:$ft, (load_a addr:$addr))], @@ -93,7 +94,7 @@ class FPStore<bits<6> op, string opstr, RegisterClass RC, Operand MemOpnd>: FMem<op, (outs), (ins RC:$ft, MemOpnd:$addr), !strconcat(opstr, "\t$ft, $addr"), [(store_a RC:$ft, addr:$addr)], IIStore>; - +} // FP indexed load. 
class FPIdxLoad<bits<6> funct, string opstr, RegisterClass DRC, RegisterClass PRC, PatFrag FOp>: @@ -118,11 +119,13 @@ multiclass FFR1_W_M<bits<6> funct, string opstr> { def _D32 : FFR1<funct, 17, opstr, "w.d", FGR32, AFGR64>, Requires<[NotFP64bit]>; def _D64 : FFR1<funct, 17, opstr, "w.d", FGR32, FGR64>, - Requires<[IsFP64bit]>; + Requires<[IsFP64bit]> { + let DecoderNamespace = "Mips64"; + } } // Instructions that convert an FP value to 64-bit fixed point. -let Predicates = [IsFP64bit] in +let Predicates = [IsFP64bit], DecoderNamespace = "Mips64" in multiclass FFR1_L_M<bits<6> funct, string opstr> { def _S : FFR1<funct, 16, opstr, "l.s", FGR64, FGR32>; def _D64 : FFR1<funct, 17, opstr, "l.d", FGR64, FGR64>; @@ -134,7 +137,9 @@ multiclass FFR1P_M<bits<6> funct, string opstr, SDNode OpNode> { def _D32 : FFR1P<funct, 17, opstr, "d", AFGR64, AFGR64, OpNode>, Requires<[NotFP64bit]>; def _D64 : FFR1P<funct, 17, opstr, "d", FGR64, FGR64, OpNode>, - Requires<[IsFP64bit]>; + Requires<[IsFP64bit]> { + let DecoderNamespace = "Mips64"; + } } multiclass FFR2P_M<bits<6> funct, string opstr, SDNode OpNode, bit isComm = 0> { @@ -143,9 +148,11 @@ multiclass FFR2P_M<bits<6> funct, string opstr, SDNode OpNode, bit isComm = 0> { def _D32 : FFR2P<funct, 17, opstr, "d", AFGR64, OpNode>, Requires<[NotFP64bit]>; def _D64 : FFR2P<funct, 17, opstr, "d", FGR64, OpNode>, - Requires<[IsFP64bit]>; + Requires<[IsFP64bit]> { + let DecoderNamespace = "Mips64"; } } +} // FP madd/msub/nmadd/nmsub instruction classes. class FMADDSUB<bits<3> funct, bits<3> fmt, string opstr, string fmtstr, @@ -172,9 +179,11 @@ defm CEIL_L : FFR1_L_M<0xa, "ceil">; defm FLOOR_W : FFR1_W_M<0xf, "floor">; defm FLOOR_L : FFR1_L_M<0xb, "floor">; defm CVT_W : FFR1_W_M<0x24, "cvt">; -defm CVT_L : FFR1_L_M<0x25, "cvt">; +//defm CVT_L : FFR1_L_M<0x25, "cvt">; def CVT_S_W : FFR1<0x20, 20, "cvt", "s.w", FGR32, FGR32>; +def CVT_L_S : FFR1<0x25, 16, "cvt", "l.s", FGR64, FGR32>; +def CVT_L_D64: FFR1<0x25, 17, "cvt", "l.d", FGR64, FGR64>; let Predicates = [NotFP64bit] in { def CVT_S_D32 : FFR1<0x20, 17, "cvt", "s.d", FGR32, AFGR64>; @@ -182,7 +191,7 @@ let Predicates = [NotFP64bit] in { def CVT_D32_S : FFR1<0x21, 16, "cvt", "d.s", AFGR64, FGR32>; } -let Predicates = [IsFP64bit] in { +let Predicates = [IsFP64bit], DecoderNamespace = "Mips64" in { def CVT_S_D64 : FFR1<0x20, 17, "cvt", "s.d", FGR32, FGR64>; def CVT_S_L : FFR1<0x20, 21, "cvt", "s.l", FGR32, FGR64>; def CVT_D64_W : FFR1<0x21, 20, "cvt", "d.w", FGR64, FGR32>; @@ -190,8 +199,10 @@ let Predicates = [IsFP64bit] in { def CVT_D64_L : FFR1<0x21, 21, "cvt", "d.l", FGR64, FGR64>; } -defm FABS : FFR1P_M<0x5, "abs", fabs>; -defm FNEG : FFR1P_M<0x7, "neg", fneg>; +let Predicates = [NoNaNsFPMath] in { + defm FABS : FFR1P_M<0x5, "abs", fabs>; + defm FNEG : FFR1P_M<0x7, "neg", fneg>; +} defm FSQRT : FFR1P_M<0x4, "sqrt", fsqrt>; // The odd-numbered registers are only referenced when doing loads, @@ -233,14 +244,20 @@ def FMOV_S : FFR1<0x6, 16, "mov", "s", FGR32, FGR32>; def FMOV_D32 : FFR1<0x6, 17, "mov", "d", AFGR64, AFGR64>, Requires<[NotFP64bit]>; def FMOV_D64 : FFR1<0x6, 17, "mov", "d", FGR64, FGR64>, - Requires<[IsFP64bit]>; + Requires<[IsFP64bit]> { + let DecoderNamespace = "Mips64"; +} /// Floating Point Memory Instructions -let Predicates = [IsN64] in { +let Predicates = [IsN64], DecoderNamespace = "Mips64" in { def LWC1_P8 : FPLoad<0x31, "lwc1", FGR32, mem64>; def SWC1_P8 : FPStore<0x39, "swc1", FGR32, mem64>; - def LDC164_P8 : FPLoad<0x35, "ldc1", FGR64, mem64>; - def SDC164_P8 : FPStore<0x3d, 
"sdc1", FGR64, mem64>; + def LDC164_P8 : FPLoad<0x35, "ldc1", FGR64, mem64> { + let isCodeGenOnly =1; + } + def SDC164_P8 : FPStore<0x3d, "sdc1", FGR64, mem64> { + let isCodeGenOnly =1; + } } let Predicates = [NotN64] in { @@ -248,7 +265,7 @@ let Predicates = [NotN64] in { def SWC1 : FPStore<0x39, "swc1", FGR32, mem>; } -let Predicates = [NotN64, HasMips64] in { +let Predicates = [NotN64, HasMips64], DecoderNamespace = "Mips64" in { def LDC164 : FPLoad<0x35, "ldc1", FGR64, mem>; def SDC164 : FPStore<0x3d, "sdc1", FGR64, mem>; } @@ -271,13 +288,13 @@ let Predicates = [HasMips32r2, NotMips64] in { def SDXC1 : FPIdxStore<0x9, "sdxc1", AFGR64, CPURegs, store_a>; } -let Predicates = [HasMips64, NotN64] in { +let Predicates = [HasMips64, NotN64], DecoderNamespace="Mips64" in { def LDXC164 : FPIdxLoad<0x1, "ldxc1", FGR64, CPURegs, load_a>; def SDXC164 : FPIdxStore<0x9, "sdxc1", FGR64, CPURegs, store_a>; } // n64 -let Predicates = [IsN64] in { +let Predicates = [IsN64], isCodeGenOnly=1 in { def LWXC1_P8 : FPIdxLoad<0x0, "lwxc1", FGR32, CPU64Regs, load_a>; def LUXC1_P8 : FPIdxLoad<0x5, "luxc1", FGR32, CPU64Regs, load_u>; def LDXC164_P8 : FPIdxLoad<0x1, "ldxc1", FGR64, CPU64Regs, load_a>; @@ -312,12 +329,12 @@ let Predicates = [HasMips32r2, NotFP64bit, NoNaNsFPMath] in { def NMSUB_D32 : FNMADDSUB<0x7, 1, "nmsub", "d", fsub, AFGR64>; } -let Predicates = [HasMips32r2, IsFP64bit] in { +let Predicates = [HasMips32r2, IsFP64bit], isCodeGenOnly=1 in { def MADD_D64 : FMADDSUB<0x4, 1, "madd", "d", fadd, FGR64>; def MSUB_D64 : FMADDSUB<0x5, 1, "msub", "d", fsub, FGR64>; } -let Predicates = [HasMips32r2, IsFP64bit, NoNaNsFPMath] in { +let Predicates = [HasMips32r2, IsFP64bit, NoNaNsFPMath], isCodeGenOnly=1 in { def NMADD_D64 : FNMADDSUB<0x6, 1, "nmadd", "d", fadd, FGR64>; def NMSUB_D64 : FNMADDSUB<0x7, 1, "nmsub", "d", fsub, FGR64>; } @@ -340,9 +357,10 @@ let isBranch=1, isTerminator=1, hasDelaySlot=1, base=0x8, Uses=[FCR31] in let Inst{16} = tf; } +let DecoderMethod = "DecodeBC1" in { def BC1F : FBRANCH<0, 0, MIPS_BRANCH_F, "bc1f">; def BC1T : FBRANCH<0, 1, MIPS_BRANCH_T, "bc1t">; - +} //===----------------------------------------------------------------------===// // Floating Point Flag Conditions //===----------------------------------------------------------------------===// @@ -374,7 +392,9 @@ class FCMP<bits<5> fmt, RegisterClass RC, string typestr> : let Defs=[FCR31] in { def FCMP_S32 : FCMP<0x10, FGR32, "s">; def FCMP_D32 : FCMP<0x11, AFGR64, "d">, Requires<[NotFP64bit]>; - def FCMP_D64 : FCMP<0x11, FGR64, "d">, Requires<[IsFP64bit]>; + def FCMP_D64 : FCMP<0x11, FGR64, "d">, Requires<[IsFP64bit]> { + let DecoderNamespace = "Mips64"; + } } //===----------------------------------------------------------------------===// @@ -436,13 +456,13 @@ let Predicates = [IsFP64bit] in { // Patterns for unaligned floating point loads and stores. 
let Predicates = [HasMips32r2Or64, NotN64] in { - def : Pat<(f32 (load_u CPURegs:$addr)), (LUXC1 CPURegs:$addr, ZERO)>; + def : Pat<(f32 (load_u CPURegs:$addr)), (LUXC1 CPURegs:$addr, ZERO)>; def : Pat<(store_u FGR32:$src, CPURegs:$addr), (SUXC1 FGR32:$src, CPURegs:$addr, ZERO)>; } let Predicates = [IsN64] in { - def : Pat<(f32 (load_u CPU64Regs:$addr)), (LUXC1_P8 CPU64Regs:$addr, ZERO_64)>; + def : Pat<(f32 (load_u CPU64Regs:$addr)), (LUXC1_P8 CPU64Regs:$addr, ZERO_64)>; def : Pat<(store_u FGR32:$src, CPU64Regs:$addr), (SUXC1_P8 FGR32:$src, CPU64Regs:$addr, ZERO_64)>; } diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td index 4555303..841eba0 100644 --- a/lib/Target/Mips/MipsInstrFormats.td +++ b/lib/Target/Mips/MipsInstrFormats.td @@ -45,6 +45,8 @@ class MipsInst<dag outs, dag ins, string asmstr, list<dag> pattern, let Namespace = "Mips"; + let Size = 4; + bits<6> Opcode = 0; // Top 6 bits are the 'opcode' field @@ -64,6 +66,10 @@ class MipsInst<dag outs, dag ins, string asmstr, list<dag> pattern, // TSFlags layout should be kept in sync with MipsInstrInfo.h. let TSFlags{3-0} = FormBits; + + let DecoderNamespace = "Mips"; + + field bits<32> SoftFail = 0; } // Mips Pseudo Instructions Format diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index bc85fa6..873d2bd 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -121,21 +121,36 @@ def MipsIns : SDNode<"MipsISD::Ins", SDT_Ins>; //===----------------------------------------------------------------------===// // Mips Instruction Predicate Definitions. //===----------------------------------------------------------------------===// -def HasSEInReg : Predicate<"Subtarget.hasSEInReg()">; -def HasBitCount : Predicate<"Subtarget.hasBitCount()">; -def HasSwap : Predicate<"Subtarget.hasSwap()">; -def HasCondMov : Predicate<"Subtarget.hasCondMov()">; -def HasMips32 : Predicate<"Subtarget.hasMips32()">; -def HasMips32r2 : Predicate<"Subtarget.hasMips32r2()">; -def HasMips64 : Predicate<"Subtarget.hasMips64()">; -def HasMips32r2Or64 : Predicate<"Subtarget.hasMips32r2Or64()">; -def NotMips64 : Predicate<"!Subtarget.hasMips64()">; -def HasMips64r2 : Predicate<"Subtarget.hasMips64r2()">; -def IsN64 : Predicate<"Subtarget.isABI_N64()">; -def NotN64 : Predicate<"!Subtarget.isABI_N64()">; -def RelocStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">; -def RelocPIC : Predicate<"TM.getRelocationModel() == Reloc::PIC_">; -def NoNaNsFPMath : Predicate<"TM.Options.NoNaNsFPMath">; +def HasSEInReg : Predicate<"Subtarget.hasSEInReg()">, + AssemblerPredicate<"FeatureSEInReg">; +def HasBitCount : Predicate<"Subtarget.hasBitCount()">, + AssemblerPredicate<"FeatureBitCount">; +def HasSwap : Predicate<"Subtarget.hasSwap()">, + AssemblerPredicate<"FeatureSwap">; +def HasCondMov : Predicate<"Subtarget.hasCondMov()">, + AssemblerPredicate<"FeatureCondMov">; +def HasMips32 : Predicate<"Subtarget.hasMips32()">, + AssemblerPredicate<"FeatureMips32">; +def HasMips32r2 : Predicate<"Subtarget.hasMips32r2()">, + AssemblerPredicate<"FeatureMips32r2">; +def HasMips64 : Predicate<"Subtarget.hasMips64()">, + AssemblerPredicate<"FeatureMips64">; +def HasMips32r2Or64 : Predicate<"Subtarget.hasMips32r2Or64()">, + AssemblerPredicate<"FeatureMips32r2,FeatureMips64">; +def NotMips64 : Predicate<"!Subtarget.hasMips64()">, + AssemblerPredicate<"!FeatureMips64">; +def HasMips64r2 : Predicate<"Subtarget.hasMips64r2()">, + AssemblerPredicate<"FeatureMips64r2">; +def IsN64 : 
Predicate<"Subtarget.isABI_N64()">, + AssemblerPredicate<"FeatureN64">; +def NotN64 : Predicate<"!Subtarget.isABI_N64()">, + AssemblerPredicate<"!FeatureN64">; +def RelocStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">, + AssemblerPredicate<"FeatureMips32">; +def RelocPIC : Predicate<"TM.getRelocationModel() == Reloc::PIC_">, + AssemblerPredicate<"FeatureMips32">; +def NoNaNsFPMath : Predicate<"TM.Options.NoNaNsFPMath">, + AssemblerPredicate<"FeatureMips32">; //===----------------------------------------------------------------------===// // Mips Operand, Complex Patterns and Transformations Definitions. @@ -148,12 +163,15 @@ def jmptarget : Operand<OtherVT> { def brtarget : Operand<OtherVT> { let EncoderMethod = "getBranchTargetOpValue"; let OperandType = "OPERAND_PCREL"; + let DecoderMethod = "DecodeBranchTarget"; } def calltarget : Operand<iPTR> { let EncoderMethod = "getJumpTargetOpValue"; } def calltarget64: Operand<i64>; -def simm16 : Operand<i32>; +def simm16 : Operand<i32> { + let DecoderMethod= "DecodeSimm16"; +} def simm16_64 : Operand<i64>; def shamt : Operand<i32>; @@ -189,11 +207,13 @@ def mem_ea_64 : Operand<i64> { // size operand of ext instruction def size_ext : Operand<i32> { let EncoderMethod = "getSizeExtEncoding"; + let DecoderMethod = "DecodeExtSize"; } // size operand of ins instruction def size_ins : Operand<i32> { let EncoderMethod = "getSizeInsEncoding"; + let DecoderMethod = "DecodeInsSize"; } // Transformation Function - get the lower 16 bits. @@ -295,6 +315,7 @@ class ArithLogicR<bits<6> op, bits<6> func, string instr_asm, SDNode OpNode, [(set RC:$rd, (OpNode RC:$rs, RC:$rt))], itin> { let shamt = 0; let isCommutable = isComm; + let isReMaterializable = 1; } class ArithOverflowR<bits<6> op, bits<6> func, string instr_asm, @@ -310,7 +331,9 @@ class ArithLogicI<bits<6> op, string instr_asm, SDNode OpNode, Operand Od, PatLeaf imm_type, RegisterClass RC> : FI<op, (outs RC:$rt), (ins RC:$rs, Od:$imm16), !strconcat(instr_asm, "\t$rt, $rs, $imm16"), - [(set RC:$rt, (OpNode RC:$rs, imm_type:$imm16))], IIAlu>; + [(set RC:$rt, (OpNode RC:$rs, imm_type:$imm16))], IIAlu> { + let isReMaterializable = 1; +} class ArithOverflowI<bits<6> op, string instr_asm, SDNode OpNode, Operand Od, PatLeaf imm_type, RegisterClass RC> : @@ -365,6 +388,8 @@ class LoadUpper<bits<6> op, string instr_asm, RegisterClass RC, Operand Imm>: FI<op, (outs RC:$rt), (ins Imm:$imm16), !strconcat(instr_asm, "\t$rt, $imm16"), [], IIAlu> { let rs = 0; + let neverHasSideEffects = 1; + let isReMaterializable = 1; } class FMem<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern, @@ -372,6 +397,7 @@ class FMem<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern, bits<21> addr; let Inst{25-21} = addr{20-16}; let Inst{15-0} = addr{15-0}; + let DecoderMethod = "DecodeMem"; } // Memory Load/Store @@ -406,7 +432,10 @@ multiclass LoadM32<bits<6> op, string instr_asm, PatFrag OpNode, def #NAME# : LoadM<op, instr_asm, OpNode, CPURegs, mem, Pseudo>, Requires<[NotN64]>; def _P8 : LoadM<op, instr_asm, OpNode, CPURegs, mem64, Pseudo>, - Requires<[IsN64]>; + Requires<[IsN64]> { + let DecoderNamespace = "Mips64"; + let isCodeGenOnly = 1; + } } // 64-bit load. 
@@ -415,7 +444,10 @@ multiclass LoadM64<bits<6> op, string instr_asm, PatFrag OpNode, def #NAME# : LoadM<op, instr_asm, OpNode, CPU64Regs, mem, Pseudo>, Requires<[NotN64]>; def _P8 : LoadM<op, instr_asm, OpNode, CPU64Regs, mem64, Pseudo>, - Requires<[IsN64]>; + Requires<[IsN64]> { + let DecoderNamespace = "Mips64"; + let isCodeGenOnly = 1; + } } // 32-bit load. @@ -423,7 +455,10 @@ multiclass LoadUnAlign32<bits<6> op> { def #NAME# : LoadUnAlign<op, CPURegs, mem>, Requires<[NotN64]>; def _P8 : LoadUnAlign<op, CPURegs, mem64>, - Requires<[IsN64]>; + Requires<[IsN64]> { + let DecoderNamespace = "Mips64"; + let isCodeGenOnly = 1; + } } // 32-bit store. multiclass StoreM32<bits<6> op, string instr_asm, PatFrag OpNode, @@ -431,7 +466,10 @@ multiclass StoreM32<bits<6> op, string instr_asm, PatFrag OpNode, def #NAME# : StoreM<op, instr_asm, OpNode, CPURegs, mem, Pseudo>, Requires<[NotN64]>; def _P8 : StoreM<op, instr_asm, OpNode, CPURegs, mem64, Pseudo>, - Requires<[IsN64]>; + Requires<[IsN64]> { + let DecoderNamespace = "Mips64"; + let isCodeGenOnly = 1; + } } // 64-bit store. @@ -440,7 +478,10 @@ multiclass StoreM64<bits<6> op, string instr_asm, PatFrag OpNode, def #NAME# : StoreM<op, instr_asm, OpNode, CPU64Regs, mem, Pseudo>, Requires<[NotN64]>; def _P8 : StoreM<op, instr_asm, OpNode, CPU64Regs, mem64, Pseudo>, - Requires<[IsN64]>; + Requires<[IsN64]> { + let DecoderNamespace = "Mips64"; + let isCodeGenOnly = 1; + } } // 32-bit store. @@ -448,7 +489,10 @@ multiclass StoreUnAlign32<bits<6> op> { def #NAME# : StoreUnAlign<op, CPURegs, mem>, Requires<[NotN64]>; def _P8 : StoreUnAlign<op, CPURegs, mem64>, - Requires<[IsN64]>; + Requires<[IsN64]> { + let DecoderNamespace = "Mips64"; + let isCodeGenOnly = 1; + } } // Conditional Branch @@ -498,6 +542,7 @@ class JumpFJ<bits<6> op, string instr_asm>: let isBarrier=1; let hasDelaySlot = 1; let Predicates = [RelocStatic]; + let DecoderMethod = "DecodeJumpTarget"; } // Unconditional branch @@ -528,7 +573,9 @@ let isCall=1, hasDelaySlot=1 in { class JumpLink<bits<6> op, string instr_asm>: FJ<op, (outs), (ins calltarget:$target, variable_ops), !strconcat(instr_asm, "\t$target"), [(MipsJmpLink imm:$target)], - IIBranch>; + IIBranch> { + let DecoderMethod = "DecodeJumpTarget"; + } class JumpLinkReg<bits<6> op, bits<6> func, string instr_asm, RegisterClass RC>: @@ -555,6 +602,7 @@ class Mult<bits<6> func, string instr_asm, InstrItinClass itin, let shamt = 0; let isCommutable = 1; let Defs = DefRegs; + let neverHasSideEffects = 1; } class Mult32<bits<6> func, string instr_asm, InstrItinClass itin>: @@ -582,6 +630,7 @@ class MoveFromLOHI<bits<6> func, string instr_asm, RegisterClass RC, let rt = 0; let shamt = 0; let Uses = UseRegs; + let neverHasSideEffects = 1; } class MoveToLOHI<bits<6> func, string instr_asm, RegisterClass RC, @@ -592,6 +641,7 @@ class MoveToLOHI<bits<6> func, string instr_asm, RegisterClass RC, let rd = 0; let shamt = 0; let Defs = DefRegs; + let neverHasSideEffects = 1; } class EffectiveAddress<string instr_asm, RegisterClass RC, Operand Mem> : @@ -635,6 +685,7 @@ class SubwordSwap<bits<6> func, bits<5> sa, string instr_asm, RegisterClass RC>: let rs = 0; let shamt = sa; let Predicates = [HasSwap]; + let neverHasSideEffects = 1; } // Read Hardware @@ -680,7 +731,9 @@ class Atomic2Ops<PatFrag Op, string Opstr, RegisterClass DRC, multiclass Atomic2Ops32<PatFrag Op, string Opstr> { def #NAME# : Atomic2Ops<Op, Opstr, CPURegs, CPURegs>, Requires<[NotN64]>; - def _P8 : Atomic2Ops<Op, Opstr, CPURegs, CPU64Regs>, Requires<[IsN64]>; + def _P8 : 
Atomic2Ops<Op, Opstr, CPURegs, CPU64Regs>, Requires<[IsN64]> { + let DecoderNamespace = "Mips64"; + } } // Atomic Compare & Swap. @@ -692,7 +745,9 @@ class AtomicCmpSwap<PatFrag Op, string Width, RegisterClass DRC, multiclass AtomicCmpSwap32<PatFrag Op, string Width> { def #NAME# : AtomicCmpSwap<Op, Width, CPURegs, CPURegs>, Requires<[NotN64]>; - def _P8 : AtomicCmpSwap<Op, Width, CPURegs, CPU64Regs>, Requires<[IsN64]>; + def _P8 : AtomicCmpSwap<Op, Width, CPURegs, CPU64Regs>, Requires<[IsN64]> { + let DecoderNamespace = "Mips64"; + } } class LLBase<bits<6> Opc, string opstring, RegisterClass RC, Operand Mem> : @@ -722,24 +777,13 @@ def ADJCALLSTACKUP : MipsPseudo<(outs), (ins uimm16:$amt1, uimm16:$amt2), [(callseq_end timm:$amt1, timm:$amt2)]>; } -// Some assembly macros need to avoid pseudoinstructions and assembler -// automatic reodering, we should reorder ourselves. -def MACRO : MipsPseudo<(outs), (ins), ".set\tmacro", []>; -def REORDER : MipsPseudo<(outs), (ins), ".set\treorder", []>; -def NOMACRO : MipsPseudo<(outs), (ins), ".set\tnomacro", []>; -def NOREORDER : MipsPseudo<(outs), (ins), ".set\tnoreorder", []>; - -// These macros are inserted to prevent GAS from complaining -// when using the AT register. -def NOAT : MipsPseudo<(outs), (ins), ".set\tnoat", []>; -def ATMACRO : MipsPseudo<(outs), (ins), ".set\tat", []>; - // When handling PIC code the assembler needs .cpload and .cprestore // directives. If the real instructions corresponding these directives // are used, we have the same behavior, but get also a bunch of warnings // from the assembler. -def CPLOAD : MipsPseudo<(outs), (ins CPURegs:$picreg), ".cpload\t$picreg", []>; -def CPRESTORE : MipsPseudo<(outs), (ins i32imm:$loc), ".cprestore\t$loc", []>; +let neverHasSideEffects = 1 in +def CPRESTORE : MipsPseudo<(outs), (ins i32imm:$loc, CPURegs:$gp), + ".cprestore\t$loc", []>; // For O32 ABI & PIC & non-fixed global base register, the following instruction // seqeunce is emitted to set the global base register: @@ -757,7 +801,10 @@ def CPRESTORE : MipsPseudo<(outs), (ins i32imm:$loc), ".cprestore\t$loc", []>; // before or between instructions 0 and 1, which is a limitation imposed by // GNU linker. 
+let isTerminator = 1, isBarrier = 1 in def SETGP01 : MipsPseudo<(outs CPURegs:$dst), (ins), "", []>; + +let neverHasSideEffects = 1 in def SETGP2 : MipsPseudo<(outs CPURegs:$globalreg), (ins CPURegs:$picreg), "", []>; @@ -871,9 +918,14 @@ def SYNC : MipsInst<(outs), (ins i32imm:$stype), "sync $stype", /// Load-linked, Store-conditional def LL : LLBase<0x30, "ll", CPURegs, mem>, Requires<[NotN64]>; -def LL_P8 : LLBase<0x30, "ll", CPURegs, mem64>, Requires<[IsN64]>; +def LL_P8 : LLBase<0x30, "ll", CPURegs, mem64>, Requires<[IsN64]> { + let DecoderNamespace = "Mips64"; +} + def SC : SCBase<0x38, "sc", CPURegs, mem>, Requires<[NotN64]>; -def SC_P8 : SCBase<0x38, "sc", CPURegs, mem64>, Requires<[IsN64]>; +def SC_P8 : SCBase<0x38, "sc", CPURegs, mem64>, Requires<[IsN64]> { + let DecoderNamespace = "Mips64"; +} /// Jump and Branch Instructions def J : JumpFJ<0x02, "j">; @@ -891,7 +943,7 @@ def JALR : JumpLinkReg<0x00, 0x09, "jalr", CPURegs>; def BGEZAL : BranchLink<"bgezal", 0x11, CPURegs>; def BLTZAL : BranchLink<"bltzal", 0x10, CPURegs>; -let isReturn=1, isTerminator=1, hasDelaySlot=1, +let isReturn=1, isTerminator=1, hasDelaySlot=1, isCodeGenOnly=1, isBarrier=1, hasCtrlDep=1, rd=0, rt=0, shamt=0 in def RET : FR <0x00, 0x08, (outs), (ins CPURegs:$target), "jr\t$target", [(MipsRet CPURegs:$target)], IIBranch>; @@ -926,13 +978,17 @@ let addr=0 in // instructions. The same not happens for stack address copies, so an // add op with mem ComplexPattern is used and the stack address copy // can be matched. It's similar to Sparc LEA_ADDRi -def LEA_ADDiu : EffectiveAddress<"addiu\t$rt, $addr", CPURegs, mem_ea>; +def LEA_ADDiu : EffectiveAddress<"addiu\t$rt, $addr", CPURegs, mem_ea> { + let isCodeGenOnly = 1; +} // DynAlloc node points to dynamically allocated stack space. // $sp is added to the list of implicitly used registers to prevent dead code // elimination from removing instructions that modify $sp. 
let Uses = [SP] in -def DynAlloc : EffectiveAddress<"addiu\t$rt, $addr", CPURegs, mem_ea>; +def DynAlloc : EffectiveAddress<"addiu\t$rt, $addr", CPURegs, mem_ea> { + let isCodeGenOnly = 1; +} // MADD*/MSUB* def MADD : MArithR<0, "madd", MipsMAdd, 1>; diff --git a/lib/Target/Mips/MipsJITInfo.h b/lib/Target/Mips/MipsJITInfo.h index ad3c930..f4c4ae8 100644 --- a/lib/Target/Mips/MipsJITInfo.h +++ b/lib/Target/Mips/MipsJITInfo.h @@ -19,8 +19,6 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/Target/TargetJITInfo.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallVector.h" namespace llvm { class MipsTargetMachine; diff --git a/lib/Target/Mips/MipsMCInstLower.cpp b/lib/Target/Mips/MipsMCInstLower.cpp index 0d51298..1597b93 100644 --- a/lib/Target/Mips/MipsMCInstLower.cpp +++ b/lib/Target/Mips/MipsMCInstLower.cpp @@ -26,9 +26,13 @@ using namespace llvm; -MipsMCInstLower::MipsMCInstLower(Mangler *mang, const MachineFunction &mf, - MipsAsmPrinter &asmprinter) - : Ctx(mf.getContext()), Mang(mang), AsmPrinter(asmprinter) {} +MipsMCInstLower::MipsMCInstLower(MipsAsmPrinter &asmprinter) + : AsmPrinter(asmprinter) {} + +void MipsMCInstLower::Initialize(Mangler *M, MCContext* C) { + Mang = M; + Ctx = C; +} MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO, MachineOperandType MOTy, @@ -90,7 +94,7 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO, llvm_unreachable("<unknown operand type>"); } - const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::Create(Symbol, Kind, Ctx); + const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::Create(Symbol, Kind, *Ctx); if (!Offset) return MCOperand::CreateExpr(MCSym); @@ -98,76 +102,68 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO, // Assume offset is never negative. assert(Offset > 0); - const MCConstantExpr *OffsetExpr = MCConstantExpr::Create(Offset, Ctx); - const MCBinaryExpr *AddExpr = MCBinaryExpr::CreateAdd(MCSym, OffsetExpr, Ctx); + const MCConstantExpr *OffsetExpr = MCConstantExpr::Create(Offset, *Ctx); + const MCBinaryExpr *AddExpr = MCBinaryExpr::CreateAdd(MCSym, OffsetExpr, *Ctx); return MCOperand::CreateExpr(AddExpr); } +static void CreateMCInst(MCInst& Inst, unsigned Opc, const MCOperand& Opnd0, + const MCOperand& Opnd1, + const MCOperand& Opnd2 = MCOperand()) { + Inst.setOpcode(Opc); + Inst.addOperand(Opnd0); + Inst.addOperand(Opnd1); + if (Opnd2.isValid()) + Inst.addOperand(Opnd2); +} + // Lower ".cpload $reg" to // "lui $gp, %hi(_gp_disp)" // "addiu $gp, $gp, %lo(_gp_disp)" -// "addu $gp. 
$gp, $reg" -void MipsMCInstLower::LowerCPLOAD(const MachineInstr *MI, - SmallVector<MCInst, 4>& MCInsts) { - MCInst Lui, Addiu, Addu; +// "addu $gp, $gp, $t9" +void MipsMCInstLower::LowerCPLOAD(SmallVector<MCInst, 4>& MCInsts) { + MCOperand GPReg = MCOperand::CreateReg(Mips::GP); + MCOperand T9Reg = MCOperand::CreateReg(Mips::T9); StringRef SymName("_gp_disp"); - const MCSymbol *Symbol = Ctx.GetOrCreateSymbol(SymName); + const MCSymbol *Sym = Ctx->GetOrCreateSymbol(SymName); const MCSymbolRefExpr *MCSym; - // lui $gp, %hi(_gp_disp) - Lui.setOpcode(Mips::LUi); - Lui.addOperand(MCOperand::CreateReg(Mips::GP)); - MCSym = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_Mips_ABS_HI, Ctx); - Lui.addOperand(MCOperand::CreateExpr(MCSym)); - MCInsts.push_back(Lui); - - // addiu $gp, $gp, %lo(_gp_disp) - Addiu.setOpcode(Mips::ADDiu); - Addiu.addOperand(MCOperand::CreateReg(Mips::GP)); - Addiu.addOperand(MCOperand::CreateReg(Mips::GP)); - MCSym = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_Mips_ABS_LO, Ctx); - Addiu.addOperand(MCOperand::CreateExpr(MCSym)); - MCInsts.push_back(Addiu); - - // addu $gp. $gp, $reg - Addu.setOpcode(Mips::ADDu); - Addu.addOperand(MCOperand::CreateReg(Mips::GP)); - Addu.addOperand(MCOperand::CreateReg(Mips::GP)); - const MachineOperand &MO = MI->getOperand(0); - assert(MO.isReg() && "CPLOAD's operand must be a register."); - Addu.addOperand(MCOperand::CreateReg(MO.getReg())); - MCInsts.push_back(Addu); + MCSym = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_Mips_ABS_HI, *Ctx); + MCOperand SymHi = MCOperand::CreateExpr(MCSym); + MCSym = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_Mips_ABS_LO, *Ctx); + MCOperand SymLo = MCOperand::CreateExpr(MCSym); + + MCInsts.resize(3); + + CreateMCInst(MCInsts[0], Mips::LUi, GPReg, SymHi); + CreateMCInst(MCInsts[1], Mips::ADDiu, GPReg, GPReg, SymLo); + CreateMCInst(MCInsts[2], Mips::ADDu, GPReg, GPReg, T9Reg); } // Lower ".cprestore offset" to "sw $gp, offset($sp)". 
-void MipsMCInstLower::LowerCPRESTORE(const MachineInstr *MI, +void MipsMCInstLower::LowerCPRESTORE(int64_t Offset, SmallVector<MCInst, 4>& MCInsts) { - const MachineOperand &MO = MI->getOperand(0); - assert(MO.isImm() && "CPRESTORE's operand must be an immediate."); - unsigned Offset = MO.getImm(), Reg = Mips::SP; - MCInst Sw; + assert(isInt<32>(Offset) && (Offset >= 0) && + "Imm operand of .cprestore must be a non-negative 32-bit value."); - if (Offset >= 0x8000) { - unsigned Hi = (Offset >> 16) + ((Offset & 0x8000) != 0); + MCOperand SPReg = MCOperand::CreateReg(Mips::SP), BaseReg = SPReg; + MCOperand GPReg = MCOperand::CreateReg(Mips::GP); + + if (!isInt<16>(Offset)) { + unsigned Hi = ((Offset + 0x8000) >> 16) & 0xffff; Offset &= 0xffff; - Reg = Mips::AT; + MCOperand ATReg = MCOperand::CreateReg(Mips::AT); + BaseReg = ATReg; // lui at,hi // addu at,at,sp MCInsts.resize(2); - MCInsts[0].setOpcode(Mips::LUi); - MCInsts[0].addOperand(MCOperand::CreateReg(Mips::AT)); - MCInsts[0].addOperand(MCOperand::CreateImm(Hi)); - MCInsts[1].setOpcode(Mips::ADDu); - MCInsts[1].addOperand(MCOperand::CreateReg(Mips::AT)); - MCInsts[1].addOperand(MCOperand::CreateReg(Mips::AT)); - MCInsts[1].addOperand(MCOperand::CreateReg(Mips::SP)); + CreateMCInst(MCInsts[0], Mips::LUi, ATReg, MCOperand::CreateImm(Hi)); + CreateMCInst(MCInsts[1], Mips::ADDu, ATReg, ATReg, SPReg); } - Sw.setOpcode(Mips::SW); - Sw.addOperand(MCOperand::CreateReg(Mips::GP)); - Sw.addOperand(MCOperand::CreateReg(Reg)); - Sw.addOperand(MCOperand::CreateImm(Offset)); + MCInst Sw; + CreateMCInst(Sw, Mips::SW, GPReg, BaseReg, MCOperand::CreateImm(Offset)); MCInsts.push_back(Sw); } @@ -332,18 +328,16 @@ void MipsMCInstLower::LowerSETGP01(const MachineInstr *MI, assert(MO.isReg()); MCOperand RegOpnd = MCOperand::CreateReg(MO.getReg()); StringRef SymName("_gp_disp"); - const MCSymbol *Sym = Ctx.GetOrCreateSymbol(SymName); + const MCSymbol *Sym = Ctx->GetOrCreateSymbol(SymName); const MCSymbolRefExpr *MCSym; + MCSym = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_Mips_ABS_HI, *Ctx); + MCOperand SymHi = MCOperand::CreateExpr(MCSym); + MCSym = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_Mips_ABS_LO, *Ctx); + MCOperand SymLo = MCOperand::CreateExpr(MCSym); + MCInsts.resize(2); - MCSym = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_Mips_ABS_HI, Ctx); - MCInsts[0].setOpcode(Mips::LUi); - MCInsts[0].addOperand(RegOpnd); - MCInsts[0].addOperand(MCOperand::CreateExpr(MCSym)); - MCSym = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_Mips_ABS_LO, Ctx); - MCInsts[1].setOpcode(Mips::ADDiu); - MCInsts[1].addOperand(RegOpnd); - MCInsts[1].addOperand(RegOpnd); - MCInsts[1].addOperand(MCOperand::CreateExpr(MCSym)); + CreateMCInst(MCInsts[0], Mips::LUi, RegOpnd, SymHi); + CreateMCInst(MCInsts[1], Mips::ADDiu, RegOpnd, RegOpnd, SymLo); } diff --git a/lib/Target/Mips/MipsMCInstLower.h b/lib/Target/Mips/MipsMCInstLower.h index 20bb338..c1d007d 100644 --- a/lib/Target/Mips/MipsMCInstLower.h +++ b/lib/Target/Mips/MipsMCInstLower.h @@ -26,15 +26,15 @@ namespace llvm { // MCInst. 
class LLVM_LIBRARY_VISIBILITY MipsMCInstLower { typedef MachineOperand::MachineOperandType MachineOperandType; - MCContext &Ctx; + MCContext *Ctx; Mangler *Mang; MipsAsmPrinter &AsmPrinter; public: - MipsMCInstLower(Mangler *mang, const MachineFunction &MF, - MipsAsmPrinter &asmprinter); + MipsMCInstLower(MipsAsmPrinter &asmprinter); + void Initialize(Mangler *mang, MCContext* C); void Lower(const MachineInstr *MI, MCInst &OutMI) const; - void LowerCPLOAD(const MachineInstr *MI, SmallVector<MCInst, 4>& MCInsts); - void LowerCPRESTORE(const MachineInstr *MI, SmallVector<MCInst, 4>& MCInsts); + void LowerCPLOAD(SmallVector<MCInst, 4>& MCInsts); + void LowerCPRESTORE(int64_t Offset, SmallVector<MCInst, 4>& MCInsts); void LowerUnalignedLoadStore(const MachineInstr *MI, SmallVector<MCInst, 4>& MCInsts); void LowerSETGP01(const MachineInstr *MI, SmallVector<MCInst, 4>& MCInsts); diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h index abb5404..0fde55c 100644 --- a/lib/Target/Mips/MipsMachineFunction.h +++ b/lib/Target/Mips/MipsMachineFunction.h @@ -14,7 +14,6 @@ #ifndef MIPS_MACHINE_FUNCTION_INFO_H #define MIPS_MACHINE_FUNCTION_INFO_H -#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include <utility> @@ -50,12 +49,14 @@ class MipsFunctionInfo : public MachineFunctionInfo { mutable int DynAllocFI; // Frame index of dynamically allocated stack area. unsigned MaxCallFrameSize; + bool EmitNOAT; + public: MipsFunctionInfo(MachineFunction& MF) : MF(MF), SRetReturnReg(0), GlobalBaseReg(0), VarArgsFrameIndex(0), InArgFIRange(std::make_pair(-1, 0)), OutArgFIRange(std::make_pair(-1, 0)), GPFI(0), DynAllocFI(0), - MaxCallFrameSize(0) + MaxCallFrameSize(0), EmitNOAT(false) {} bool isInArgFI(int FI) const { @@ -100,6 +101,9 @@ public: unsigned getMaxCallFrameSize() const { return MaxCallFrameSize; } void setMaxCallFrameSize(unsigned S) { MaxCallFrameSize = S; } + + bool getEmitNOAT() const { return EmitNOAT; } + void setEmitNOAT() { EmitNOAT = true; } }; } // end of namespace llvm diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp index 5cfda34..f30de44 100644 --- a/lib/Target/Mips/MipsRegisterInfo.cpp +++ b/lib/Target/Mips/MipsRegisterInfo.cpp @@ -62,7 +62,7 @@ getCalleeSavedRegs(const MachineFunction *MF) const return CSR_O32_SaveList; else if (Subtarget.isABI_N32()) return CSR_N32_SaveList; - + assert(Subtarget.isABI_N64()); return CSR_N64_SaveList; } @@ -125,9 +125,18 @@ getReservedRegs(const MachineFunction &MF) const { Reserved.set(Mips::GP_64); } + // Reserve hardware registers. + Reserved.set(Mips::HWR29); + Reserved.set(Mips::HWR29_64); + return Reserved; } +bool +MipsRegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const { + return true; +} + // This function eliminate ADJCALLSTACKDOWN, // ADJCALLSTACKUP pseudo instructions void MipsRegisterInfo:: @@ -223,8 +232,7 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, AnalyzeImm.Analyze(Offset, Size, true /* LastInstrIsADDiu */); MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin(); - // FIXME: change this when mips goes MC". 
- BuildMI(MBB, II, DL, TII.get(Mips::NOAT)); + MipsFI->setEmitNOAT(); // The first instruction can be a LUi, which is different from other // instructions (ADDiu, ORI and SLL) in that it does not have a register @@ -245,7 +253,6 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, FrameReg = ATReg; Offset = SignExtend64<16>(Inst->ImmOpnd); - BuildMI(MBB, ++II, MI.getDebugLoc(), TII.get(Mips::ATMACRO)); } MI.getOperand(i).ChangeToRegister(FrameReg, false); diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h index 7037ca6..0716d29 100644 --- a/lib/Target/Mips/MipsRegisterInfo.h +++ b/lib/Target/Mips/MipsRegisterInfo.h @@ -47,6 +47,8 @@ struct MipsRegisterInfo : public MipsGenRegisterInfo { BitVector getReservedRegs(const MachineFunction &MF) const; + virtual bool requiresRegisterScavenging(const MachineFunction &MF) const; + void eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp index d4a50ee..00347df 100644 --- a/lib/Target/Mips/MipsSubtarget.cpp +++ b/lib/Target/Mips/MipsSubtarget.cpp @@ -13,6 +13,7 @@ #include "MipsSubtarget.h" #include "Mips.h" +#include "MipsRegisterInfo.h" #include "llvm/Support/TargetRegistry.h" #define GET_SUBTARGETINFO_TARGET_DESC @@ -54,3 +55,14 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU, if (TT.find("linux") == std::string::npos) IsLinux = false; } + +bool +MipsSubtarget::enablePostRAScheduler(CodeGenOpt::Level OptLevel, + TargetSubtargetInfo::AntiDepBreakMode& Mode, + RegClassVector& CriticalPathRCs) const { + Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL; + CriticalPathRCs.clear(); + CriticalPathRCs.push_back(hasMips64() ? + &Mips::CPU64RegsRegClass : &Mips::CPURegsRegClass); + return OptLevel >= CodeGenOpt::Aggressive; +} diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h index ba0bbac..7faf77b 100644 --- a/lib/Target/Mips/MipsSubtarget.h +++ b/lib/Target/Mips/MipsSubtarget.h @@ -89,6 +89,9 @@ protected: InstrItineraryData InstrItins; public: + virtual bool enablePostRAScheduler(CodeGenOpt::Level OptLevel, + AntiDepBreakMode& Mode, + RegClassVector& CriticalPathRCs) const; /// Only O32 and EABI supported right now. bool isABI_EABI() const { return MipsABI == EABI; } diff --git a/lib/Target/PTX/InstPrinter/PTXInstPrinter.cpp b/lib/Target/PTX/InstPrinter/PTXInstPrinter.cpp index ec7e2a7..1830213 100644 --- a/lib/Target/PTX/InstPrinter/PTXInstPrinter.cpp +++ b/lib/Target/PTX/InstPrinter/PTXInstPrinter.cpp @@ -18,26 +18,24 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/ADT/APFloat.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; -#define GET_INSTRUCTION_NAME #include "PTXGenAsmWriter.inc" PTXInstPrinter::PTXInstPrinter(const MCAsmInfo &MAI, + const MCInstrInfo &MII, const MCRegisterInfo &MRI, const MCSubtargetInfo &STI) : - MCInstPrinter(MAI, MRI) { + MCInstPrinter(MAI, MII, MRI) { // Initialize the set of available features. 
setAvailableFeatures(STI.getFeatureBits()); } -StringRef PTXInstPrinter::getOpcodeName(unsigned Opcode) const { - return getInstructionName(Opcode); -} - void PTXInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { // Decode the register number into type and offset unsigned RegSpace = RegNo & 0x7; diff --git a/lib/Target/PTX/InstPrinter/PTXInstPrinter.h b/lib/Target/PTX/InstPrinter/PTXInstPrinter.h index eef6101..ea4d504 100644 --- a/lib/Target/PTX/InstPrinter/PTXInstPrinter.h +++ b/lib/Target/PTX/InstPrinter/PTXInstPrinter.h @@ -23,15 +23,12 @@ class MCOperand; class PTXInstPrinter : public MCInstPrinter { public: - PTXInstPrinter(const MCAsmInfo &MAI, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI); + PTXInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI); virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot); - virtual StringRef getOpcodeName(unsigned Opcode) const; virtual void printRegName(raw_ostream &OS, unsigned RegNo) const; - static const char *getInstructionName(unsigned Opcode); - // Autogenerated by tblgen. void printInstruction(const MCInst *MI, raw_ostream &O); static const char *getRegisterName(unsigned RegNo); diff --git a/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.cpp b/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.cpp index 7671b11..08fb970 100644 --- a/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.cpp +++ b/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.cpp @@ -62,10 +62,11 @@ static MCCodeGenInfo *createPTXMCCodeGenInfo(StringRef TT, Reloc::Model RM, static MCInstPrinter *createPTXMCInstPrinter(const Target &T, unsigned SyntaxVariant, const MCAsmInfo &MAI, + const MCInstrInfo &MII, const MCRegisterInfo &MRI, const MCSubtargetInfo &STI) { assert(SyntaxVariant == 0 && "We only have one syntax variant"); - return new PTXInstPrinter(MAI, MRI, STI); + return new PTXInstPrinter(MAI, MII, MRI, STI); } extern "C" void LLVMInitializePTXTargetMC() { diff --git a/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.h b/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.h index 1003b0b..542638a 100644 --- a/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.h +++ b/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.h @@ -15,9 +15,7 @@ #define PTXMCTARGETDESC_H namespace llvm { -class MCSubtargetInfo; class Target; -class StringRef; extern Target ThePTX32Target; extern Target ThePTX64Target; diff --git a/lib/Target/PTX/PTXISelLowering.cpp b/lib/Target/PTX/PTXISelLowering.cpp index db1c953..ef4455b 100644 --- a/lib/Target/PTX/PTXISelLowering.cpp +++ b/lib/Target/PTX/PTXISelLowering.cpp @@ -97,7 +97,8 @@ PTXTargetLowering::PTXTargetLowering(TargetMachine &TM) // customise setcc to use bitwise logic if possible - setOperationAction(ISD::SETCC, MVT::i1, Custom); + //setOperationAction(ISD::SETCC, MVT::i1, Custom); + setOperationAction(ISD::SETCC, MVT::i1, Legal); // customize translation of memory addresses @@ -156,18 +157,27 @@ SDValue PTXTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { SDValue Op1 = Op.getOperand(1); SDValue Op2 = Op.getOperand(2); DebugLoc dl = Op.getDebugLoc(); - ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); + //ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); // Look for X == 0, X == 1, X != 0, or X != 1 // We can simplify these to bitwise logic - if (Op1.getOpcode() == ISD::Constant && - (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || - cast<ConstantSDNode>(Op1)->isNullValue()) && - (CC == ISD::SETEQ || CC == ISD::SETNE)) { + //if 
(Op1.getOpcode() == ISD::Constant && + // (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || + // cast<ConstantSDNode>(Op1)->isNullValue()) && + // (CC == ISD::SETEQ || CC == ISD::SETNE)) { + // + // return DAG.getNode(ISD::AND, dl, MVT::i1, Op0, Op1); + //} - return DAG.getNode(ISD::AND, dl, MVT::i1, Op0, Op1); - } + //ConstantSDNode* COp1 = cast<ConstantSDNode>(Op1); + //if(COp1 && COp1->getZExtValue() == 1) { + // if(CC == ISD::SETNE) { + // return DAG.getNode(PTX::XORripreds, dl, MVT::i1, Op0); + // } + //} + + llvm_unreachable("setcc was not matched by a pattern!"); return DAG.getNode(ISD::SETCC, dl, MVT::i1, Op0, Op1, Op2); } @@ -384,22 +394,22 @@ PTXTargetLowering::LowerCall(SDValue Chain, SDValue Callee, PTXMachineFunctionInfo *PTXMFI = MF.getInfo<PTXMachineFunctionInfo>(); PTXParamManager &PM = PTXMFI->getParamManager(); MachineFrameInfo *MFI = MF.getFrameInfo(); - + assert(getTargetMachine().getSubtarget<PTXSubtarget>().callsAreHandled() && "Calls are not handled for the target device"); // Identify the callee function const GlobalValue *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal(); const Function *function = cast<Function>(GV); - + // allow non-device calls only for printf - bool isPrintf = function->getName() == "printf" || function->getName() == "puts"; - + bool isPrintf = function->getName() == "printf" || function->getName() == "puts"; + assert((isPrintf || function->getCallingConv() == CallingConv::PTX_Device) && "PTX function calls must be to PTX device functions"); - + unsigned outSize = isPrintf ? 2 : Outs.size(); - + std::vector<SDValue> Ops; // The layout of the ops will be [Chain, #Ins, Ins, Callee, #Outs, Outs] Ops.resize(outSize + Ins.size() + 4); @@ -412,7 +422,7 @@ PTXTargetLowering::LowerCall(SDValue Chain, SDValue Callee, // #Outs Ops[Ins.size()+3] = DAG.getTargetConstant(outSize, MVT::i32); - + if (isPrintf) { // first argument is the address of the global string variable in memory unsigned Param0 = PM.addLocalParam(getPointerTy().getSizeInBits()); @@ -421,29 +431,29 @@ PTXTargetLowering::LowerCall(SDValue Chain, SDValue Callee, Chain = DAG.getNode(PTXISD::STORE_PARAM, dl, MVT::Other, Chain, ParamValue0, OutVals[0]); Ops[Ins.size()+4] = ParamValue0; - + // alignment is the maximum size of all the arguments unsigned alignment = 0; for (unsigned i = 1; i < OutVals.size(); ++i) { - alignment = std::max(alignment, + alignment = std::max(alignment, OutVals[i].getValueType().getSizeInBits()); } // size is the alignment multiplied by the number of arguments unsigned size = alignment * (OutVals.size() - 1); - + // second argument is the address of the stack object (unless no arguments) unsigned Param1 = PM.addLocalParam(getPointerTy().getSizeInBits()); SDValue ParamValue1 = DAG.getTargetExternalSymbol(PM.getParamName(Param1).c_str(), MVT::Other); Ops[Ins.size()+5] = ParamValue1; - + if (size > 0) { // create a local stack object to store the arguments unsigned StackObject = MFI->CreateStackObject(size / 8, alignment / 8, false); SDValue FrameIndex = DAG.getFrameIndex(StackObject, getPointerTy()); - + // store each of the arguments to the stack in turn for (unsigned int i = 1; i != OutVals.size(); i++) { SDValue FrameAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FrameIndex, DAG.getTargetConstant((i - 1) * 8, getPointerTy())); @@ -475,7 +485,7 @@ PTXTargetLowering::LowerCall(SDValue Chain, SDValue Callee, Ops[i+Ins.size()+4] = ParamValue; } } - + std::vector<SDValue> InParams; // Generate list of .param variables to hold the return value(s). 
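The printf path in the LowerCall hunk above packs the variadic arguments into a single local param buffer: its alignment is the largest argument size, its size is that alignment times the number of variadic arguments, and each argument is stored at an 8-byte stride. A standalone sketch of that layout arithmetic (the argument bit-widths and names are illustrative, not from the patch):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main() {
      // Bit widths of the variadic arguments, i.e. OutVals[1..N-1] above.
      std::vector<unsigned> ArgBits = {32, 64, 32};

      unsigned Alignment = 0;   // maximum size of all the arguments
      for (unsigned Bits : ArgBits)
        Alignment = std::max(Alignment, Bits);

      // Size is the alignment multiplied by the number of arguments.
      unsigned SizeBits = Alignment * ArgBits.size();
      std::printf("buffer: %u bytes, %u-byte aligned\n",
                  SizeBits / 8, Alignment / 8);
      // Each argument is stored to the stack object in turn, 8 bytes apart.
      for (unsigned I = 0; I != ArgBits.size(); ++I)
        std::printf("arg %u at byte offset %u\n", I + 1, I * 8);
      return 0;
    }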
diff --git a/lib/Target/PTX/PTXInstrInfo.td b/lib/Target/PTX/PTXInstrInfo.td index 818d444..bead428 100644 --- a/lib/Target/PTX/PTXInstrInfo.td +++ b/lib/Target/PTX/PTXInstrInfo.td @@ -808,6 +808,8 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1 in { let isBranch = 1, isTerminator = 1 in { // FIXME: The pattern part is blank because I cannot (or do not yet know // how to) use the first operand of PredicateOperand (a RegPred register) here + // When this is revisited, make sure to also look at LowerSETCC and try to + // fold it into negated predicates, if possible. def BRAdp : InstPTX<(outs), (ins brtarget:$d), "bra\t$d", [/*(brcond pred:$_p, bb:$d)*/]>; @@ -1017,6 +1019,9 @@ def : Pat<(f64 (sint_to_fp RegI64:$a)), (CVTf64s64 RndDefault, RegI64:$a)>; def : Pat<(f64 (fextend RegF32:$a)), (CVTf64f32 RegF32:$a)>; def : Pat<(f64 (bitconvert RegI64:$a)), (MOVf64i64 RegI64:$a)>; +// setcc - predicate inversion for branch conditions +def : Pat<(i1 (setcc RegPred:$a, imm:$b, SETNE)), + (XORripreds RegPred:$a, imm:$b)>; ///===- Intrinsic Instructions --------------------------------------------===// include "PTXIntrinsicInstrInfo.td" diff --git a/lib/Target/PTX/PTXTargetMachine.cpp b/lib/Target/PTX/PTXTargetMachine.cpp index 40835d0..c55a658 100644 --- a/lib/Target/PTX/PTXTargetMachine.cpp +++ b/lib/Target/PTX/PTXTargetMachine.cpp @@ -17,7 +17,6 @@ #include "llvm/Analysis/Passes.h" #include "llvm/Analysis/Verifier.h" #include "llvm/Assembly/PrintModulePass.h" -#include "llvm/ADT/OwningPtr.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineFunctionAnalysis.h" #include "llvm/CodeGen/MachineModuleInfo.h" @@ -153,10 +152,10 @@ bool PTXPassConfig::addPostRegAlloc() { /// Add passes that optimize machine instructions after register allocation. void PTXPassConfig::addMachineLateOptimization() { if (addPass(BranchFolderPassID) != &NoPassID) - printNoVerify("After BranchFolding"); + printAndVerify("After BranchFolding"); if (addPass(TailDuplicateID) != &NoPassID) - printNoVerify("After TailDuplicate"); + printAndVerify("After TailDuplicate"); } bool PTXPassConfig::addPreEmitPass() { diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp index 000d6d4..61d23ce 100644 --- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp +++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp @@ -17,16 +17,12 @@ #include "MCTargetDesc/PPCPredicates.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; -#define GET_INSTRUCTION_NAME #include "PPCGenAsmWriter.inc" -StringRef PPCInstPrinter::getOpcodeName(unsigned Opcode) const { - return getInstructionName(Opcode); -} - void PPCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { OS << getRegisterName(RegNo); } diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h index 21fc733..73fd534 100644 --- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h +++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h @@ -24,9 +24,9 @@ class PPCInstPrinter : public MCInstPrinter { // 0 -> AIX, 1 -> Darwin. 
unsigned SyntaxVariant; public: - PPCInstPrinter(const MCAsmInfo &MAI, const MCRegisterInfo &MRI, - unsigned syntaxVariant) - : MCInstPrinter(MAI, MRI), SyntaxVariant(syntaxVariant) {} + PPCInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI, unsigned syntaxVariant) + : MCInstPrinter(MAI, MII, MRI), SyntaxVariant(syntaxVariant) {} bool isDarwinSyntax() const { return SyntaxVariant == 1; @@ -34,9 +34,6 @@ public: virtual void printRegName(raw_ostream &OS, unsigned RegNo) const; virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot); - virtual StringRef getOpcodeName(unsigned Opcode) const; - - static const char *getInstructionName(unsigned Opcode); // Autogenerated by tblgen. void printInstruction(const MCInst *MI, raw_ostream &O); diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index 9c6eefe..48de583 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -11,6 +11,7 @@ #include "MCTargetDesc/PPCFixupKinds.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCMachObjectWriter.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCObjectWriter.h" diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp index 226fbfe..6568e82 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp @@ -108,9 +108,10 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT, static MCInstPrinter *createPPCMCInstPrinter(const Target &T, unsigned SyntaxVariant, const MCAsmInfo &MAI, + const MCInstrInfo &MII, const MCRegisterInfo &MRI, const MCSubtargetInfo &STI) { - return new PPCInstPrinter(MAI, MRI, SyntaxVariant); + return new PPCInstPrinter(MAI, MII, MRI, SyntaxVariant); } extern "C" void LLVMInitializePowerPCTargetMC() { diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td index 724374c..c554d39 100644 --- a/lib/Target/PowerPC/PPC.td +++ b/lib/Target/PowerPC/PPC.td @@ -34,6 +34,7 @@ def Directive750 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_750", "">; def Directive970 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_970", "">; def Directive32 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_32", "">; def Directive64 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_64", "">; +def DirectiveA2 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_A2", "">; def Feature64Bit : SubtargetFeature<"64bit","Has64BitSupport", "true", "Enable 64-bit instructions">; @@ -87,6 +88,10 @@ def : Processor<"g5", G5Itineraries, [Directive970, FeatureAltivec, FeatureGPUL, FeatureFSqrt, FeatureSTFIWX, Feature64Bit /*, Feature64BitRegs */]>; +def : Processor<"a2", PPCA2Itineraries, [DirectiveA2, FeatureBookE, + FeatureFSqrt, FeatureSTFIWX, + Feature64Bit + /*, Feature64BitRegs */]>; def : Processor<"ppc", G3Itineraries, [Directive32]>; def : Processor<"ppc64", G5Itineraries, [Directive64, FeatureAltivec, diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp index 4abb469..fb7aa71 100644 --- a/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -450,6 +450,7 @@ void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) { "ppc7400", "ppc750", "ppc970", + "ppcA2", "ppc64" }; diff --git a/lib/Target/PowerPC/PPCCallingConv.td 
b/lib/Target/PowerPC/PPCCallingConv.td index 9883c2e..b2b5364 100644 --- a/lib/Target/PowerPC/PPCCallingConv.td +++ b/lib/Target/PowerPC/PPCCallingConv.td @@ -12,10 +12,6 @@ // //===----------------------------------------------------------------------===// -/// CCIfSubtarget - Match if the current subtarget has a feature F. -class CCIfSubtarget<string F, CCAction A> - : CCIf<!strconcat("State.getTarget().getSubtarget<PPCSubtarget>().", F), A>; - //===----------------------------------------------------------------------===// // Return Value Calling Convention //===----------------------------------------------------------------------===// diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/lib/Target/PowerPC/PPCHazardRecognizers.cpp index ae317af..6ed1fb9 100644 --- a/lib/Target/PowerPC/PPCHazardRecognizers.cpp +++ b/lib/Target/PowerPC/PPCHazardRecognizers.cpp @@ -22,17 +22,29 @@ using namespace llvm; //===----------------------------------------------------------------------===// -// PowerPC 440 Hazard Recognizer -void PPCHazardRecognizer440::EmitInstruction(SUnit *SU) { +// PowerPC Scoreboard Hazard Recognizer +void PPCScoreboardHazardRecognizer::EmitInstruction(SUnit *SU) { const MCInstrDesc *MCID = DAG->getInstrDesc(SU); - if (!MCID) { + if (!MCID) // This is a PPC pseudo-instruction. return; - } ScoreboardHazardRecognizer::EmitInstruction(SU); } +ScheduleHazardRecognizer::HazardType +PPCScoreboardHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { + return ScoreboardHazardRecognizer::getHazardType(SU, Stalls); +} + +void PPCScoreboardHazardRecognizer::AdvanceCycle() { + ScoreboardHazardRecognizer::AdvanceCycle(); +} + +void PPCScoreboardHazardRecognizer::Reset() { + ScoreboardHazardRecognizer::Reset(); +} + //===----------------------------------------------------------------------===// // PowerPC 970 Hazard Recognizer // @@ -61,7 +73,6 @@ void PPCHazardRecognizer440::EmitInstruction(SUnit *SU) { PPCHazardRecognizer970::PPCHazardRecognizer970(const TargetInstrInfo &tii) : TII(tii) { - LastWasBL8_ELF = false; EndDispatchGroup(); } @@ -132,15 +143,6 @@ getHazardType(SUnit *SU, int Stalls) { return NoHazard; unsigned Opcode = MI->getOpcode(); - - // If the last instruction was a BL8_ELF, then the NOP must follow it - // directly (this is strong requirement from the linker due to the ELF ABI). - // We return only Hazard (and not NoopHazard) because if the NOP is necessary - // then it will already be in the instruction stream (it is not always - // necessary; tail calls, for example, do not need it). 
- if (LastWasBL8_ELF && Opcode != PPC::NOP) - return Hazard; - bool isFirst, isSingle, isCracked, isLoad, isStore; PPCII::PPC970_Unit InstrType = GetInstrType(Opcode, isFirst, isSingle, isCracked, @@ -199,8 +201,6 @@ void PPCHazardRecognizer970::EmitInstruction(SUnit *SU) { return; unsigned Opcode = MI->getOpcode(); - LastWasBL8_ELF = (Opcode == PPC::BL8_ELF); - bool isFirst, isSingle, isCracked, isLoad, isStore; PPCII::PPC970_Unit InstrType = GetInstrType(Opcode, isFirst, isSingle, isCracked, @@ -240,7 +240,6 @@ void PPCHazardRecognizer970::AdvanceCycle() { } void PPCHazardRecognizer970::Reset() { - LastWasBL8_ELF = false; EndDispatchGroup(); } diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.h b/lib/Target/PowerPC/PPCHazardRecognizers.h index d80a385..55b45d0 100644 --- a/lib/Target/PowerPC/PPCHazardRecognizers.h +++ b/lib/Target/PowerPC/PPCHazardRecognizers.h @@ -21,16 +21,19 @@ namespace llvm { -/// PPCHazardRecognizer440 - This class implements a scoreboard-based -/// hazard recognizer for the PPC 440 and friends. -class PPCHazardRecognizer440 : public ScoreboardHazardRecognizer { +/// PPCScoreboardHazardRecognizer - This class implements a scoreboard-based +/// hazard recognizer for generic PPC processors. +class PPCScoreboardHazardRecognizer : public ScoreboardHazardRecognizer { const ScheduleDAG *DAG; public: - PPCHazardRecognizer440(const InstrItineraryData *ItinData, + PPCScoreboardHazardRecognizer(const InstrItineraryData *ItinData, const ScheduleDAG *DAG_) : ScoreboardHazardRecognizer(ItinData, DAG_), DAG(DAG_) {} + virtual HazardType getHazardType(SUnit *SU, int Stalls); virtual void EmitInstruction(SUnit *SU); + virtual void AdvanceCycle(); + virtual void Reset(); }; /// PPCHazardRecognizer970 - This class defines a finite state automata that @@ -49,9 +52,6 @@ class PPCHazardRecognizer970 : public ScheduleHazardRecognizer { // HasCTRSet - If the CTR register is set in this group, disallow BCTRL. bool HasCTRSet; - // Was the last instruction issued a BL8_ELF - bool LastWasBL8_ELF; - // StoredPtr - Keep track of the address of any store. If we see a load from // the same address (or one that aliases it), disallow the store. We can have // up to four stores in one dispatch group, hence we track up to 4. diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 6651d14..5a04888 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -377,8 +377,8 @@ SDNode *PPCDAGToDAGISel::SelectBitfieldInsert(SDNode *N) { DebugLoc dl = N->getDebugLoc(); APInt LKZ, LKO, RKZ, RKO; - CurDAG->ComputeMaskedBits(Op0, APInt::getAllOnesValue(32), LKZ, LKO); - CurDAG->ComputeMaskedBits(Op1, APInt::getAllOnesValue(32), RKZ, RKO); + CurDAG->ComputeMaskedBits(Op0, LKZ, LKO); + CurDAG->ComputeMaskedBits(Op1, RKZ, RKO); unsigned TargetMask = LKZ.getZExtValue(); unsigned InsertMask = RKZ.getZExtValue(); diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 85b5bc1..3b24951 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -226,11 +226,23 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); - // VAARG is custom lowered with the 32-bit SVR4 ABI. 
- if (TM.getSubtarget<PPCSubtarget>().isSVR4ABI() - && !TM.getSubtarget<PPCSubtarget>().isPPC64()) { - setOperationAction(ISD::VAARG, MVT::Other, Custom); - setOperationAction(ISD::VAARG, MVT::i64, Custom); + if (TM.getSubtarget<PPCSubtarget>().isSVR4ABI()) { + if (TM.getSubtarget<PPCSubtarget>().isPPC64()) { + // VAARG always uses double-word chunks, so promote anything smaller. + setOperationAction(ISD::VAARG, MVT::i1, Promote); + AddPromotedToType (ISD::VAARG, MVT::i1, MVT::i64); + setOperationAction(ISD::VAARG, MVT::i8, Promote); + AddPromotedToType (ISD::VAARG, MVT::i8, MVT::i64); + setOperationAction(ISD::VAARG, MVT::i16, Promote); + AddPromotedToType (ISD::VAARG, MVT::i16, MVT::i64); + setOperationAction(ISD::VAARG, MVT::i32, Promote); + AddPromotedToType (ISD::VAARG, MVT::i32, MVT::i64); + setOperationAction(ISD::VAARG, MVT::Other, Expand); + } else { + // VAARG is custom lowered with the 32-bit SVR4 ABI. + setOperationAction(ISD::VAARG, MVT::Other, Custom); + setOperationAction(ISD::VAARG, MVT::i64, Custom); + } } else setOperationAction(ISD::VAARG, MVT::Other, Expand); @@ -377,6 +389,9 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); } + if (TM.getSubtarget<PPCSubtarget>().has64BitSupport()) + setOperationAction(ISD::PREFETCH, MVT::Other, Legal); + setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand); @@ -431,7 +446,16 @@ unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty) const { // Darwin passes everything on 4 byte boundary. if (TM.getSubtarget<PPCSubtarget>().isDarwin()) return 4; - // FIXME SVR4 TBD + + // 16byte and wider vectors are passed on 16byte boundary. + if (VectorType *VTy = dyn_cast<VectorType>(Ty)) + if (VTy->getBitWidth() >= 128) + return 16; + + // The rest is 8 on PPC64 and 4 on PPC32 boundary. + if (PPCSubTarget.isPPC64()) + return 8; + return 4; } @@ -460,6 +484,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::EXTSW_32: return "PPCISD::EXTSW_32"; case PPCISD::STD_32: return "PPCISD::STD_32"; case PPCISD::CALL_SVR4: return "PPCISD::CALL_SVR4"; + case PPCISD::CALL_NOP_SVR4: return "PPCISD::CALL_NOP_SVR4"; case PPCISD::CALL_Darwin: return "PPCISD::CALL_Darwin"; case PPCISD::NOP: return "PPCISD::NOP"; case PPCISD::MTCTR: return "PPCISD::MTCTR"; @@ -835,14 +860,10 @@ bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base, APInt LHSKnownZero, LHSKnownOne; APInt RHSKnownZero, RHSKnownOne; DAG.ComputeMaskedBits(N.getOperand(0), - APInt::getAllOnesValue(N.getOperand(0) - .getValueSizeInBits()), LHSKnownZero, LHSKnownOne); if (LHSKnownZero.getBoolValue()) { DAG.ComputeMaskedBits(N.getOperand(1), - APInt::getAllOnesValue(N.getOperand(1) - .getValueSizeInBits()), RHSKnownZero, RHSKnownOne); // If all of the bits are known zero on the LHS or RHS, the add won't // carry. @@ -897,10 +918,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, // (for better address arithmetic) if the LHS and RHS of the OR are // provably disjoint. 
APInt LHSKnownZero, LHSKnownOne; - DAG.ComputeMaskedBits(N.getOperand(0), - APInt::getAllOnesValue(N.getOperand(0) - .getValueSizeInBits()), - LHSKnownZero, LHSKnownOne); + DAG.ComputeMaskedBits(N.getOperand(0), LHSKnownZero, LHSKnownOne); if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) { // If all of the bits are known zero on the LHS or RHS, the add won't @@ -1013,10 +1031,7 @@ bool PPCTargetLowering::SelectAddressRegImmShift(SDValue N, SDValue &Disp, // (for better address arithmetic) if the LHS and RHS of the OR are // provably disjoint. APInt LHSKnownZero, LHSKnownOne; - DAG.ComputeMaskedBits(N.getOperand(0), - APInt::getAllOnesValue(N.getOperand(0) - .getValueSizeInBits()), - LHSKnownZero, LHSKnownOne); + DAG.ComputeMaskedBits(N.getOperand(0), LHSKnownZero, LHSKnownOne); if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) { // If all of the bits are known zero on the LHS or RHS, the add won't // carry. @@ -2801,9 +2816,6 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl, return DAG.getNode(PPCISD::TC_RETURN, dl, MVT::Other, &Ops[0], Ops.size()); } - Chain = DAG.getNode(CallOpc, dl, NodeTys, &Ops[0], Ops.size()); - InFlag = Chain.getValue(1); - // Add a NOP immediately after the branch instruction when using the 64-bit // SVR4 ABI. At link time, if caller and callee are in a different module and // thus have a different TOC, the call will be replaced with a call to a stub @@ -2812,8 +2824,9 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl, // which restores the TOC of the caller from the TOC save slot of the current // stack frame. If caller and callee belong to the same module (and have the // same TOC), the NOP will remain unchanged. + + bool needsTOCRestore = false; if (!isTailCall && PPCSubTarget.isSVR4ABI()&& PPCSubTarget.isPPC64()) { - SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); if (CallOpc == PPCISD::BCTRL_SVR4) { // This is a call through a function pointer. // Restore the caller TOC from the save area into R2. @@ -2824,14 +2837,22 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl, // since r2 is a reserved register (which prevents the register allocator // from allocating it), resulting in an additional register being // allocated and an unnecessary move instruction being generated. - Chain = DAG.getNode(PPCISD::TOC_RESTORE, dl, VTs, Chain, InFlag); - InFlag = Chain.getValue(1); - } else { + needsTOCRestore = true; + } else if (CallOpc == PPCISD::CALL_SVR4) { // Otherwise insert NOP. 
- InFlag = DAG.getNode(PPCISD::NOP, dl, MVT::Glue, InFlag); + CallOpc = PPCISD::CALL_NOP_SVR4; } } + Chain = DAG.getNode(CallOpc, dl, NodeTys, &Ops[0], Ops.size()); + InFlag = Chain.getValue(1); + + if (needsTOCRestore) { + SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); + Chain = DAG.getNode(PPCISD::TOC_RESTORE, dl, VTs, Chain, InFlag); + InFlag = Chain.getValue(1); + } + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), DAG.getIntPtrConstant(BytesCalleePops, true), InFlag); @@ -5486,12 +5507,11 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, //===----------------------------------------------------------------------===// void PPCTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, - const APInt &Mask, APInt &KnownZero, APInt &KnownOne, const SelectionDAG &DAG, unsigned Depth) const { - KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); + KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0); switch (Op.getOpcode()) { default: break; case PPCISD::LBRX: { @@ -5725,7 +5745,7 @@ bool PPCTargetLowering::isLegalAddressImmediate(int64_t V,Type *Ty) const{ return (V > -(1 << 16) && V < (1 << 16)-1); } -bool PPCTargetLowering::isLegalAddressImmediate(llvm::GlobalValue* GV) const { +bool PPCTargetLowering::isLegalAddressImmediate(GlobalValue* GV) const { return false; } @@ -5818,3 +5838,12 @@ EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size, return MVT::i32; } } + +Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const { + unsigned Directive = PPCSubTarget.getDarwinDirective(); + if (Directive == PPC::DIR_440 || Directive == PPC::DIR_A2) + return Sched::ILP; + + return TargetLowering::getSchedulingPreference(N); +} + diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 2e046c4..18eb072 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -95,7 +95,9 @@ namespace llvm { EXTSW_32, /// CALL - A direct function call. - CALL_Darwin, CALL_SVR4, + /// CALL_NOP_SVR4 is a call with the special NOP which follows 64-bit + /// SVR4 calls. + CALL_Darwin, CALL_SVR4, CALL_NOP_SVR4, /// NOP - Special NOP which follows 64-bit SVR4 calls. NOP, @@ -279,6 +281,7 @@ namespace llvm { bool SelectAddressRegImmShift(SDValue N, SDValue &Disp, SDValue &Base, SelectionDAG &DAG) const; + Sched::Preference getSchedulingPreference(SDNode *N) const; /// LowerOperation - Provide custom lowering hooks for some operations. /// @@ -293,7 +296,6 @@ namespace llvm { virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; virtual void computeMaskedBitsForTargetNode(const SDValue Op, - const APInt &Mask, APInt &KnownZero, APInt &KnownOne, const SelectionDAG &DAG, diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index 78f3596..7f67a41 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -89,10 +89,22 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR8] in { let Uses = [RM] in { def BL8_ELF : IForm<18, 0, 1, (outs), (ins calltarget:$func, variable_ops), - "bl $func", BrB, []>; // See Pat patterns below. + "bl $func", BrB, []>; // See Pat patterns below. 
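The BL8_NOP_ELF and BLA8_NOP_ELF forms added below join the call and its trailing nop into one 8-byte pseudo-instruction (GetInstSizeInBytes returns 8 for them later in this patch), implementing the TOC-restore scheme described in the FinishCall hunk above. A sketch of the emitted sequence; the ld offset is the conventional 64-bit SVR4 TOC save slot, not something this patch spells out:

    // bl  callee   <- first word of BL8_NOP_ELF
    // nop          <- second word; for a cross-module call the linker
    //                 rewrites it, conventionally to "ld 2, 40(1)", to
    //                 reload the caller's TOC pointer from its save slot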
+ + let isCodeGenOnly = 1 in + def BL8_NOP_ELF : IForm_and_DForm_4_zero<18, 0, 1, 24, + (outs), (ins calltarget:$func, variable_ops), + "bl $func\n\tnop", BrB, []>; + def BLA8_ELF : IForm<18, 1, 1, (outs), (ins aaddr:$func, variable_ops), "bla $func", BrB, [(PPCcall_SVR4 (i64 imm:$func))]>; + + let isCodeGenOnly = 1 in + def BLA8_NOP_ELF : IForm_and_DForm_4_zero<18, 1, 1, 24, + (outs), (ins aaddr:$func, variable_ops), + "bla $func\n\tnop", BrB, + [(PPCcall_nop_SVR4 (i64 imm:$func))]>; } let Uses = [X11, CTR8, RM] in { def BCTRL8_ELF : XLForm_2_ext<19, 528, 20, 0, 1, @@ -111,8 +123,14 @@ def : Pat<(PPCcall_Darwin (i64 texternalsym:$dst)), def : Pat<(PPCcall_SVR4 (i64 tglobaladdr:$dst)), (BL8_ELF tglobaladdr:$dst)>; +def : Pat<(PPCcall_nop_SVR4 (i64 tglobaladdr:$dst)), + (BL8_NOP_ELF tglobaladdr:$dst)>; + def : Pat<(PPCcall_SVR4 (i64 texternalsym:$dst)), (BL8_ELF texternalsym:$dst)>; +def : Pat<(PPCcall_nop_SVR4 (i64 texternalsym:$dst)), + (BL8_NOP_ELF texternalsym:$dst)>; + def : Pat<(PPCnop), (NOP)>; @@ -506,7 +524,7 @@ def LWAX : XForm_1<31, 341, (outs G8RC:$rD), (ins memrr:$src), let mayLoad = 1 in def LHAU8 : DForm_1a<43, (outs G8RC:$rD, ptr_rc:$ea_result), (ins symbolLo:$disp, ptr_rc:$rA), - "lhau $rD, $disp($rA)", LdStGeneral, + "lhau $rD, $disp($rA)", LdStLoad, []>, RegConstraint<"$rA = $ea_result">, NoEncode<"$ea_result">; // NO LWAU! @@ -516,38 +534,38 @@ def LHAU8 : DForm_1a<43, (outs G8RC:$rD, ptr_rc:$ea_result), (ins symbolLo:$disp // Zero extending loads. let canFoldAsLoad = 1, PPC970_Unit = 2 in { def LBZ8 : DForm_1<34, (outs G8RC:$rD), (ins memri:$src), - "lbz $rD, $src", LdStGeneral, + "lbz $rD, $src", LdStLoad, [(set G8RC:$rD, (zextloadi8 iaddr:$src))]>; def LHZ8 : DForm_1<40, (outs G8RC:$rD), (ins memri:$src), - "lhz $rD, $src", LdStGeneral, + "lhz $rD, $src", LdStLoad, [(set G8RC:$rD, (zextloadi16 iaddr:$src))]>; def LWZ8 : DForm_1<32, (outs G8RC:$rD), (ins memri:$src), - "lwz $rD, $src", LdStGeneral, + "lwz $rD, $src", LdStLoad, [(set G8RC:$rD, (zextloadi32 iaddr:$src))]>, isPPC64; def LBZX8 : XForm_1<31, 87, (outs G8RC:$rD), (ins memrr:$src), - "lbzx $rD, $src", LdStGeneral, + "lbzx $rD, $src", LdStLoad, [(set G8RC:$rD, (zextloadi8 xaddr:$src))]>; def LHZX8 : XForm_1<31, 279, (outs G8RC:$rD), (ins memrr:$src), - "lhzx $rD, $src", LdStGeneral, + "lhzx $rD, $src", LdStLoad, [(set G8RC:$rD, (zextloadi16 xaddr:$src))]>; def LWZX8 : XForm_1<31, 23, (outs G8RC:$rD), (ins memrr:$src), - "lwzx $rD, $src", LdStGeneral, + "lwzx $rD, $src", LdStLoad, [(set G8RC:$rD, (zextloadi32 xaddr:$src))]>; // Update forms. let mayLoad = 1 in { def LBZU8 : DForm_1<35, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr), - "lbzu $rD, $addr", LdStGeneral, + "lbzu $rD, $addr", LdStLoad, []>, RegConstraint<"$addr.reg = $ea_result">, NoEncode<"$ea_result">; def LHZU8 : DForm_1<41, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr), - "lhzu $rD, $addr", LdStGeneral, + "lhzu $rD, $addr", LdStLoad, []>, RegConstraint<"$addr.reg = $ea_result">, NoEncode<"$ea_result">; def LWZU8 : DForm_1<33, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr), - "lwzu $rD, $addr", LdStGeneral, + "lwzu $rD, $addr", LdStLoad, []>, RegConstraint<"$addr.reg = $ea_result">, NoEncode<"$ea_result">; } @@ -595,24 +613,24 @@ def : Pat<(PPCload xaddr:$src), let PPC970_Unit = 2 in { // Truncating stores. 
def STB8 : DForm_1<38, (outs), (ins G8RC:$rS, memri:$src), - "stb $rS, $src", LdStGeneral, + "stb $rS, $src", LdStStore, [(truncstorei8 G8RC:$rS, iaddr:$src)]>; def STH8 : DForm_1<44, (outs), (ins G8RC:$rS, memri:$src), - "sth $rS, $src", LdStGeneral, + "sth $rS, $src", LdStStore, [(truncstorei16 G8RC:$rS, iaddr:$src)]>; def STW8 : DForm_1<36, (outs), (ins G8RC:$rS, memri:$src), - "stw $rS, $src", LdStGeneral, + "stw $rS, $src", LdStStore, [(truncstorei32 G8RC:$rS, iaddr:$src)]>; def STBX8 : XForm_8<31, 215, (outs), (ins G8RC:$rS, memrr:$dst), - "stbx $rS, $dst", LdStGeneral, + "stbx $rS, $dst", LdStStore, [(truncstorei8 G8RC:$rS, xaddr:$dst)]>, PPC970_DGroup_Cracked; def STHX8 : XForm_8<31, 407, (outs), (ins G8RC:$rS, memrr:$dst), - "sthx $rS, $dst", LdStGeneral, + "sthx $rS, $dst", LdStStore, [(truncstorei16 G8RC:$rS, xaddr:$dst)]>, PPC970_DGroup_Cracked; def STWX8 : XForm_8<31, 151, (outs), (ins G8RC:$rS, memrr:$dst), - "stwx $rS, $dst", LdStGeneral, + "stwx $rS, $dst", LdStStore, [(truncstorei32 G8RC:$rS, xaddr:$dst)]>, PPC970_DGroup_Cracked; // Normal 8-byte stores. @@ -629,14 +647,14 @@ let PPC970_Unit = 2 in { def STBU8 : DForm_1a<38, (outs ptr_rc:$ea_res), (ins G8RC:$rS, symbolLo:$ptroff, ptr_rc:$ptrreg), - "stbu $rS, $ptroff($ptrreg)", LdStGeneral, + "stbu $rS, $ptroff($ptrreg)", LdStStore, [(set ptr_rc:$ea_res, (pre_truncsti8 G8RC:$rS, ptr_rc:$ptrreg, iaddroff:$ptroff))]>, RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">; def STHU8 : DForm_1a<45, (outs ptr_rc:$ea_res), (ins G8RC:$rS, symbolLo:$ptroff, ptr_rc:$ptrreg), - "sthu $rS, $ptroff($ptrreg)", LdStGeneral, + "sthu $rS, $ptroff($ptrreg)", LdStStore, [(set ptr_rc:$ea_res, (pre_truncsti16 G8RC:$rS, ptr_rc:$ptrreg, iaddroff:$ptroff))]>, diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td index 707fa41..6c0f3d3 100644 --- a/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/lib/Target/PowerPC/PPCInstrAltivec.td @@ -188,85 +188,85 @@ class VX2_Int<bits<11> xo, string opc, Intrinsic IntID> def DSS : DSS_Form<822, (outs), (ins u5imm:$ZERO0, u5imm:$STRM,u5imm:$ZERO1,u5imm:$ZERO2), - "dss $STRM", LdStGeneral /*FIXME*/, []>; + "dss $STRM", LdStLoad /*FIXME*/, []>; def DSSALL : DSS_Form<822, (outs), (ins u5imm:$ONE, u5imm:$ZERO0,u5imm:$ZERO1,u5imm:$ZERO2), - "dssall", LdStGeneral /*FIXME*/, []>; + "dssall", LdStLoad /*FIXME*/, []>; def DST : DSS_Form<342, (outs), (ins u5imm:$ZERO, u5imm:$STRM, GPRC:$rA, GPRC:$rB), - "dst $rA, $rB, $STRM", LdStGeneral /*FIXME*/, []>; + "dst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>; def DSTT : DSS_Form<342, (outs), (ins u5imm:$ONE, u5imm:$STRM, GPRC:$rA, GPRC:$rB), - "dstt $rA, $rB, $STRM", LdStGeneral /*FIXME*/, []>; + "dstt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>; def DSTST : DSS_Form<374, (outs), (ins u5imm:$ZERO, u5imm:$STRM, GPRC:$rA, GPRC:$rB), - "dstst $rA, $rB, $STRM", LdStGeneral /*FIXME*/, []>; + "dstst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>; def DSTSTT : DSS_Form<374, (outs), (ins u5imm:$ONE, u5imm:$STRM, GPRC:$rA, GPRC:$rB), - "dststt $rA, $rB, $STRM", LdStGeneral /*FIXME*/, []>; + "dststt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>; def DST64 : DSS_Form<342, (outs), (ins u5imm:$ZERO, u5imm:$STRM, G8RC:$rA, GPRC:$rB), - "dst $rA, $rB, $STRM", LdStGeneral /*FIXME*/, []>; + "dst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>; def DSTT64 : DSS_Form<342, (outs), (ins u5imm:$ONE, u5imm:$STRM, G8RC:$rA, GPRC:$rB), - "dstt $rA, $rB, $STRM", LdStGeneral /*FIXME*/, []>; + "dstt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>; def DSTST64 : DSS_Form<374, (outs), (ins 
u5imm:$ZERO, u5imm:$STRM, G8RC:$rA, GPRC:$rB), - "dstst $rA, $rB, $STRM", LdStGeneral /*FIXME*/, []>; + "dstst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>; def DSTSTT64 : DSS_Form<374, (outs), (ins u5imm:$ONE, u5imm:$STRM, G8RC:$rA, GPRC:$rB), - "dststt $rA, $rB, $STRM", LdStGeneral /*FIXME*/, []>; + "dststt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>; def MFVSCR : VXForm_4<1540, (outs VRRC:$vD), (ins), - "mfvscr $vD", LdStGeneral, + "mfvscr $vD", LdStStore, [(set VRRC:$vD, (int_ppc_altivec_mfvscr))]>; def MTVSCR : VXForm_5<1604, (outs), (ins VRRC:$vB), - "mtvscr $vB", LdStGeneral, + "mtvscr $vB", LdStLoad, [(int_ppc_altivec_mtvscr VRRC:$vB)]>; let canFoldAsLoad = 1, PPC970_Unit = 2 in { // Loads. def LVEBX: XForm_1<31, 7, (outs VRRC:$vD), (ins memrr:$src), - "lvebx $vD, $src", LdStGeneral, + "lvebx $vD, $src", LdStLoad, [(set VRRC:$vD, (int_ppc_altivec_lvebx xoaddr:$src))]>; def LVEHX: XForm_1<31, 39, (outs VRRC:$vD), (ins memrr:$src), - "lvehx $vD, $src", LdStGeneral, + "lvehx $vD, $src", LdStLoad, [(set VRRC:$vD, (int_ppc_altivec_lvehx xoaddr:$src))]>; def LVEWX: XForm_1<31, 71, (outs VRRC:$vD), (ins memrr:$src), - "lvewx $vD, $src", LdStGeneral, + "lvewx $vD, $src", LdStLoad, [(set VRRC:$vD, (int_ppc_altivec_lvewx xoaddr:$src))]>; def LVX : XForm_1<31, 103, (outs VRRC:$vD), (ins memrr:$src), - "lvx $vD, $src", LdStGeneral, + "lvx $vD, $src", LdStLoad, [(set VRRC:$vD, (int_ppc_altivec_lvx xoaddr:$src))]>; def LVXL : XForm_1<31, 359, (outs VRRC:$vD), (ins memrr:$src), - "lvxl $vD, $src", LdStGeneral, + "lvxl $vD, $src", LdStLoad, [(set VRRC:$vD, (int_ppc_altivec_lvxl xoaddr:$src))]>; } def LVSL : XForm_1<31, 6, (outs VRRC:$vD), (ins memrr:$src), - "lvsl $vD, $src", LdStGeneral, + "lvsl $vD, $src", LdStLoad, [(set VRRC:$vD, (int_ppc_altivec_lvsl xoaddr:$src))]>, PPC970_Unit_LSU; def LVSR : XForm_1<31, 38, (outs VRRC:$vD), (ins memrr:$src), - "lvsr $vD, $src", LdStGeneral, + "lvsr $vD, $src", LdStLoad, [(set VRRC:$vD, (int_ppc_altivec_lvsr xoaddr:$src))]>, PPC970_Unit_LSU; let PPC970_Unit = 2 in { // Stores. def STVEBX: XForm_8<31, 135, (outs), (ins VRRC:$rS, memrr:$dst), - "stvebx $rS, $dst", LdStGeneral, + "stvebx $rS, $dst", LdStStore, [(int_ppc_altivec_stvebx VRRC:$rS, xoaddr:$dst)]>; def STVEHX: XForm_8<31, 167, (outs), (ins VRRC:$rS, memrr:$dst), - "stvehx $rS, $dst", LdStGeneral, + "stvehx $rS, $dst", LdStStore, [(int_ppc_altivec_stvehx VRRC:$rS, xoaddr:$dst)]>; def STVEWX: XForm_8<31, 199, (outs), (ins VRRC:$rS, memrr:$dst), - "stvewx $rS, $dst", LdStGeneral, + "stvewx $rS, $dst", LdStStore, [(int_ppc_altivec_stvewx VRRC:$rS, xoaddr:$dst)]>; def STVX : XForm_8<31, 231, (outs), (ins VRRC:$rS, memrr:$dst), - "stvx $rS, $dst", LdStGeneral, + "stvx $rS, $dst", LdStStore, [(int_ppc_altivec_stvx VRRC:$rS, xoaddr:$dst)]>; def STVXL : XForm_8<31, 487, (outs), (ins VRRC:$rS, memrr:$dst), - "stvxl $rS, $dst", LdStGeneral, + "stvxl $rS, $dst", LdStStore, [(int_ppc_altivec_stvxl VRRC:$rS, xoaddr:$dst)]>; } diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td index d332e2a..d8e4b2b 100644 --- a/lib/Target/PowerPC/PPCInstrFormats.td +++ b/lib/Target/PowerPC/PPCInstrFormats.td @@ -51,6 +51,36 @@ class PPC970_Unit_VALU { bits<3> PPC970_Unit = 5; } class PPC970_Unit_VPERM { bits<3> PPC970_Unit = 6; } class PPC970_Unit_BRU { bits<3> PPC970_Unit = 7; } +// Two joined instructions; used to emit two adjacent instructions as one. +// The itinerary from the first instruction is used for scheduling and +// classification. 
+class I2<bits<6> opcode1, bits<6> opcode2, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : Instruction { + field bits<64> Inst; + + bit PPC64 = 0; // Default value, override with isPPC64 + + let Namespace = "PPC"; + let Inst{0-5} = opcode1; + let Inst{32-37} = opcode2; + let OutOperandList = OOL; + let InOperandList = IOL; + let AsmString = asmstr; + let Itinerary = itin; + + bits<1> PPC970_First = 0; + bits<1> PPC970_Single = 0; + bits<1> PPC970_Cracked = 0; + bits<3> PPC970_Unit = 0; + + /// These fields correspond to the fields in PPCInstrInfo.h. Any changes to + /// these must be reflected there! See comments there for what these are. + let TSFlags{0} = PPC970_First; + let TSFlags{1} = PPC970_Single; + let TSFlags{2} = PPC970_Cracked; + let TSFlags{5-3} = PPC970_Unit; +} // 1.7.1 I-Form class IForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr, @@ -164,6 +194,35 @@ class DForm_4_zero<bits<6> opcode, dag OOL, dag IOL, string asmstr, let Addr = 0; } +class IForm_and_DForm_1<bits<6> opcode1, bit aa, bit lk, bits<6> opcode2, + dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I2<opcode1, opcode2, OOL, IOL, asmstr, itin> { + bits<5> A; + bits<21> Addr; + + let Pattern = pattern; + bits<24> LI; + + let Inst{6-29} = LI; + let Inst{30} = aa; + let Inst{31} = lk; + + let Inst{38-42} = A; + let Inst{43-47} = Addr{20-16}; // Base Reg + let Inst{48-63} = Addr{15-0}; // Displacement +} + +// This is used to emit BL8+NOP. +class IForm_and_DForm_4_zero<bits<6> opcode1, bit aa, bit lk, bits<6> opcode2, + dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : IForm_and_DForm_1<opcode1, aa, lk, opcode2, + OOL, IOL, asmstr, itin, pattern> { + let A = 0; + let Addr = 0; +} + class DForm_5<bits<6> opcode, dag OOL, dag IOL, string asmstr, InstrItinClass itin> : I<opcode, OOL, IOL, asmstr, itin> { diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 7a8ec40..b45ada9 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -22,6 +22,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" @@ -49,9 +50,9 @@ ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetHazardRecognizer( const TargetMachine *TM, const ScheduleDAG *DAG) const { unsigned Directive = TM->getSubtarget<PPCSubtarget>().getDarwinDirective(); - if (Directive == PPC::DIR_440) { + if (Directive == PPC::DIR_440 || Directive == PPC::DIR_A2) { const InstrItineraryData *II = TM->getInstrItineraryData(); - return new PPCHazardRecognizer440(II, DAG); + return new PPCScoreboardHazardRecognizer(II, DAG); } return TargetInstrInfoImpl::CreateTargetHazardRecognizer(TM, DAG); @@ -65,14 +66,14 @@ ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetPostRAHazardRecognizer( unsigned Directive = TM.getSubtarget<PPCSubtarget>().getDarwinDirective(); // Most subtargets use a PPC970 recognizer. 
- if (Directive != PPC::DIR_440) { + if (Directive != PPC::DIR_440 && Directive != PPC::DIR_A2) { const TargetInstrInfo *TII = TM.getInstrInfo(); assert(TII && "No InstrInfo?"); return new PPCHazardRecognizer970(*TII); } - return TargetInstrInfoImpl::CreateTargetPostRAHazardRecognizer(II, DAG); + return new PPCScoreboardHazardRecognizer(II, DAG); } unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const { @@ -684,6 +685,9 @@ unsigned PPCInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { case PPC::GC_LABEL: case PPC::DBG_VALUE: return 0; + case PPC::BL8_NOP_ELF: + case PPC::BLA8_NOP_ELF: + return 8; default: return 4; // PowerPC instructions are all 4 bytes } diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index 939b71a..748486c 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -116,6 +116,9 @@ def PPCcall_Darwin : SDNode<"PPCISD::CALL_Darwin", SDT_PPCCall, def PPCcall_SVR4 : SDNode<"PPCISD::CALL_SVR4", SDT_PPCCall, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; +def PPCcall_nop_SVR4 : SDNode<"PPCISD::CALL_NOP_SVR4", SDT_PPCCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; def PPCnop : SDNode<"PPCISD::NOP", SDT_PPCnop, [SDNPInGlue, SDNPOutGlue]>; def PPCload : SDNode<"PPCISD::LOAD", SDTypeProfile<1, 1, []>, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; @@ -542,6 +545,9 @@ def DCBZL : DCB_Form<1014, 1, (outs), (ins memrr:$dst), "dcbzl $dst", LdStDCBF, [(int_ppc_dcbzl xoaddr:$dst)]>, PPC970_DGroup_Single; +def : Pat<(prefetch xoaddr:$dst, (i32 0), imm, (i32 1)), + (DCBT xoaddr:$dst)>; + // Atomic operations let usesCustomInserter = 1 in { let Defs = [CR0] in { @@ -637,7 +643,7 @@ def STWCX : XForm_1<31, 150, (outs), (ins GPRC:$rS, memrr:$dst), isDOT; let isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in -def TRAP : XForm_24<31, 4, (outs), (ins), "trap", LdStGeneral, [(trap)]>; +def TRAP : XForm_24<31, 4, (outs), (ins), "trap", LdStLoad, [(trap)]>; //===----------------------------------------------------------------------===// // PPC32 Load Instructions. @@ -646,17 +652,17 @@ def TRAP : XForm_24<31, 4, (outs), (ins), "trap", LdStGeneral, [(trap)]>; // Unindexed (r+i) Loads. let canFoldAsLoad = 1, PPC970_Unit = 2 in { def LBZ : DForm_1<34, (outs GPRC:$rD), (ins memri:$src), - "lbz $rD, $src", LdStGeneral, + "lbz $rD, $src", LdStLoad, [(set GPRC:$rD, (zextloadi8 iaddr:$src))]>; def LHA : DForm_1<42, (outs GPRC:$rD), (ins memri:$src), "lha $rD, $src", LdStLHA, [(set GPRC:$rD, (sextloadi16 iaddr:$src))]>, PPC970_DGroup_Cracked; def LHZ : DForm_1<40, (outs GPRC:$rD), (ins memri:$src), - "lhz $rD, $src", LdStGeneral, + "lhz $rD, $src", LdStLoad, [(set GPRC:$rD, (zextloadi16 iaddr:$src))]>; def LWZ : DForm_1<32, (outs GPRC:$rD), (ins memri:$src), - "lwz $rD, $src", LdStGeneral, + "lwz $rD, $src", LdStLoad, [(set GPRC:$rD, (load iaddr:$src))]>; def LFS : DForm_1<48, (outs F4RC:$rD), (ins memri:$src), @@ -670,22 +676,22 @@ def LFD : DForm_1<50, (outs F8RC:$rD), (ins memri:$src), // Unindexed (r+i) Loads with Update (preinc). 
let mayLoad = 1 in { def LBZU : DForm_1<35, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr), - "lbzu $rD, $addr", LdStGeneral, + "lbzu $rD, $addr", LdStLoad, []>, RegConstraint<"$addr.reg = $ea_result">, NoEncode<"$ea_result">; def LHAU : DForm_1<43, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr), - "lhau $rD, $addr", LdStGeneral, + "lhau $rD, $addr", LdStLoad, []>, RegConstraint<"$addr.reg = $ea_result">, NoEncode<"$ea_result">; def LHZU : DForm_1<41, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr), - "lhzu $rD, $addr", LdStGeneral, + "lhzu $rD, $addr", LdStLoad, []>, RegConstraint<"$addr.reg = $ea_result">, NoEncode<"$ea_result">; def LWZU : DForm_1<33, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr), - "lwzu $rD, $addr", LdStGeneral, + "lwzu $rD, $addr", LdStLoad, []>, RegConstraint<"$addr.reg = $ea_result">, NoEncode<"$ea_result">; @@ -705,25 +711,25 @@ def LFDU : DForm_1<51, (outs F8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr), // let canFoldAsLoad = 1, PPC970_Unit = 2 in { def LBZX : XForm_1<31, 87, (outs GPRC:$rD), (ins memrr:$src), - "lbzx $rD, $src", LdStGeneral, + "lbzx $rD, $src", LdStLoad, [(set GPRC:$rD, (zextloadi8 xaddr:$src))]>; def LHAX : XForm_1<31, 343, (outs GPRC:$rD), (ins memrr:$src), "lhax $rD, $src", LdStLHA, [(set GPRC:$rD, (sextloadi16 xaddr:$src))]>, PPC970_DGroup_Cracked; def LHZX : XForm_1<31, 279, (outs GPRC:$rD), (ins memrr:$src), - "lhzx $rD, $src", LdStGeneral, + "lhzx $rD, $src", LdStLoad, [(set GPRC:$rD, (zextloadi16 xaddr:$src))]>; def LWZX : XForm_1<31, 23, (outs GPRC:$rD), (ins memrr:$src), - "lwzx $rD, $src", LdStGeneral, + "lwzx $rD, $src", LdStLoad, [(set GPRC:$rD, (load xaddr:$src))]>; def LHBRX : XForm_1<31, 790, (outs GPRC:$rD), (ins memrr:$src), - "lhbrx $rD, $src", LdStGeneral, + "lhbrx $rD, $src", LdStLoad, [(set GPRC:$rD, (PPClbrx xoaddr:$src, i16))]>; def LWBRX : XForm_1<31, 534, (outs GPRC:$rD), (ins memrr:$src), - "lwbrx $rD, $src", LdStGeneral, + "lwbrx $rD, $src", LdStLoad, [(set GPRC:$rD, (PPClbrx xoaddr:$src, i32))]>; def LFSX : XForm_25<31, 535, (outs F4RC:$frD), (ins memrr:$src), @@ -741,13 +747,13 @@ def LFDX : XForm_25<31, 599, (outs F8RC:$frD), (ins memrr:$src), // Unindexed (r+i) Stores. 
let PPC970_Unit = 2 in { def STB : DForm_1<38, (outs), (ins GPRC:$rS, memri:$src), - "stb $rS, $src", LdStGeneral, + "stb $rS, $src", LdStStore, [(truncstorei8 GPRC:$rS, iaddr:$src)]>; def STH : DForm_1<44, (outs), (ins GPRC:$rS, memri:$src), - "sth $rS, $src", LdStGeneral, + "sth $rS, $src", LdStStore, [(truncstorei16 GPRC:$rS, iaddr:$src)]>; def STW : DForm_1<36, (outs), (ins GPRC:$rS, memri:$src), - "stw $rS, $src", LdStGeneral, + "stw $rS, $src", LdStStore, [(store GPRC:$rS, iaddr:$src)]>; def STFS : DForm_1<52, (outs), (ins F4RC:$rS, memri:$dst), "stfs $rS, $dst", LdStUX, @@ -761,33 +767,33 @@ def STFD : DForm_1<54, (outs), (ins F8RC:$rS, memri:$dst), let PPC970_Unit = 2 in { def STBU : DForm_1a<39, (outs ptr_rc:$ea_res), (ins GPRC:$rS, symbolLo:$ptroff, ptr_rc:$ptrreg), - "stbu $rS, $ptroff($ptrreg)", LdStGeneral, + "stbu $rS, $ptroff($ptrreg)", LdStStore, [(set ptr_rc:$ea_res, (pre_truncsti8 GPRC:$rS, ptr_rc:$ptrreg, iaddroff:$ptroff))]>, RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">; def STHU : DForm_1a<45, (outs ptr_rc:$ea_res), (ins GPRC:$rS, symbolLo:$ptroff, ptr_rc:$ptrreg), - "sthu $rS, $ptroff($ptrreg)", LdStGeneral, + "sthu $rS, $ptroff($ptrreg)", LdStStore, [(set ptr_rc:$ea_res, (pre_truncsti16 GPRC:$rS, ptr_rc:$ptrreg, iaddroff:$ptroff))]>, RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">; def STWU : DForm_1a<37, (outs ptr_rc:$ea_res), (ins GPRC:$rS, symbolLo:$ptroff, ptr_rc:$ptrreg), - "stwu $rS, $ptroff($ptrreg)", LdStGeneral, + "stwu $rS, $ptroff($ptrreg)", LdStStore, [(set ptr_rc:$ea_res, (pre_store GPRC:$rS, ptr_rc:$ptrreg, iaddroff:$ptroff))]>, RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">; def STFSU : DForm_1a<37, (outs ptr_rc:$ea_res), (ins F4RC:$rS, symbolLo:$ptroff, ptr_rc:$ptrreg), - "stfsu $rS, $ptroff($ptrreg)", LdStGeneral, + "stfsu $rS, $ptroff($ptrreg)", LdStStore, [(set ptr_rc:$ea_res, (pre_store F4RC:$rS, ptr_rc:$ptrreg, iaddroff:$ptroff))]>, RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">; def STFDU : DForm_1a<37, (outs ptr_rc:$ea_res), (ins F8RC:$rS, symbolLo:$ptroff, ptr_rc:$ptrreg), - "stfdu $rS, $ptroff($ptrreg)", LdStGeneral, + "stfdu $rS, $ptroff($ptrreg)", LdStStore, [(set ptr_rc:$ea_res, (pre_store F8RC:$rS, ptr_rc:$ptrreg, iaddroff:$ptroff))]>, RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">; @@ -798,29 +804,29 @@ def STFDU : DForm_1a<37, (outs ptr_rc:$ea_res), (ins F8RC:$rS, // let PPC970_Unit = 2 in { def STBX : XForm_8<31, 215, (outs), (ins GPRC:$rS, memrr:$dst), - "stbx $rS, $dst", LdStGeneral, + "stbx $rS, $dst", LdStStore, [(truncstorei8 GPRC:$rS, xaddr:$dst)]>, PPC970_DGroup_Cracked; def STHX : XForm_8<31, 407, (outs), (ins GPRC:$rS, memrr:$dst), - "sthx $rS, $dst", LdStGeneral, + "sthx $rS, $dst", LdStStore, [(truncstorei16 GPRC:$rS, xaddr:$dst)]>, PPC970_DGroup_Cracked; def STWX : XForm_8<31, 151, (outs), (ins GPRC:$rS, memrr:$dst), - "stwx $rS, $dst", LdStGeneral, + "stwx $rS, $dst", LdStStore, [(store GPRC:$rS, xaddr:$dst)]>, PPC970_DGroup_Cracked; let mayStore = 1 in { def STWUX : XForm_8<31, 183, (outs), (ins GPRC:$rS, GPRC:$rA, GPRC:$rB), - "stwux $rS, $rA, $rB", LdStGeneral, + "stwux $rS, $rA, $rB", LdStStore, []>; } def STHBRX: XForm_8<31, 918, (outs), (ins GPRC:$rS, memrr:$dst), - "sthbrx $rS, $dst", LdStGeneral, + "sthbrx $rS, $dst", LdStStore, [(PPCstbrx GPRC:$rS, xoaddr:$dst, i16)]>, PPC970_DGroup_Cracked; def STWBRX: XForm_8<31, 662, (outs), (ins GPRC:$rS, memrr:$dst), - "stwbrx $rS, $dst", LdStGeneral, + "stwbrx $rS, $dst", LdStStore, [(PPCstbrx GPRC:$rS, xoaddr:$dst, i32)]>, 
PPC970_DGroup_Cracked; diff --git a/lib/Target/PowerPC/PPCJITInfo.cpp b/lib/Target/PowerPC/PPCJITInfo.cpp index 4590f00..a6528c0 100644 --- a/lib/Target/PowerPC/PPCJITInfo.cpp +++ b/lib/Target/PowerPC/PPCJITInfo.cpp @@ -291,9 +291,10 @@ void PPC64CompilationCallback() { } #endif -extern "C" void *PPCCompilationCallbackC(unsigned *StubCallAddrPlus4, - unsigned *OrigCallAddrPlus4, - bool is64Bit) { +extern "C" { +static void* LLVM_ATTRIBUTE_USED PPCCompilationCallbackC(unsigned *StubCallAddrPlus4, + unsigned *OrigCallAddrPlus4, + bool is64Bit) { // Adjust the pointer to the address of the call instruction in the stub // emitted by emitFunctionStub, rather than the instruction after it. unsigned *StubCallAddr = StubCallAddrPlus4 - 1; @@ -337,6 +338,7 @@ extern "C" void *PPCCompilationCallbackC(unsigned *StubCallAddrPlus4, // stack after we restore all regs. return Target; } +} diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index 2976f01..ef13571 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -554,7 +554,8 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // clear can be encoded. This is extremely uncommon, because normally you // only "std" to a stack slot that is at least 4-byte aligned, but it can // happen in invalid code. - if (isInt<16>(Offset) && (!isIXAddr || (Offset & 3) == 0)) { + if (OpC == PPC::DBG_VALUE || // DBG_VALUE is always Reg+Imm + (isInt<16>(Offset) && (!isIXAddr || (Offset & 3) == 0))) { if (isIXAddr) Offset >>= 2; // The actual encoded value has the low two bits zero. MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset); diff --git a/lib/Target/PowerPC/PPCSchedule.td b/lib/Target/PowerPC/PPCSchedule.td index 4e37d0a..8c0a858 100644 --- a/lib/Target/PowerPC/PPCSchedule.td +++ b/lib/Target/PowerPC/PPCSchedule.td @@ -50,7 +50,8 @@ def BrMCRX : InstrItinClass; def LdStDCBA : InstrItinClass; def LdStDCBF : InstrItinClass; def LdStDCBI : InstrItinClass; -def LdStGeneral : InstrItinClass; +def LdStLoad : InstrItinClass; +def LdStStore : InstrItinClass; def LdStDSS : InstrItinClass; def LdStICBI : InstrItinClass; def LdStUX : InstrItinClass; @@ -107,6 +108,7 @@ include "PPCSchedule440.td" include "PPCScheduleG4.td" include "PPCScheduleG4Plus.td" include "PPCScheduleG5.td" +include "PPCScheduleA2.td" //===----------------------------------------------------------------------===// // Instruction to itinerary class map - When add new opcodes to the supported @@ -150,8 +152,8 @@ include "PPCScheduleG5.td" // dcbf LdStDCBF // dcbi LdStDCBI // dcbst LdStDCBF -// dcbt LdStGeneral -// dcbtst LdStGeneral +// dcbt LdStLoad +// dcbtst LdStLoad // dcbz LdStDCBF // divd IntDivD // divdu IntDivD @@ -160,9 +162,9 @@ include "PPCScheduleG5.td" // dss LdStDSS // dst LdStDSS // dstst LdStDSS -// eciwx LdStGeneral -// ecowx LdStGeneral -// eieio LdStGeneral +// eciwx LdStLoad +// ecowx LdStLoad +// eieio LdStLoad // eqv IntGeneral // extsb IntGeneral // extsh IntGeneral @@ -202,10 +204,10 @@ include "PPCScheduleG5.td" // fsubs FPGeneral // icbi LdStICBI // isync SprISYNC -// lbz LdStGeneral -// lbzu LdStGeneral +// lbz LdStLoad +// lbzu LdStLoad // lbzux LdStUX -// lbzx LdStGeneral +// lbzx LdStLoad // ld LdStLD // ldarx LdStLDARX // ldu LdStLD @@ -223,11 +225,11 @@ include "PPCScheduleG5.td" // lhau LdStLHA // lhaux LdStLHA // lhax LdStLHA -// lhbrx LdStGeneral -// lhz LdStGeneral -// lhzu LdStGeneral +// lhbrx LdStLoad +// lhz LdStLoad +// lhzu LdStLoad // lhzux LdStUX -// 
lhzx LdStGeneral +// lhzx LdStLoad // lmw LdStLMW // lswi LdStLMW // lswx LdStLMW @@ -242,11 +244,11 @@ include "PPCScheduleG5.td" // lwarx LdStLWARX // lwaux LdStLHA // lwax LdStLHA -// lwbrx LdStGeneral -// lwz LdStGeneral -// lwzu LdStGeneral +// lwbrx LdStLoad +// lwz LdStLoad +// lwzu LdStLoad // lwzux LdStUX -// lwzx LdStGeneral +// lwzx LdStLoad // mcrf BrMCR // mcrfs FPGeneral // mcrxr BrMCRX @@ -307,10 +309,10 @@ include "PPCScheduleG5.td" // srawi IntShift // srd IntRotateD // srw IntGeneral -// stb LdStGeneral -// stbu LdStGeneral -// stbux LdStGeneral -// stbx LdStGeneral +// stb LdStStore +// stbu LdStStore +// stbux LdStStore +// stbx LdStStore // std LdStSTD // stdcx. LdStSTDCX // stdu LdStSTD @@ -325,11 +327,11 @@ include "PPCScheduleG5.td" // stfsu LdStUX // stfsux LdStUX // stfsx LdStUX -// sth LdStGeneral -// sthbrx LdStGeneral -// sthu LdStGeneral -// sthux LdStGeneral -// sthx LdStGeneral +// sth LdStStore +// sthbrx LdStStore +// sthu LdStStore +// sthux LdStStore +// sthx LdStStore // stmw LdStLMW // stswi LdStLMW // stswx LdStLMW @@ -338,12 +340,12 @@ include "PPCScheduleG5.td" // stvewx LdStSTVEBX // stvx LdStSTVEBX // stvxl LdStSTVEBX -// stw LdStGeneral -// stwbrx LdStGeneral +// stw LdStStore +// stwbrx LdStStore // stwcx. LdStSTWCX -// stwu LdStGeneral -// stwux LdStGeneral -// stwx LdStGeneral +// stwu LdStStore +// stwux LdStStore +// stwx LdStStore // subf IntGeneral // subfc IntGeneral // subfe IntGeneral diff --git a/lib/Target/PowerPC/PPCSchedule440.td b/lib/Target/PowerPC/PPCSchedule440.td index 76f7465..419faea 100644 --- a/lib/Target/PowerPC/PPCSchedule440.td +++ b/lib/Target/PowerPC/PPCSchedule440.td @@ -270,15 +270,23 @@ def PPC440Itineraries : ProcessorItineraries< InstrStage<1, [LWB]>], [8, 5], [NoBypass, GPR_Bypass]>, - InstrItinData<LdStGeneral , [InstrStage<1, [IFTH1, IFTH2]>, + InstrItinData<LdStLoad , [InstrStage<1, [IFTH1, IFTH2]>, InstrStage<1, [PDCD1, PDCD2]>, InstrStage<1, [DISS1, DISS2]>, InstrStage<1, [LRACC]>, InstrStage<1, [AGEN]>, InstrStage<1, [CRD]>, InstrStage<2, [LWB]>], - [9, 5], // FIXME: should be [9, 5] for loads and - // [8, 5] for stores. 
+ [9, 5], + [GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStStore , [InstrStage<1, [IFTH1, IFTH2]>, + InstrStage<1, [PDCD1, PDCD2]>, + InstrStage<1, [DISS1, DISS2]>, + InstrStage<1, [LRACC]>, + InstrStage<1, [AGEN]>, + InstrStage<1, [CRD]>, + InstrStage<2, [LWB]>], + [8, 5], [NoBypass, GPR_Bypass]>, InstrItinData<LdStICBI , [InstrStage<1, [IFTH1, IFTH2]>, InstrStage<1, [PDCD1, PDCD2]>, @@ -345,6 +353,46 @@ def PPC440Itineraries : ProcessorItineraries< InstrStage<1, [LWB]>], [8, 5], [NoBypass, GPR_Bypass]>, + InstrItinData<LdStSTD , [InstrStage<1, [IFTH1, IFTH2]>, + InstrStage<1, [PDCD1, PDCD2]>, + InstrStage<1, [DISS1, DISS2]>, + InstrStage<1, [LRACC]>, + InstrStage<1, [AGEN]>, + InstrStage<1, [CRD]>, + InstrStage<2, [LWB]>], + [8, 5], + [NoBypass, GPR_Bypass]>, + InstrItinData<LdStSTDCX , [InstrStage<1, [IFTH1, IFTH2]>, + InstrStage<1, [PDCD1, PDCD2]>, + InstrStage<1, [DISS1]>, + InstrStage<1, [IRACC], 0>, + InstrStage<4, [LWARX_Hold], 0>, + InstrStage<1, [LRACC]>, + InstrStage<1, [AGEN]>, + InstrStage<1, [CRD]>, + InstrStage<1, [LWB]>], + [8, 5], + [NoBypass, GPR_Bypass]>, + InstrItinData<LdStSTD , [InstrStage<1, [IFTH1, IFTH2]>, + InstrStage<1, [PDCD1, PDCD2]>, + InstrStage<1, [DISS1, DISS2]>, + InstrStage<1, [LRACC]>, + InstrStage<1, [AGEN]>, + InstrStage<1, [CRD]>, + InstrStage<2, [LWB]>], + [8, 5], + [NoBypass, GPR_Bypass]>, + InstrItinData<LdStSTDCX , [InstrStage<1, [IFTH1, IFTH2]>, + InstrStage<1, [PDCD1, PDCD2]>, + InstrStage<1, [DISS1]>, + InstrStage<1, [IRACC], 0>, + InstrStage<4, [LWARX_Hold], 0>, + InstrStage<1, [LRACC]>, + InstrStage<1, [AGEN]>, + InstrStage<1, [CRD]>, + InstrStage<1, [LWB]>], + [8, 5], + [NoBypass, GPR_Bypass]>, InstrItinData<LdStSTWCX , [InstrStage<1, [IFTH1, IFTH2]>, InstrStage<1, [PDCD1, PDCD2]>, InstrStage<1, [DISS1]>, diff --git a/lib/Target/PowerPC/PPCScheduleA2.td b/lib/Target/PowerPC/PPCScheduleA2.td new file mode 100644 index 0000000..857ba40 --- /dev/null +++ b/lib/Target/PowerPC/PPCScheduleA2.td @@ -0,0 +1,652 @@ +//===- PPCScheduleA2.td - PPC A2 Scheduling Definitions --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// Primary reference: +// A2 Processor User's Manual. +// IBM (as updated in) 2010. 
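+//
+// Each InstrItinData record below gives, in order: an itinerary class, the
+// sequence of InstrStages the instruction occupies, the per-operand cycle
+// counts (for example [10, 7, 7]: the defined operand is available at
+// cycle 10, the two source operands are read at cycle 7), and the bypass
+// used for each of those operands.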
+ +//===----------------------------------------------------------------------===// +// Functional units on the PowerPC A2 chip sets +// +def IU0to3_0 : FuncUnit; // Fetch unit 1 to 4 slot 1 +def IU0to3_1 : FuncUnit; // Fetch unit 1 to 4 slot 2 +def IU0to3_2 : FuncUnit; // Fetch unit 1 to 4 slot 3 +def IU0to3_3 : FuncUnit; // Fetch unit 1 to 4 slot 4 +def IU4_0 : FuncUnit; // Instruction buffer slot 1 +def IU4_1 : FuncUnit; // Instruction buffer slot 2 +def IU4_2 : FuncUnit; // Instruction buffer slot 3 +def IU4_3 : FuncUnit; // Instruction buffer slot 4 +def IU4_4 : FuncUnit; // Instruction buffer slot 5 +def IU4_5 : FuncUnit; // Instruction buffer slot 6 +def IU4_6 : FuncUnit; // Instruction buffer slot 7 +def IU4_7 : FuncUnit; // Instruction buffer slot 8 +def IU5 : FuncUnit; // Dependency resolution +def IU6 : FuncUnit; // Instruction issue +def RF0 : FuncUnit; +def XRF1 : FuncUnit; +def XEX1 : FuncUnit; // Execution stage 1 for the XU pipeline +def XEX2 : FuncUnit; // Execution stage 2 for the XU pipeline +def XEX3 : FuncUnit; // Execution stage 3 for the XU pipeline +def XEX4 : FuncUnit; // Execution stage 4 for the XU pipeline +def XEX5 : FuncUnit; // Execution stage 5 for the XU pipeline +def XEX6 : FuncUnit; // Execution stage 6 for the XU pipeline +def FRF1 : FuncUnit; +def FEX1 : FuncUnit; // Execution stage 1 for the FU pipeline +def FEX2 : FuncUnit; // Execution stage 2 for the FU pipeline +def FEX3 : FuncUnit; // Execution stage 3 for the FU pipeline +def FEX4 : FuncUnit; // Execution stage 4 for the FU pipeline +def FEX5 : FuncUnit; // Execution stage 5 for the FU pipeline +def FEX6 : FuncUnit; // Execution stage 6 for the FU pipeline + +def CR_Bypass : Bypass; // The bypass for condition regs. +//def GPR_Bypass : Bypass; // The bypass for general-purpose regs. +//def FPR_Bypass : Bypass; // The bypass for floating-point regs. + +// +// This file defines the itinerary class data for the PPC A2 processor. 
+// +//===----------------------------------------------------------------------===// + + +def PPCA2Itineraries : ProcessorItineraries< + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3, + IU4_0, IU4_1, IU4_2, IU4_3, IU4_4, IU4_5, IU4_6, IU4_7, + IU5, IU6, RF0, XRF1, XEX1, XEX2, XEX3, XEX4, XEX5, XEX6, + FRF1, FEX1, FEX2, FEX3, FEX4, FEX5, FEX6], + [CR_Bypass, GPR_Bypass, FPR_Bypass], [ + InstrItinData<IntGeneral , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [10, 7, 7], + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntCompare , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [10, 7, 7], + [CR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntDivW , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<38, [XEX6]>], + [53, 7, 7], + [NoBypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntMFFS , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [10, 7, 7], + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntMTFSB0 , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [10, 7, 7], + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntMulHW , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [14, 7, 7], + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntMulHWU , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [14, 7, 7], + 
[GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntMulLI , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [15, 7, 7], + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntRotate , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [10, 7, 7], + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntShift , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [10, 7, 7], + [GPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<IntTrapW , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [10, 7, 7], + [GPR_Bypass, GPR_Bypass]>, + InstrItinData<BrB , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [15, 7, 7], + [NoBypass, GPR_Bypass]>, + InstrItinData<BrCR , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [10, 7, 7], + [CR_Bypass, CR_Bypass, CR_Bypass]>, + InstrItinData<BrMCR , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [10, 7, 7], + [CR_Bypass, CR_Bypass, CR_Bypass]>, + InstrItinData<BrMCRX , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + 
InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [10, 7, 7], + [CR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStDCBA , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [13, 11], + [NoBypass, GPR_Bypass]>, + InstrItinData<LdStDCBF , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [13, 11], + [NoBypass, GPR_Bypass]>, + InstrItinData<LdStDCBI , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [13, 11], + [NoBypass, GPR_Bypass]>, + InstrItinData<LdStLoad , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [14, 7], + [GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStStore , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [13, 7], + [GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStICBI , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [14, 7], + [NoBypass, GPR_Bypass]>, + InstrItinData<LdStUX , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [14, 7, 7], + [NoBypass, FPR_Bypass, FPR_Bypass]>, + InstrItinData<LdStLFD , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, 
+ InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [14, 7, 7], + [FPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStLFDU , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [14, 7, 7], + [FPR_Bypass, GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStLHA , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [14, 7], + [NoBypass, GPR_Bypass]>, + InstrItinData<LdStLMW , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [14, 7], + [NoBypass, GPR_Bypass]>, + InstrItinData<LdStLWARX , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<13, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [26, 7], + [NoBypass, GPR_Bypass]>, + InstrItinData<LdStSTD , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [13, 7], + [GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStSTDCX , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<13, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [26, 7], + [NoBypass, GPR_Bypass]>, + InstrItinData<LdStSTD , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [13, 7], + [GPR_Bypass, GPR_Bypass]>, + InstrItinData<LdStSTDCX , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<13, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, 
[XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [26, 7], + [NoBypass, GPR_Bypass]>, + InstrItinData<LdStSTWCX , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<13, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [26, 7], + [NoBypass, GPR_Bypass]>, + InstrItinData<LdStSync , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<12, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>]>, + InstrItinData<SprISYNC , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<14, [XEX6]>]>, + InstrItinData<SprMFSR , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [15, 7], + [GPR_Bypass, NoBypass]>, + InstrItinData<SprMTMSR , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [15, 7], + [NoBypass, GPR_Bypass]>, + InstrItinData<SprMTSR , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [15, 7], + [NoBypass, GPR_Bypass]>, + InstrItinData<SprTLBSYNC , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<14, [XEX6]>]>, + InstrItinData<SprMFCR , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [10, 7], + [GPR_Bypass, CR_Bypass]>, + InstrItinData<SprMFMSR , [InstrStage<4, + 
[IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [15, 7], + [GPR_Bypass, NoBypass]>, + InstrItinData<SprMFSPR , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [15, 7], + [NoBypass, GPR_Bypass]>, + InstrItinData<SprMFTB , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<14, [XEX6]>], + [29, 7], + [NoBypass, GPR_Bypass]>, + InstrItinData<SprMTSPR , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>], + [15, 7], + [NoBypass, GPR_Bypass]>, + InstrItinData<SprMTSRIN , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<14, [XEX6]>], + [29, 7], + [NoBypass, GPR_Bypass]>, + InstrItinData<SprRFI , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<14, [XEX6]>], + [29, 7], + [NoBypass, GPR_Bypass]>, + InstrItinData<SprSC , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>, + InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>, + InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>, + InstrStage<1, [XEX5]>, InstrStage<14, [XEX6]>], + [29, 7], + [NoBypass, GPR_Bypass]>, + InstrItinData<FPGeneral , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [FRF1]>, + InstrStage<1, [FEX1]>, InstrStage<1, [FEX2]>, + InstrStage<1, [FEX3]>, InstrStage<1, [FEX4]>, + InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>], + [15, 7, 7], + [FPR_Bypass, FPR_Bypass, FPR_Bypass]>, + InstrItinData<FPCompare , [InstrStage<4, + [IU0to3_0, 
IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [FRF1]>, + InstrStage<1, [FEX1]>, InstrStage<1, [FEX2]>, + InstrStage<1, [FEX3]>, InstrStage<1, [FEX4]>, + InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>], + [13, 7, 7], + [CR_Bypass, FPR_Bypass, FPR_Bypass]>, + InstrItinData<FPDivD , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<71, [FRF1], 0>, + InstrStage<71, [FEX1], 0>, + InstrStage<71, [FEX2], 0>, + InstrStage<71, [FEX3], 0>, + InstrStage<71, [FEX4], 0>, + InstrStage<71, [FEX5], 0>, + InstrStage<71, [FEX6]>], + [86, 7, 7], + [NoBypass, FPR_Bypass, FPR_Bypass]>, + InstrItinData<FPDivS , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<58, [FRF1], 0>, + InstrStage<58, [FEX1], 0>, + InstrStage<58, [FEX2], 0>, + InstrStage<58, [FEX3], 0>, + InstrStage<58, [FEX4], 0>, + InstrStage<58, [FEX5], 0>, + InstrStage<58, [FEX6]>], + [73, 7, 7], + [NoBypass, FPR_Bypass, FPR_Bypass]>, + InstrItinData<FPSqrt , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<68, [FRF1], 0>, + InstrStage<68, [FEX1], 0>, + InstrStage<68, [FEX2], 0>, + InstrStage<68, [FEX3], 0>, + InstrStage<68, [FEX4], 0>, + InstrStage<68, [FEX5], 0>, + InstrStage<68, [FEX6]>], + [86, 7], // FIXME: should be [86, 7] for double + // and [82, 7] for single. Likewise, + // the FEX? cycle count should be 68 + // for double and 64 for single. 
+ [NoBypass, FPR_Bypass]>, + InstrItinData<FPFused , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [FRF1]>, + InstrStage<1, [FEX1]>, InstrStage<1, [FEX2]>, + InstrStage<1, [FEX3]>, InstrStage<1, [FEX4]>, + InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>], + [15, 7, 7, 7], + [FPR_Bypass, FPR_Bypass, FPR_Bypass, FPR_Bypass]>, + InstrItinData<FPRes , [InstrStage<4, + [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>, + InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3, + IU4_4, IU4_5, IU4_6, IU4_7]>, + InstrStage<1, [IU5]>, InstrStage<1, [IU6]>, + InstrStage<1, [RF0]>, InstrStage<1, [FRF1]>, + InstrStage<1, [FEX1]>, InstrStage<1, [FEX2]>, + InstrStage<1, [FEX3]>, InstrStage<1, [FEX4]>, + InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>], + [15, 7], + [FPR_Bypass, FPR_Bypass]> +]>; diff --git a/lib/Target/PowerPC/PPCScheduleG3.td b/lib/Target/PowerPC/PPCScheduleG3.td index e7e5498..bc926f7 100644 --- a/lib/Target/PowerPC/PPCScheduleG3.td +++ b/lib/Target/PowerPC/PPCScheduleG3.td @@ -32,7 +32,8 @@ def G3Itineraries : ProcessorItineraries< InstrItinData<LdStDCBA , [InstrStage<2, [SLU]>]>, InstrItinData<LdStDCBF , [InstrStage<3, [SLU]>]>, InstrItinData<LdStDCBI , [InstrStage<3, [SLU]>]>, - InstrItinData<LdStGeneral , [InstrStage<2, [SLU]>]>, + InstrItinData<LdStLoad , [InstrStage<2, [SLU]>]>, + InstrItinData<LdStStore , [InstrStage<2, [SLU]>]>, InstrItinData<LdStICBI , [InstrStage<3, [SLU]>]>, InstrItinData<LdStUX , [InstrStage<2, [SLU]>]>, InstrItinData<LdStLFD , [InstrStage<2, [SLU]>]>, diff --git a/lib/Target/PowerPC/PPCScheduleG4.td b/lib/Target/PowerPC/PPCScheduleG4.td index 87a3151..f7ec1e0 100644 --- a/lib/Target/PowerPC/PPCScheduleG4.td +++ b/lib/Target/PowerPC/PPCScheduleG4.td @@ -31,7 +31,8 @@ def G4Itineraries : ProcessorItineraries< InstrItinData<BrMCRX , [InstrStage<1, [SRU]>]>, InstrItinData<LdStDCBF , [InstrStage<2, [SLU]>]>, InstrItinData<LdStDCBI , [InstrStage<2, [SLU]>]>, - InstrItinData<LdStGeneral , [InstrStage<2, [SLU]>]>, + InstrItinData<LdStLoad , [InstrStage<2, [SLU]>]>, + InstrItinData<LdStStore , [InstrStage<2, [SLU]>]>, InstrItinData<LdStDSS , [InstrStage<2, [SLU]>]>, InstrItinData<LdStICBI , [InstrStage<2, [SLU]>]>, InstrItinData<LdStUX , [InstrStage<2, [SLU]>]>, diff --git a/lib/Target/PowerPC/PPCScheduleG4Plus.td b/lib/Target/PowerPC/PPCScheduleG4Plus.td index f76557a..37ebfc5 100644 --- a/lib/Target/PowerPC/PPCScheduleG4Plus.td +++ b/lib/Target/PowerPC/PPCScheduleG4Plus.td @@ -34,7 +34,8 @@ def G4PlusItineraries : ProcessorItineraries< InstrItinData<BrMCRX , [InstrStage<2, [IU2]>]>, InstrItinData<LdStDCBF , [InstrStage<3, [SLU]>]>, InstrItinData<LdStDCBI , [InstrStage<3, [SLU]>]>, - InstrItinData<LdStGeneral , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStLoad , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStStore , [InstrStage<3, [SLU]>]>, InstrItinData<LdStDSS , [InstrStage<3, [SLU]>]>, InstrItinData<LdStICBI , [InstrStage<3, [IU2]>]>, InstrItinData<LdStUX , [InstrStage<3, [SLU]>]>, diff --git a/lib/Target/PowerPC/PPCScheduleG5.td b/lib/Target/PowerPC/PPCScheduleG5.td index bc0820b..d1e40ce 100644 --- a/lib/Target/PowerPC/PPCScheduleG5.td +++ b/lib/Target/PowerPC/PPCScheduleG5.td @@ -35,7 +35,8 @@ def G5Itineraries : ProcessorItineraries< InstrItinData<BrMCR , [InstrStage<2, [BPU]>]>, InstrItinData<BrMCRX , [InstrStage<3, [BPU]>]>, InstrItinData<LdStDCBF , [InstrStage<3, [SLU]>]>, - InstrItinData<LdStGeneral , 
[InstrStage<3, [SLU]>]>, + InstrItinData<LdStLoad , [InstrStage<3, [SLU]>]>, + InstrItinData<LdStStore , [InstrStage<3, [SLU]>]>, InstrItinData<LdStDSS , [InstrStage<10, [SLU]>]>, InstrItinData<LdStICBI , [InstrStage<40, [SLU]>]>, InstrItinData<LdStUX , [InstrStage<4, [SLU]>]>, diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp index c89fab3..f405b47 100644 --- a/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/lib/Target/PowerPC/PPCSubtarget.cpp @@ -146,10 +146,11 @@ bool PPCSubtarget::enablePostRAScheduler( CodeGenOpt::Level OptLevel, TargetSubtargetInfo::AntiDepBreakMode& Mode, RegClassVector& CriticalPathRCs) const { - if (DarwinDirective == PPC::DIR_440) - return false; + if (DarwinDirective == PPC::DIR_440 || DarwinDirective == PPC::DIR_A2) + Mode = TargetSubtargetInfo::ANTIDEP_ALL; + else + Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL; - Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL; CriticalPathRCs.clear(); if (isPPC64()) diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h index 69fe50b..a275029 100644 --- a/lib/Target/PowerPC/PPCSubtarget.h +++ b/lib/Target/PowerPC/PPCSubtarget.h @@ -40,6 +40,7 @@ namespace PPC { DIR_7400, DIR_750, DIR_970, + DIR_A2, DIR_64 }; } @@ -144,6 +145,8 @@ public: /// isDarwin - True if this is any darwin platform. bool isDarwin() const { return TargetTriple.isMacOSX(); } + /// isBGP - True if this is a BG/P platform. + bool isBGP() const { return TargetTriple.getVendor() == Triple::BGP; } bool isDarwinABI() const { return isDarwin(); } bool isSVR4ABI() const { return !isDarwin(); } diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index ba9c779..d113976 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -39,6 +39,10 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, StringRef TT, FrameLowering(Subtarget), JITInfo(*this, is64Bit), TLInfo(*this), TSInfo(*this), InstrItins(Subtarget.getInstrItineraryData()) { + + // The binutils for the BG/P are too old for CFI. + if (Subtarget.isBGP()) + setMCUseCFI(false); } void PPC32TargetMachine::anchor() { } diff --git a/lib/Target/README.txt b/lib/Target/README.txt index 1f69ffb..093255e 100644 --- a/lib/Target/README.txt +++ b/lib/Target/README.txt @@ -2,22 +2,6 @@ Target Independent Opportunities: //===---------------------------------------------------------------------===// -With the recent changes to make the implicit def/use set explicit in -machineinstrs, we should change the target descriptions for 'call' instructions -so that the .td files don't list all the call-clobbered registers as implicit -defs. Instead, these should be added by the code generator (e.g. on the dag). - -This has a number of uses: - -1. PPC32/64 and X86 32/64 can avoid having multiple copies of call instructions - for their different impdef sets. -2. Targets with multiple calling convs (e.g. x86) which have different clobber - sets don't need copies of call instructions. -3. 'Interprocedural register allocation' can be done to reduce the clobber sets - of calls. - -//===---------------------------------------------------------------------===// - We should recognized various "overflow detection" idioms and translate them into llvm.uadd.with.overflow and similar intrinsics. Here is a multiply idiom: @@ -961,6 +945,25 @@ optimized with "clang -emit-llvm-bc | opt -std-compile-opts". 
//===---------------------------------------------------------------------===// +int g(int x) { return (x - 10) < 0; } +Should combine to "x <= 9" (the sub has nsw). Currently not +optimized with "clang -emit-llvm-bc | opt -std-compile-opts". + +//===---------------------------------------------------------------------===// + +int g(int x) { return (x + 10) < 0; } +Should combine to "x < -10" (the add has nsw). Currently not +optimized with "clang -emit-llvm-bc | opt -std-compile-opts". + +//===---------------------------------------------------------------------===// + +int f(int i, int j) { return i < j + 1; } +int g(int i, int j) { return j > i - 1; } +Should combine to "i <= j" (the add/sub has nsw). Currently not +optimized with "clang -emit-llvm-bc | opt -std-compile-opts". + +//===---------------------------------------------------------------------===// + This was noticed in the entryblock for grokdeclarator in 403.gcc: %tmp = icmp eq i32 %decl_context, 4 @@ -2358,3 +2361,8 @@ unsigned foo(unsigned x, unsigned y) { return x > y && x != 0; } should fold to x > y. //===---------------------------------------------------------------------===// + +int f(double x) { return __builtin_fabs(x) < 0.0; } +should fold to false. + +//===---------------------------------------------------------------------===// diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h index 616e1c5..f0e1354 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h @@ -14,10 +14,10 @@ #ifndef SPARCTARGETASMINFO_H #define SPARCTARGETASMINFO_H -#include "llvm/ADT/StringRef.h" #include "llvm/MC/MCAsmInfo.h" namespace llvm { + class StringRef; class Target; class SparcELFMCAsmInfo : public MCAsmInfo { diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h index 2fd9e3f..cba775a 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h @@ -15,9 +15,7 @@ #define SPARCMCTARGETDESC_H namespace llvm { -class MCSubtargetInfo; class Target; -class StringRef; extern Target TheSparcTarget; extern Target TheSparcV9Target; diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp index ee12633..c3e6f16 100644 --- a/lib/Target/Sparc/SparcISelLowering.cpp +++ b/lib/Target/Sparc/SparcISelLowering.cpp @@ -832,22 +832,19 @@ const char *SparcTargetLowering::getTargetNodeName(unsigned Opcode) const { /// be zero. Op is expected to be a target specific node. Used by DAG /// combiner. void SparcTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, - const APInt &Mask, APInt &KnownZero, APInt &KnownOne, const SelectionDAG &DAG, unsigned Depth) const { APInt KnownZero2, KnownOne2; - KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. 
+ KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0); switch (Op.getOpcode()) { default: break; case SPISD::SELECT_ICC: case SPISD::SELECT_FCC: - DAG.ComputeMaskedBits(Op.getOperand(1), Mask, KnownZero, KnownOne, - Depth+1); - DAG.ComputeMaskedBits(Op.getOperand(0), Mask, KnownZero2, KnownOne2, - Depth+1); + DAG.ComputeMaskedBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1); + DAG.ComputeMaskedBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1); assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h index f483c96..cf43048 100644 --- a/lib/Target/Sparc/SparcISelLowering.h +++ b/lib/Target/Sparc/SparcISelLowering.h @@ -50,7 +50,6 @@ namespace llvm { /// in Mask are known to be either zero or one and return them in the /// KnownZero/KnownOne bitsets. virtual void computeMaskedBitsForTargetNode(const SDValue Op, - const APInt &Mask, APInt &KnownZero, APInt &KnownOne, const SelectionDAG &DAG, diff --git a/lib/Target/TargetData.cpp b/lib/Target/TargetData.cpp index 3acb4dd..acb7476 100644 --- a/lib/Target/TargetData.cpp +++ b/lib/Target/TargetData.cpp @@ -373,7 +373,7 @@ unsigned TargetData::getAlignmentInfo(AlignTypeEnum AlignType, // If the alignment is not a power of 2, round up to the next power of 2. // This happens for non-power-of-2 length vectors. if (Align & (Align-1)) - Align = llvm::NextPowerOf2(Align); + Align = NextPowerOf2(Align); return Align; } } diff --git a/lib/Target/TargetLibraryInfo.cpp b/lib/Target/TargetLibraryInfo.cpp index 269958f..ec95ad4 100644 --- a/lib/Target/TargetLibraryInfo.cpp +++ b/lib/Target/TargetLibraryInfo.cpp @@ -56,7 +56,7 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] = "exp2f", "expm1", "expm1l", - "expl1f", + "expm1f", "fabs", "fabsl", "fabsf", @@ -95,6 +95,9 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] = "rint", "rintf", "rintl", + "round", + "roundf", + "roundl", "sin", "sinl", "sinf", @@ -155,6 +158,81 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T) { TLI.setUnavailable(LibFunc::siprintf); TLI.setUnavailable(LibFunc::fiprintf); } + + if (T.getOS() == Triple::Win32) { + // Win32 does not support long double + TLI.setUnavailable(LibFunc::acosl); + TLI.setUnavailable(LibFunc::asinl); + TLI.setUnavailable(LibFunc::atanl); + TLI.setUnavailable(LibFunc::atan2l); + TLI.setUnavailable(LibFunc::ceill); + TLI.setUnavailable(LibFunc::copysignl); + TLI.setUnavailable(LibFunc::cosl); + TLI.setUnavailable(LibFunc::coshl); + TLI.setUnavailable(LibFunc::expl); + TLI.setUnavailable(LibFunc::fabsf); // Win32 and Win64 both lack fabsf + TLI.setUnavailable(LibFunc::fabsl); + TLI.setUnavailable(LibFunc::floorl); + TLI.setUnavailable(LibFunc::fmodl); + TLI.setUnavailable(LibFunc::logl); + TLI.setUnavailable(LibFunc::powl); + TLI.setUnavailable(LibFunc::sinl); + TLI.setUnavailable(LibFunc::sinhl); + TLI.setUnavailable(LibFunc::sqrtl); + TLI.setUnavailable(LibFunc::tanl); + TLI.setUnavailable(LibFunc::tanhl); + + // Win32 only has C89 math + TLI.setUnavailable(LibFunc::exp2); + TLI.setUnavailable(LibFunc::exp2f); + TLI.setUnavailable(LibFunc::exp2l); + TLI.setUnavailable(LibFunc::expm1); + TLI.setUnavailable(LibFunc::expm1f); + TLI.setUnavailable(LibFunc::expm1l); + TLI.setUnavailable(LibFunc::log2); + TLI.setUnavailable(LibFunc::log2f); + TLI.setUnavailable(LibFunc::log2l); + TLI.setUnavailable(LibFunc::log1p); + 
TLI.setUnavailable(LibFunc::log1pf); + TLI.setUnavailable(LibFunc::log1pl); + TLI.setUnavailable(LibFunc::nearbyint); + TLI.setUnavailable(LibFunc::nearbyintf); + TLI.setUnavailable(LibFunc::nearbyintl); + TLI.setUnavailable(LibFunc::rint); + TLI.setUnavailable(LibFunc::rintf); + TLI.setUnavailable(LibFunc::rintl); + TLI.setUnavailable(LibFunc::round); + TLI.setUnavailable(LibFunc::roundf); + TLI.setUnavailable(LibFunc::roundl); + TLI.setUnavailable(LibFunc::trunc); + TLI.setUnavailable(LibFunc::truncf); + TLI.setUnavailable(LibFunc::truncl); + + // Win32 provides some C99 math with mangled names + TLI.setAvailableWithName(LibFunc::copysign, "_copysign"); + + if (T.getArch() == Triple::x86) { + // Win32 on x86 implements single-precision math functions as macros + TLI.setUnavailable(LibFunc::acosf); + TLI.setUnavailable(LibFunc::asinf); + TLI.setUnavailable(LibFunc::atanf); + TLI.setUnavailable(LibFunc::atan2f); + TLI.setUnavailable(LibFunc::ceilf); + TLI.setUnavailable(LibFunc::copysignf); + TLI.setUnavailable(LibFunc::cosf); + TLI.setUnavailable(LibFunc::coshf); + TLI.setUnavailable(LibFunc::expf); + TLI.setUnavailable(LibFunc::floorf); + TLI.setUnavailable(LibFunc::fmodf); + TLI.setUnavailable(LibFunc::logf); + TLI.setUnavailable(LibFunc::powf); + TLI.setUnavailable(LibFunc::sinf); + TLI.setUnavailable(LibFunc::sinhf); + TLI.setUnavailable(LibFunc::sqrtf); + TLI.setUnavailable(LibFunc::tanf); + TLI.setUnavailable(LibFunc::tanhf); + } + } } diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp index 1589604..2570e0d 100644 --- a/lib/Target/TargetLoweringObjectFile.cpp +++ b/lib/Target/TargetLoweringObjectFile.cpp @@ -28,7 +28,6 @@ #include "llvm/Support/Dwarf.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/SmallString.h" using namespace llvm; //===----------------------------------------------------------------------===// diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp index b4969ca..b9b2526 100644 --- a/lib/Target/TargetMachine.cpp +++ b/lib/Target/TargetMachine.cpp @@ -11,9 +11,10 @@ // //===----------------------------------------------------------------------===// +#include "llvm/GlobalValue.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCCodeGenInfo.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetOptions.h" #include "llvm/Support/CommandLine.h" using namespace llvm; @@ -74,6 +75,27 @@ CodeModel::Model TargetMachine::getCodeModel() const { return CodeGenInfo->getCodeModel(); } +TLSModel::Model TargetMachine::getTLSModel(const GlobalValue *GV) const { + bool isLocal = GV->hasLocalLinkage(); + bool isDeclaration = GV->isDeclaration(); + // FIXME: what should we do for protected and internal visibility? + // For variables, is internal different from hidden? + bool isHidden = GV->hasHiddenVisibility(); + + if (getRelocationModel() == Reloc::PIC_ && + !Options.PositionIndependentExecutable) { + if (isLocal || isHidden) + return TLSModel::LocalDynamic; + else + return TLSModel::GeneralDynamic; + } else { + if (!isDeclaration || isHidden) + return TLSModel::LocalExec; + else + return TLSModel::InitialExec; + } +} + /// getOptLevel - Returns the optimization level: None, Less, /// Default, or Aggressive. 
CodeGenOpt::Level TargetMachine::getOptLevel() const { diff --git a/lib/Target/TargetMachineC.cpp b/lib/Target/TargetMachineC.cpp new file mode 100644 index 0000000..d6bba8b --- /dev/null +++ b/lib/Target/TargetMachineC.cpp @@ -0,0 +1,197 @@ +//===-- TargetMachine.cpp -------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the LLVM-C part of TargetMachine.h +// +//===----------------------------------------------------------------------===// + +#include "llvm-c/Core.h" +#include "llvm-c/Target.h" +#include "llvm-c/TargetMachine.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/Module.h" +#include "llvm/PassManager.h" +#include <cassert> +#include <cstdlib> +#include <cstring> + +using namespace llvm; + + + +LLVMTargetRef LLVMGetFirstTarget() { + const Target* target = &*TargetRegistry::begin(); + return wrap(target); +} +LLVMTargetRef LLVMGetNextTarget(LLVMTargetRef T) { + return wrap(unwrap(T)->getNext()); +} + +const char * LLVMGetTargetName(LLVMTargetRef T) { + return unwrap(T)->getName(); +} + +const char * LLVMGetTargetDescription(LLVMTargetRef T) { + return unwrap(T)->getShortDescription(); +} + +LLVMBool LLVMTargetHasJIT(LLVMTargetRef T) { + return unwrap(T)->hasJIT(); +} + +LLVMBool LLVMTargetHasTargetMachine(LLVMTargetRef T) { + return unwrap(T)->hasTargetMachine(); +} + +LLVMBool LLVMTargetHasAsmBackend(LLVMTargetRef T) { + return unwrap(T)->hasMCAsmBackend(); +} + +LLVMTargetMachineRef LLVMCreateTargetMachine(LLVMTargetRef T, char* Triple, + char* CPU, char* Features, LLVMCodeGenOptLevel Level, LLVMRelocMode Reloc, + LLVMCodeModel CodeModel) { + Reloc::Model RM; + switch (Reloc){ + case LLVMRelocStatic: + RM = Reloc::Static; + break; + case LLVMRelocPIC: + RM = Reloc::PIC_; + break; + case LLVMRelocDynamicNoPic: + RM = Reloc::DynamicNoPIC; + break; + default: + RM = Reloc::Default; + break; + } + + CodeModel::Model CM; + switch (CodeModel) { + case LLVMCodeModelJITDefault: + CM = CodeModel::JITDefault; + break; + case LLVMCodeModelSmall: + CM = CodeModel::Small; + break; + case LLVMCodeModelKernel: + CM = CodeModel::Kernel; + break; + case LLVMCodeModelMedium: + CM = CodeModel::Medium; + break; + case LLVMCodeModelLarge: + CM = CodeModel::Large; + break; + default: + CM = CodeModel::Default; + break; + } + CodeGenOpt::Level OL; + + switch (Level) { + case LLVMCodeGenLevelNone: + OL = CodeGenOpt::None; + break; + case LLVMCodeGenLevelLess: + OL = CodeGenOpt::Less; + break; + case LLVMCodeGenLevelAggressive: + OL = CodeGenOpt::Aggressive; + break; + default: + OL = CodeGenOpt::Default; + break; + } + + TargetOptions opt; + return wrap(unwrap(T)->createTargetMachine(Triple, CPU, Features, opt, RM, + CM, OL)); +} + + +void LLVMDisposeTargetMachine(LLVMTargetMachineRef T) { + delete unwrap(T); +} + +LLVMTargetRef LLVMGetTargetMachineTarget(LLVMTargetMachineRef T) { + const Target* target = &(unwrap(T)->getTarget()); + return wrap(target); +} + +char* LLVMGetTargetMachineTriple(LLVMTargetMachineRef T) { + std::string StringRep = unwrap(T)->getTargetTriple(); + return strdup(StringRep.c_str()); +} + +char* 
LLVMGetTargetMachineCPU(LLVMTargetMachineRef T) { + std::string StringRep = unwrap(T)->getTargetCPU(); + return strdup(StringRep.c_str()); +} + +char* LLVMGetTargetMachineFeatureString(LLVMTargetMachineRef T) { + std::string StringRep = unwrap(T)->getTargetFeatureString(); + return strdup(StringRep.c_str()); +} + +LLVMTargetDataRef LLVMGetTargetMachineData(LLVMTargetMachineRef T) { + return wrap(unwrap(T)->getTargetData()); +} + +LLVMBool LLVMTargetMachineEmitToFile(LLVMTargetMachineRef T, LLVMModuleRef M, + char* Filename, LLVMCodeGenFileType codegen, char** ErrorMessage) { + TargetMachine* TM = unwrap(T); + Module* Mod = unwrap(M); + + PassManager pass; + + std::string error; + + const TargetData* td = TM->getTargetData(); + + if (!td) { + error = "No TargetData in TargetMachine"; + *ErrorMessage = strdup(error.c_str()); + return true; + } + pass.add(new TargetData(*td)); + + TargetMachine::CodeGenFileType ft; + switch (codegen) { + case LLVMAssemblyFile: + ft = TargetMachine::CGFT_AssemblyFile; + break; + default: + ft = TargetMachine::CGFT_ObjectFile; + break; + } + raw_fd_ostream dest(Filename, error, raw_fd_ostream::F_Binary); + formatted_raw_ostream destf(dest); + if (!error.empty()) { + *ErrorMessage = strdup(error.c_str()); + return true; + } + + if (TM->addPassesToEmitFile(pass, destf, ft)) { + error = "No TargetData in TargetMachine"; + *ErrorMessage = strdup(error.c_str()); + return true; + } + + pass.run(*Mod); + + destf.flush(); + dest.flush(); + return false; +} diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index 9e88472..08c732c 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -17,7 +17,6 @@ #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" -#include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringSwitch.h" @@ -951,20 +950,21 @@ ParseInstruction(StringRef Name, SMLoc NameLoc, if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) && (PatchedName.endswith("ss") || PatchedName.endswith("sd") || PatchedName.endswith("ps") || PatchedName.endswith("pd"))) { - bool IsVCMP = PatchedName.startswith("vcmp"); + bool IsVCMP = PatchedName[0] == 'v'; unsigned SSECCIdx = IsVCMP ? 
4 : 3; unsigned SSEComparisonCode = StringSwitch<unsigned>( PatchedName.slice(SSECCIdx, PatchedName.size() - 2)) - .Case("eq", 0) - .Case("lt", 1) - .Case("le", 2) - .Case("unord", 3) - .Case("neq", 4) - .Case("nlt", 5) - .Case("nle", 6) - .Case("ord", 7) - .Case("eq_uq", 8) - .Case("nge", 9) + .Case("eq", 0x00) + .Case("lt", 0x01) + .Case("le", 0x02) + .Case("unord", 0x03) + .Case("neq", 0x04) + .Case("nlt", 0x05) + .Case("nle", 0x06) + .Case("ord", 0x07) + /* AVX only from here */ + .Case("eq_uq", 0x08) + .Case("nge", 0x09) .Case("ngt", 0x0A) .Case("false", 0x0B) .Case("neq_oq", 0x0C) @@ -988,7 +988,7 @@ ParseInstruction(StringRef Name, SMLoc NameLoc, .Case("gt_oq", 0x1E) .Case("true_us", 0x1F) .Default(~0U); - if (SSEComparisonCode != ~0U) { + if (SSEComparisonCode != ~0U && (IsVCMP || SSEComparisonCode < 8)) { ExtraImmOp = MCConstantExpr::Create(SSEComparisonCode, getParser().getContext()); if (PatchedName.endswith("ss")) { diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp index 8278bde..b13a006 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -322,7 +322,12 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, OperandType type = (OperandType)operand.type; + bool isBranch = false; + uint64_t pcrel = 0; if (type == TYPE_RELv) { + isBranch = true; + pcrel = insn.startLocation + + insn.displacementOffset + insn.displacementSize; switch (insn.displacementSize) { default: break; @@ -373,8 +378,6 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, } } - bool isBranch = false; - uint64_t pcrel = 0; switch (type) { case TYPE_XMM128: mcInst.addOperand(MCOperand::CreateReg(X86::XMM0 + (immediate >> 4))); diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c index fbd81d2..6020877 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c @@ -1527,6 +1527,9 @@ static int readOperands(struct InternalInstruction* insn) { if (insn->spec->operands[index].type == TYPE_IMM3 && insn->immediates[insn->numImmediatesConsumed - 1] > 7) return -1; + if (insn->spec->operands[index].type == TYPE_IMM5 && + insn->immediates[insn->numImmediatesConsumed - 1] > 31) + return -1; if (insn->spec->operands[index].type == TYPE_XMM128 || insn->spec->operands[index].type == TYPE_XMM256) sawRegImm = 1; diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h index d2e30f1..13e1136 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h @@ -273,6 +273,7 @@ struct ContextDecision { ENUM_ENTRY(TYPE_IMM32, "4-byte") \ ENUM_ENTRY(TYPE_IMM64, "8-byte") \ ENUM_ENTRY(TYPE_IMM3, "1-byte immediate operand between 0 and 7") \ + ENUM_ENTRY(TYPE_IMM5, "1-byte immediate operand between 0 and 31") \ ENUM_ENTRY(TYPE_RM8, "1-byte register or memory operand") \ ENUM_ENTRY(TYPE_RM16, "2-byte") \ ENUM_ENTRY(TYPE_RM32, "4-byte") \ diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp index b7ccb4c..5118e4c 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp @@ -19,6 +19,8 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" +#include 
"llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/FormattedStream.h" @@ -26,7 +28,6 @@ using namespace llvm; // Include the auto-generated portion of the assembly writer. -#define GET_INSTRUCTION_NAME #define PRINT_ALIAS_INSTR #include "X86GenAsmWriter.inc" @@ -49,10 +50,6 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, EmitAnyX86InstComments(MI, *CommentStream, getRegisterName); } -StringRef X86ATTInstPrinter::getOpcodeName(unsigned Opcode) const { - return getInstructionName(Opcode); -} - void X86ATTInstPrinter::printSSECC(const MCInst *MI, unsigned Op, raw_ostream &O) { switch (MI->getOperand(Op).getImm()) { diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h index ff94301..2e00bff 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h @@ -22,12 +22,12 @@ class MCOperand; class X86ATTInstPrinter : public MCInstPrinter { public: - X86ATTInstPrinter(const MCAsmInfo &MAI, const MCRegisterInfo &MRI) - : MCInstPrinter(MAI, MRI) {} + X86ATTInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} virtual void printRegName(raw_ostream &OS, unsigned RegNo) const; virtual void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot); - virtual StringRef getOpcodeName(unsigned Opcode) const; // Autogenerated by tblgen, returns true if we successfully printed an // alias. @@ -36,7 +36,6 @@ public: // Autogenerated by tblgen. void printInstruction(const MCInst *MI, raw_ostream &OS); static const char *getRegisterName(unsigned RegNo); - static const char *getInstructionName(unsigned Opcode); void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS); void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &OS); diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp index 46a96d2..4ea662c 100644 --- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp @@ -17,15 +17,13 @@ #include "X86InstComments.h" #include "MCTargetDesc/X86MCTargetDesc.h" #include "llvm/MC/MCInst.h" -#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInstrInfo.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" #include <cctype> using namespace llvm; -// Include the auto-generated portion of the assembly writer. 
-#define GET_INSTRUCTION_NAME #include "X86GenAsmWriter1.inc" void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { @@ -43,9 +41,6 @@ void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, if (CommentStream) EmitAnyX86InstComments(MI, *CommentStream, getRegisterName); } -StringRef X86IntelInstPrinter::getOpcodeName(unsigned Opcode) const { - return getInstructionName(Opcode); -} void X86IntelInstPrinter::printSSECC(const MCInst *MI, unsigned Op, raw_ostream &O) { diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h index ea1d38a..4f5938d 100644 --- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h +++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h @@ -23,17 +23,16 @@ class MCOperand; class X86IntelInstPrinter : public MCInstPrinter { public: - X86IntelInstPrinter(const MCAsmInfo &MAI, const MCRegisterInfo &MRI) - : MCInstPrinter(MAI, MRI) {} + X86IntelInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} virtual void printRegName(raw_ostream &OS, unsigned RegNo) const; virtual void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot); - virtual StringRef getOpcodeName(unsigned Opcode) const; // Autogenerated by tblgen. void printInstruction(const MCInst *MI, raw_ostream &O); static const char *getRegisterName(unsigned RegNo); - static const char *getInstructionName(unsigned Opcode); void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &O); diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 3f770f7..32e40fe 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -9,7 +9,6 @@ #include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86FixupKinds.h" -#include "llvm/ADT/Twine.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCELFObjectWriter.h" diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index 003a14a..afa545c 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -155,4 +155,7 @@ X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) { AssemblerDialect = AsmWriterFlavor; TextAlignFillValue = 0x90; + + // Exceptions handling + ExceptionsType = ExceptionHandling::DwarfCFI; } diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index efd18c7..3482363 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -474,12 +474,13 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT, static MCInstPrinter *createX86MCInstPrinter(const Target &T, unsigned SyntaxVariant, const MCAsmInfo &MAI, + const MCInstrInfo &MII, const MCRegisterInfo &MRI, const MCSubtargetInfo &STI) { if (SyntaxVariant == 0) - return new X86ATTInstPrinter(MAI, MRI); + return new X86ATTInstPrinter(MAI, MII, MRI); if (SyntaxVariant == 1) - return new X86IntelInstPrinter(MAI, MRI); + return new X86IntelInstPrinter(MAI, MII, MRI); return 0; } diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt index f9c1d35..6a8a4fd 100644 --- a/lib/Target/X86/README.txt +++ b/lib/Target/X86/README.txt @@ -2060,3 +2060,21 @@ Instead we could generate: The trick is 
to match "fetch_and_add(X, -C) == C". //===---------------------------------------------------------------------===// + +unsigned t(unsigned a, unsigned b) { + return a <= b ? 5 : -5; +} + +We generate: + movl $5, %ecx + cmpl %esi, %edi + movl $-5, %eax + cmovbel %ecx, %eax + +GCC: + cmpl %edi, %esi + sbbl %eax, %eax + andl $-10, %eax + addl $5, %eax + +//===---------------------------------------------------------------------===// diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp index 32c722a..a802333 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -169,6 +169,9 @@ void DecodeUNPCKLMask(EVT VT, SmallVectorImpl<int> &ShuffleMask) { void DecodeVPERM2X128Mask(EVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + if (Imm & 0x88) + return; // Not a shuffle + unsigned HalfSize = VT.getVectorNumElements()/2; unsigned FstHalfBegin = (Imm & 0x3) * HalfSize; unsigned SndHalfBegin = ((Imm >> 4) & 0x3) * HalfSize; diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index f1cedf3..7db7ccb 100644 --- a/lib/Target/X86/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -19,7 +19,6 @@ #include "X86MachineFunctionInfo.h" #include "X86TargetMachine.h" #include "InstPrinter/X86ATTInstPrinter.h" -#include "InstPrinter/X86IntelInstPrinter.h" #include "llvm/CallingConv.h" #include "llvm/DerivedTypes.h" #include "llvm/Module.h" @@ -265,8 +264,8 @@ void X86AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, void X86AsmPrinter::printSSECC(const MachineInstr *MI, unsigned Op, raw_ostream &O) { unsigned char value = MI->getOperand(Op).getImm(); - assert(value <= 7 && "Invalid ssecc argument!"); switch (value) { + default: llvm_unreachable("Invalid ssecc argument!"); case 0: O << "eq"; break; case 1: O << "lt"; break; case 2: O << "le"; break; diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 3d63b7e..69752c5 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -2179,7 +2179,7 @@ bool X86FastISel::TryToFoldLoad(MachineInstr *MI, unsigned OpNo, namespace llvm { - llvm::FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo) { + FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo) { return new X86FastISel(funcInfo); } } diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index 936df27..ed1707d 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -28,7 +28,6 @@ #include "X86InstrInfo.h" #include "llvm/InlineAsm.h" #include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -219,7 +218,7 @@ namespace { /// getSTReg - Return the X86::ST(i) register which contains the specified /// FP<RegNo> register. unsigned getSTReg(unsigned RegNo) const { - return StackTop - 1 - getSlot(RegNo) + llvm::X86::ST0; + return StackTop - 1 - getSlot(RegNo) + X86::ST0; } // pushReg - Push the specified FP<n> register onto the stack. 
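(Editorial note on the lib/Target/X86/README.txt entry above, not part of the patch: GCC's branch-free sequence works because "cmpl %edi, %esi" sets the carry flag exactly when b < a, and "sbbl %eax, %eax" turns that borrow into an all-zeros or all-ones mask. The same computation written out in C:

unsigned t(unsigned a, unsigned b) {
  // mask is 0 when a <= b, 0xFFFFFFFF otherwise -- this is what
  // "cmpl; sbbl %eax, %eax" materializes without a branch.
  unsigned mask = (a <= b) ? 0u : ~0u;
  // andl $-10 maps 0 -> 0 and -1 -> -10; addl $5 then yields 5 or -5.
  return (mask & -10u) + 5u;
}

The entry records this sbb-based expansion as a potentially better selection than the cmov sequence currently generated.)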
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 9405c2f..8e2b1d6 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -36,7 +36,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" using namespace llvm; @@ -621,14 +620,14 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { // Handle X86-64 rip-relative addresses. We check this before checking direct // folding because RIP is preferable to non-RIP accesses. - if (Subtarget->is64Bit() && + if (Subtarget->is64Bit() && N.getOpcode() == X86ISD::WrapperRIP && // Under X86-64 non-small code model, GV (and friends) are 64-bits, so // they cannot be folded into immediate fields. // FIXME: This can be improved for kernel and other models? - (M == CodeModel::Small || M == CodeModel::Kernel) && - // Base and index reg must be 0 in order to use %rip as base and lowering - // must allow RIP. - !AM.hasBaseOrIndexReg() && N.getOpcode() == X86ISD::WrapperRIP) { + (M == CodeModel::Small || M == CodeModel::Kernel)) { + // Base and index reg must be 0 in order to use %rip as base. + if (AM.hasBaseOrIndexReg()) + return true; if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) { X86ISelAddressMode Backup = AM; AM.GV = G->getGlobal(); @@ -663,11 +662,12 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { } // Handle the case when globals fit in our immediate field: This is true for - // X86-32 always and X86-64 when in -static -mcmodel=small mode. In 64-bit - // mode, this results in a non-RIP-relative computation. + // X86-32 always and X86-64 when in -mcmodel=small mode. In 64-bit + // mode, this only applies to a non-RIP-relative computation. if (!Subtarget->is64Bit() || - ((M == CodeModel::Small || M == CodeModel::Kernel) && - TM.getRelocationModel() == Reloc::Static)) { + M == CodeModel::Small || M == CodeModel::Kernel) { + assert(N.getOpcode() != X86ISD::WrapperRIP && + "RIP-relative addressing already handled"); if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) { AM.GV = G->getGlobal(); AM.Disp += G->getOffset(); @@ -897,7 +897,7 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, APInt MaskedHighBits = APInt::getHighBitsSet(X.getValueSizeInBits(), MaskLZ); APInt KnownZero, KnownOne; - DAG.ComputeMaskedBits(X, MaskedHighBits, KnownZero, KnownOne); + DAG.ComputeMaskedBits(X, KnownZero, KnownOne); if (MaskedHighBits != KnownZero) return true; // We've identified a pattern that can be transformed into a single shift @@ -1848,6 +1848,96 @@ static bool HasNoSignedComparisonUses(SDNode *N) { return true; } +/// isLoadIncOrDecStore - Check whether or not the chain ending in StoreNode +/// is suitable for doing the {load; increment or decrement; store} to modify +/// transformation. +static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc, + SDValue StoredVal, SelectionDAG *CurDAG, + LoadSDNode* &LoadNode, SDValue &InputChain) { + + // is the value stored the result of a DEC or INC? + if (!(Opc == X86ISD::DEC || Opc == X86ISD::INC)) return false; + + // is the stored value result 0 of the load? + if (StoredVal.getResNo() != 0) return false; + + // are there other uses of the loaded value than the inc or dec? + if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false; + + // is the store non-extending and non-indexed? 
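// (Editorial note, not part of the patch: when every check in this helper
// passes, the caller collapses the three-instruction sequence
//     movq (%rdi), %rax
//     decq %rax
//     movq %rax, (%rdi)
// into the single read-modify-write form
//     decq (%rdi)
// which is only sound if there is exactly one load and one store, they use
// the same address, and nothing else observes the loaded value -- hence
// the checks that follow.)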
+ if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal()) + return false; + + SDValue Load = StoredVal->getOperand(0); + // Is the stored value a non-extending and non-indexed load? + if (!ISD::isNormalLoad(Load.getNode())) return false; + + // Return LoadNode by reference. + LoadNode = cast<LoadSDNode>(Load); + // is the size of the value one that we can handle? (i.e. 64, 32, 16, or 8) + EVT LdVT = LoadNode->getMemoryVT(); + if (LdVT != MVT::i64 && LdVT != MVT::i32 && LdVT != MVT::i16 && + LdVT != MVT::i8) + return false; + + // Is store the only read of the loaded value? + if (!Load.hasOneUse()) + return false; + + // Is the address of the store the same as the load? + if (LoadNode->getBasePtr() != StoreNode->getBasePtr() || + LoadNode->getOffset() != StoreNode->getOffset()) + return false; + + // Check if the chain is produced by the load or is a TokenFactor with + // the load output chain as an operand. Return InputChain by reference. + SDValue Chain = StoreNode->getChain(); + + bool ChainCheck = false; + if (Chain == Load.getValue(1)) { + ChainCheck = true; + InputChain = LoadNode->getChain(); + } else if (Chain.getOpcode() == ISD::TokenFactor) { + SmallVector<SDValue, 4> ChainOps; + for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) { + SDValue Op = Chain.getOperand(i); + if (Op == Load.getValue(1)) { + ChainCheck = true; + continue; + } + ChainOps.push_back(Op); + } + + if (ChainCheck) + // Make a new TokenFactor with all the other input chains except + // for the load. + InputChain = CurDAG->getNode(ISD::TokenFactor, Chain.getDebugLoc(), + MVT::Other, &ChainOps[0], ChainOps.size()); + } + if (!ChainCheck) + return false; + + return true; +} + +/// getFusedLdStOpcode - Get the appropriate X86 opcode for an in memory +/// increment or decrement. Opc should be X86ISD::DEC or X86ISD::INC. +static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) { + if (Opc == X86ISD::DEC) { + if (LdVT == MVT::i64) return X86::DEC64m; + if (LdVT == MVT::i32) return X86::DEC32m; + if (LdVT == MVT::i16) return X86::DEC16m; + if (LdVT == MVT::i8) return X86::DEC8m; + } else { + assert(Opc == X86ISD::INC && "unrecognized opcode"); + if (LdVT == MVT::i64) return X86::INC64m; + if (LdVT == MVT::i32) return X86::INC32m; + if (LdVT == MVT::i16) return X86::INC16m; + if (LdVT == MVT::i8) return X86::INC8m; + } + llvm_unreachable("unrecognized size for LdVT"); +} + SDNode *X86DAGToDAGISel::Select(SDNode *Node) { EVT NVT = Node->getValueType(0); unsigned Opc, MOpc; @@ -2355,9 +2445,13 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { break; } case ISD::STORE: { + // Change a chain of {load; incr or dec; store} of the same value into + // a simple increment or decrement through memory of that value, if the + // uses of the modified value and its address are suitable. // The DEC64m tablegen pattern is currently not able to match the case where - // the EFLAGS on the original DEC are used. - // we'll need to improve tablegen to allow flags to be transferred from a + // the EFLAGS on the original DEC are used. (This also applies to + // {INC,DEC}X{64,32,16,8}.) + // We'll need to improve tablegen to allow flags to be transferred from a // node in the pattern to the result node. 
probably with a new keyword // for example, we have this // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", @@ -2367,44 +2461,17 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", // [(store (add (loadi64 addr:$dst), -1), addr:$dst), // (transferrable EFLAGS)]>; + StoreSDNode *StoreNode = cast<StoreSDNode>(Node); - SDValue Chain = StoreNode->getOperand(0); SDValue StoredVal = StoreNode->getOperand(1); - SDValue Address = StoreNode->getOperand(2); - SDValue Undef = StoreNode->getOperand(3); - - if (StoreNode->getMemOperand()->getSize() != 8 || - Undef->getOpcode() != ISD::UNDEF || - Chain->getOpcode() != ISD::LOAD || - StoredVal->getOpcode() != X86ISD::DEC || - StoredVal.getResNo() != 0 || - !StoredVal.getNode()->hasNUsesOfValue(1, 0) || - !Chain.getNode()->hasNUsesOfValue(1, 0) || - StoredVal->getOperand(0).getNode() != Chain.getNode()) - break; + unsigned Opc = StoredVal->getOpcode(); - //OPC_CheckPredicate, 1, // Predicate_nontemporalstore - if (StoreNode->isNonTemporal()) + LoadSDNode *LoadNode = 0; + SDValue InputChain; + if (!isLoadIncOrDecStore(StoreNode, Opc, StoredVal, CurDAG, + LoadNode, InputChain)) break; - LoadSDNode *LoadNode = cast<LoadSDNode>(Chain.getNode()); - if (LoadNode->getOperand(1) != Address || - LoadNode->getOperand(2) != Undef) - break; - - if (!ISD::isNormalLoad(LoadNode)) - break; - - if (!ISD::isNormalStore(StoreNode)) - break; - - // check load chain has only one use (from the store) - if (!Chain.hasOneUse()) - break; - - // Merge the input chains if they are not intra-pattern references. - SDValue InputChain = LoadNode->getOperand(0); - SDValue Base, Scale, Index, Disp, Segment; if (!SelectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp, Segment)) @@ -2414,7 +2481,9 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { MemOp[0] = StoreNode->getMemOperand(); MemOp[1] = LoadNode->getMemOperand(); const SDValue Ops[] = { Base, Scale, Index, Disp, Segment, InputChain }; - MachineSDNode *Result = CurDAG->getMachineNode(X86::DEC64m, + EVT LdVT = LoadNode->getMemoryVT(); + unsigned newOpc = getFusedLdStOpcode(LdVT, Opc); + MachineSDNode *Result = CurDAG->getMachineNode(newOpc, Node->getDebugLoc(), MVT::i32, MVT::Other, Ops, array_lengthof(Ops)); @@ -2465,6 +2534,6 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode, /// X86-specific DAG, ready for instruction scheduling. /// FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM, - llvm::CodeGenOpt::Level OptLevel) { + CodeGenOpt::Level OptLevel) { return new X86DAGToDAGISel(TM, OptLevel); } diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 88f3829..04299f3 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1578,18 +1578,20 @@ X86TargetLowering::LowerReturn(SDValue Chain, MVT::Other, &RetOps[0], RetOps.size()); } -bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const { +bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { if (N->getNumValues() != 1) return false; if (!N->hasNUsesOfValue(1, 0)) return false; + SDValue TCChain = Chain; SDNode *Copy = *N->use_begin(); if (Copy->getOpcode() == ISD::CopyToReg) { // If the copy has a glue operand, we conservatively assume it isn't safe to // perform a tail call. 
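// (Editorial note, not part of the patch: the new Chain out-parameter
// reports the chain feeding the final CopyToReg, so a caller that proves a
// libcall's result is only returned can splice that chain and emit the
// call as a tail call. Hypothetical C-level effect:
//     void *zero(void *p, size_t n) { return memset(p, 0, n); }
// may now lower to "jmp memset" rather than "call memset; ret".)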
if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) return false; + TCChain = Copy->getOperand(0); } else if (Copy->getOpcode() != ISD::FP_EXTEND) return false; @@ -1601,7 +1603,11 @@ bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const { HasRet = true; } - return HasRet; + if (!HasRet) + return false; + + Chain = TCChain; + return true; } EVT @@ -2929,6 +2935,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::VPERMILP: + case X86ISD::VPERMI: return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); } } @@ -3970,6 +3977,27 @@ unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) { return Index / NumElemsPerChunk; } +/// getShuffleCLImmediate - Return the appropriate immediate to shuffle +/// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions. +/// Handles 256-bit. +static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) { + EVT VT = N->getValueType(0); + + unsigned NumElts = VT.getVectorNumElements(); + + assert((VT.is256BitVector() && NumElts == 4) && + "Unsupported vector type for VPERMQ/VPERMPD"); + + unsigned Mask = 0; + for (unsigned i = 0; i != NumElts; ++i) { + int Elt = N->getMaskElt(i); + if (Elt < 0) + continue; + Mask |= Elt << (i*2); + } + + return Mask; +} /// isZeroNode - Returns true if Elt is a constant zero or a floating point /// constant +0.0. bool X86::isZeroNode(SDValue Elt) { @@ -4402,6 +4430,7 @@ static bool getTargetShuffleMask(SDNode *N, EVT VT, case X86ISD::VPERM2X128: ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + if (Mask.empty()) return false; break; case X86ISD::MOVDDUP: case X86ISD::MOVLHPD: @@ -4852,41 +4881,42 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, return SDValue(); } -/// isVectorBroadcast - Check if the node chain is suitable to be xformed to -/// a vbroadcast node. We support two patterns: -/// 1. A splat BUILD_VECTOR which uses a single scalar load. +/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction +/// to generate a splat value for the following cases: +/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant. /// 2. A splat shuffle which uses a scalar_to_vector node which comes from -/// a scalar load. -/// The scalar load node is returned when a pattern is found, +/// a scalar load, or a constant. +/// The VBROADCAST node is returned when a pattern is found, /// or SDValue() otherwise. -static SDValue isVectorBroadcast(SDValue &Op, const X86Subtarget *Subtarget) { +SDValue +X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const { if (!Subtarget->hasAVX()) return SDValue(); EVT VT = Op.getValueType(); - SDValue V = Op; - - if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) - V = V.getOperand(0); + DebugLoc dl = Op.getDebugLoc(); - //A suspected load to be broadcasted. SDValue Ld; + bool ConstSplatVal; - switch (V.getOpcode()) { + switch (Op.getOpcode()) { default: // Unknown pattern found. return SDValue(); case ISD::BUILD_VECTOR: { // The BUILD_VECTOR node must be a splat. - if (!isSplatVector(V.getNode())) + if (!isSplatVector(Op.getNode())) return SDValue(); - Ld = V.getOperand(0); + Ld = Op.getOperand(0); + ConstSplatVal = (Ld.getOpcode() == ISD::Constant || + Ld.getOpcode() == ISD::ConstantFP); // The suspected load node has several users. Make sure that all // of its users are from the BUILD_VECTOR node. 
- if (!Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0)) + // Constants may have multiple users. + if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0)) return SDValue(); break; } @@ -4904,15 +4934,50 @@ static SDValue isVectorBroadcast(SDValue &Op, const X86Subtarget *Subtarget) { return SDValue(); Ld = Sc.getOperand(0); + ConstSplatVal = (Ld.getOpcode() == ISD::Constant || + Ld.getOpcode() == ISD::ConstantFP); // The scalar_to_vector node and the suspected // load node must have exactly one user. - if (!Sc.hasOneUse() || !Ld.hasOneUse()) + // Constants may have multiple users. + if (!ConstSplatVal && (!Sc.hasOneUse() || !Ld.hasOneUse())) return SDValue(); break; } } + bool Is256 = VT.getSizeInBits() == 256; + bool Is128 = VT.getSizeInBits() == 128; + + // Handle the broadcasting a single constant scalar from the constant pool + // into a vector. On Sandybridge it is still better to load a constant vector + // from the constant pool and not to broadcast it from a scalar. + if (ConstSplatVal && Subtarget->hasAVX2()) { + EVT CVT = Ld.getValueType(); + assert(!CVT.isVector() && "Must not broadcast a vector type"); + unsigned ScalarSize = CVT.getSizeInBits(); + + if ((Is256 && (ScalarSize == 32 || ScalarSize == 64)) || + (Is128 && (ScalarSize == 32))) { + + const Constant *C = 0; + if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) + C = CI->getConstantIntValue(); + else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld)) + C = CF->getConstantFPValue(); + + assert(C && "Invalid constant type"); + + SDValue CP = DAG.getConstantPool(C, getPointerTy()); + unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); + Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP, + MachinePointerInfo::getConstantPool(), + false, false, false, Alignment); + + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + } + } + // The scalar source must be a normal load. if (!ISD::isNormalLoad(Ld.getNode())) return SDValue(); @@ -4921,28 +4986,26 @@ static SDValue isVectorBroadcast(SDValue &Op, const X86Subtarget *Subtarget) { if (Ld->hasAnyUseOfValue(1)) return SDValue(); - bool Is256 = VT.getSizeInBits() == 256; - bool Is128 = VT.getSizeInBits() == 128; unsigned ScalarSize = Ld.getValueType().getSizeInBits(); // VBroadcast to YMM if (Is256 && (ScalarSize == 32 || ScalarSize == 64)) - return Ld; + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // VBroadcast to XMM if (Is128 && (ScalarSize == 32)) - return Ld; + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The integer check is needed for the 64-bit into 128-bit so it doesn't match // double since there is vbroadcastsd xmm if (Subtarget->hasAVX2() && Ld.getValueType().isInteger()) { // VBroadcast to YMM if (Is256 && (ScalarSize == 8 || ScalarSize == 16)) - return Ld; + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // VBroadcast to XMM if (Is128 && (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) - return Ld; + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } // Unsupported broadcast. 
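(Editorial aside, not part of the patch: the constant-splat path above trades a full-width vector constant for a scalar constant-pool entry plus a broadcast. A source-level example of a splat it applies to, assuming an AVX2 target:)

#include <immintrin.h>

// An 8 x float splat; under the new lowering the 4.0f lives in a 4-byte
// constant-pool slot and is broadcast, e.g.
//   vbroadcastss .LCPI0_0(%rip), %ymm0
// instead of loading a 32-byte vector constant with vmovaps.
__m256 splat4(void) { return _mm256_set1_ps(4.0f); }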
@@ -4977,9 +5040,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return getOnesVector(VT, Subtarget->hasAVX2(), DAG, dl); } - SDValue LD = isVectorBroadcast(Op, Subtarget); - if (LD.getNode()) - return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD); + SDValue Broadcast = LowerVectorBroadcast(Op, DAG); + if (Broadcast.getNode()) + return Broadcast; unsigned EVTBits = ExtVT.getSizeInBits(); @@ -5343,6 +5406,85 @@ X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { return LowerAVXCONCAT_VECTORS(Op, DAG); } +// Try to lower a shuffle node into a simple blend instruction. +static SDValue LowerVECTOR_SHUFFLEtoBlend(SDValue Op, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + SDValue V1 = SVOp->getOperand(0); + SDValue V2 = SVOp->getOperand(1); + DebugLoc dl = SVOp->getDebugLoc(); + EVT VT = Op.getValueType(); + EVT InVT = V1.getValueType(); + int MaskSize = VT.getVectorNumElements(); + int InSize = InVT.getVectorNumElements(); + + if (!Subtarget->hasSSE41()) + return SDValue(); + + if (MaskSize != InSize) + return SDValue(); + + int ISDNo = 0; + MVT OpTy; + + switch (VT.getSimpleVT().SimpleTy) { + default: return SDValue(); + case MVT::v8i16: + ISDNo = X86ISD::BLENDPW; + OpTy = MVT::v8i16; + break; + case MVT::v4i32: + case MVT::v4f32: + ISDNo = X86ISD::BLENDPS; + OpTy = MVT::v4f32; + break; + case MVT::v2i64: + case MVT::v2f64: + ISDNo = X86ISD::BLENDPD; + OpTy = MVT::v2f64; + break; + case MVT::v8i32: + case MVT::v8f32: + if (!Subtarget->hasAVX()) + return SDValue(); + ISDNo = X86ISD::BLENDPS; + OpTy = MVT::v8f32; + break; + case MVT::v4i64: + case MVT::v4f64: + if (!Subtarget->hasAVX()) + return SDValue(); + ISDNo = X86ISD::BLENDPD; + OpTy = MVT::v4f64; + break; + case MVT::v16i16: + if (!Subtarget->hasAVX2()) + return SDValue(); + ISDNo = X86ISD::BLENDPW; + OpTy = MVT::v16i16; + break; + } + assert(ISDNo && "Invalid Op Number"); + + unsigned MaskVals = 0; + + for (int i = 0; i < MaskSize; ++i) { + int EltIdx = SVOp->getMaskElt(i); + if (EltIdx == i || EltIdx == -1) + MaskVals |= (1<<i); + else if (EltIdx == (i + MaskSize)) + continue; // Bit is set to zero; + else return SDValue(); + } + + V1 = DAG.getNode(ISD::BITCAST, dl, OpTy, V1); + V2 = DAG.getNode(ISD::BITCAST, dl, OpTy, V2); + SDValue Ret = DAG.getNode(ISDNo, dl, OpTy, V1, V2, + DAG.getConstant(MaskVals, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Ret); +} + // v8i16 shuffles - Prefer shuffles in the following order: // 1. [all] pshuflw, pshufhw, optional move // 2. [ssse3] 1 x pshufb @@ -5836,96 +5978,79 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { unsigned NumElems = VT.getVectorNumElements(); unsigned NumLaneElems = NumElems / 2; - int MinRange[2][2] = { { static_cast<int>(NumElems), - static_cast<int>(NumElems) }, - { static_cast<int>(NumElems), - static_cast<int>(NumElems) } }; - int MaxRange[2][2] = { { -1, -1 }, { -1, -1 } }; + DebugLoc dl = SVOp->getDebugLoc(); + MVT EltVT = VT.getVectorElementType().getSimpleVT(); + EVT NVT = MVT::getVectorVT(EltVT, NumLaneElems); + SDValue Shufs[2]; - // Collect used ranges for each source in each lane + SmallVector<int, 16> Mask; for (unsigned l = 0; l < 2; ++l) { - unsigned LaneStart = l*NumLaneElems; + // Build a shuffle mask for the output, discovering on the fly which + // input vectors to use as shuffle operands (recorded in InputUsed). 
+ // If building a suitable shuffle vector proves too hard, then bail + // out with useBuildVector set. + int InputUsed[2] = { -1, -1 }; // Not yet discovered. + unsigned LaneStart = l * NumLaneElems; for (unsigned i = 0; i != NumLaneElems; ++i) { + // The mask element. This indexes into the input. int Idx = SVOp->getMaskElt(i+LaneStart); - if (Idx < 0) + if (Idx < 0) { + // the mask element does not index into any input vector. + Mask.push_back(-1); continue; - - int Input = 0; - if (Idx >= (int)NumElems) { - Idx -= NumElems; - Input = 1; } - if (Idx > MaxRange[l][Input]) - MaxRange[l][Input] = Idx; - if (Idx < MinRange[l][Input]) - MinRange[l][Input] = Idx; - } - } + // The input vector this mask element indexes into. + int Input = Idx / NumLaneElems; - // Make sure each range is 128-bits - int ExtractIdx[2][2] = { { -1, -1 }, { -1, -1 } }; - for (unsigned l = 0; l < 2; ++l) { - for (unsigned Input = 0; Input < 2; ++Input) { - if (MinRange[l][Input] == (int)NumElems && MaxRange[l][Input] < 0) - continue; + // Turn the index into an offset from the start of the input vector. + Idx -= Input * NumLaneElems; - if (MinRange[l][Input] >= 0 && MaxRange[l][Input] < (int)NumLaneElems) - ExtractIdx[l][Input] = 0; - else if (MinRange[l][Input] >= (int)NumLaneElems && - MaxRange[l][Input] < (int)NumElems) - ExtractIdx[l][Input] = NumLaneElems; - else - return SDValue(); - } - } + // Find or create a shuffle vector operand to hold this input. + unsigned OpNo; + for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) { + if (InputUsed[OpNo] == Input) + // This input vector is already an operand. + break; + if (InputUsed[OpNo] < 0) { + // Create a new operand for this input vector. + InputUsed[OpNo] = Input; + break; + } + } - DebugLoc dl = SVOp->getDebugLoc(); - MVT EltVT = VT.getVectorElementType().getSimpleVT(); - EVT NVT = MVT::getVectorVT(EltVT, NumElems/2); + if (OpNo >= array_lengthof(InputUsed)) { + // More than two input vectors used! Give up. + return SDValue(); + } - SDValue Ops[2][2]; - for (unsigned l = 0; l < 2; ++l) { - for (unsigned Input = 0; Input < 2; ++Input) { - if (ExtractIdx[l][Input] >= 0) - Ops[l][Input] = Extract128BitVector(SVOp->getOperand(Input), - DAG.getConstant(ExtractIdx[l][Input], MVT::i32), - DAG, dl); - else - Ops[l][Input] = DAG.getUNDEF(NVT); + // Add the mask index for the new shuffle vector. + Mask.push_back(Idx + OpNo * NumLaneElems); } - } - // Generate 128-bit shuffles - SmallVector<int, 16> Mask1, Mask2; - for (unsigned i = 0; i != NumLaneElems; ++i) { - int Elt = SVOp->getMaskElt(i); - if (Elt >= (int)NumElems) { - Elt %= NumLaneElems; - Elt += NumLaneElems; - } else if (Elt >= 0) { - Elt %= NumLaneElems; - } - Mask1.push_back(Elt); - } - for (unsigned i = NumLaneElems; i != NumElems; ++i) { - int Elt = SVOp->getMaskElt(i); - if (Elt >= (int)NumElems) { - Elt %= NumLaneElems; - Elt += NumLaneElems; - } else if (Elt >= 0) { - Elt %= NumLaneElems; + if (InputUsed[0] < 0) { + // No input vectors were used! The result is undefined. + Shufs[l] = DAG.getUNDEF(NVT); + } else { + SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2), + DAG.getConstant((InputUsed[0] % 2) * NumLaneElems, MVT::i32), + DAG, dl); + // If only one input was used, use an undefined vector for the other. + SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) : + Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2), + DAG.getConstant((InputUsed[1] % 2) * NumLaneElems, MVT::i32), + DAG, dl); + // At least one input vector was used. Create a new shuffle vector. 
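// (Editorial note, not part of the patch: each 128-bit lane of the result
// is built independently here. For a v8f32 mask like <0,9,2,11,4,13,6,15>,
// lane 0 reads indices {0,9,2,11}, which live in half-vectors 0 and 2 of
// the four 128-bit input halves, so InputUsed becomes {0,2} and the lane
// lowers to one 128-bit shuffle of those two extracts. A lane touching
// more than two distinct halves takes the give-up path above.)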
+ Shufs[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]); } - Mask2.push_back(Elt); - } - SDValue Shuf1 = DAG.getVectorShuffle(NVT, dl, Ops[0][0], Ops[0][1], &Mask1[0]); - SDValue Shuf2 = DAG.getVectorShuffle(NVT, dl, Ops[1][0], Ops[1][1], &Mask2[0]); + Mask.clear(); + } // Concatenate the result back - SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Shuf1, + SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Shufs[0], DAG.getConstant(0, MVT::i32), DAG, dl); - return Insert128BitVector(V, Shuf2, DAG.getConstant(NumElems/2, MVT::i32), + return Insert128BitVector(V, Shufs[1],DAG.getConstant(NumLaneElems, MVT::i32), DAG, dl); } @@ -6203,10 +6328,8 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { getShuffleSHUFImmediate(SVOp), DAG); } -static -SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, - const TargetLowering &TLI, - const X86Subtarget *Subtarget) { +SDValue +X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); @@ -6222,9 +6345,9 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, int Size = VT.getSizeInBits(); // Use vbroadcast whenever the splat comes from a foldable load - SDValue LD = isVectorBroadcast(Op, Subtarget); - if (LD.getNode()) - return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD); + SDValue Broadcast = LowerVectorBroadcast(Op, DAG); + if (Broadcast.getNode()) + return Broadcast; // Handle splats by matching through known shuffle masks if ((Size == 128 && NumElem <= 4) || @@ -6309,7 +6432,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // Normalize the input vectors. Here splats, zeroed vectors, profitable // narrowing and commutation of operands should be handled. The actual code // doesn't include all of those, work in progress... - SDValue NewOp = NormalizeVectorShuffle(Op, DAG, *this, Subtarget); + SDValue NewOp = NormalizeVectorShuffle(Op, DAG); if (NewOp.getNode()) return NewOp; @@ -6524,6 +6647,27 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1, V2, getShuffleVPERM2X128Immediate(SVOp), DAG); + SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(Op, Subtarget, DAG); + if (BlendOp.getNode()) + return BlendOp; + + if (V2IsUndef && HasAVX2 && (VT == MVT::v8i32 || VT == MVT::v8f32)) { + SmallVector<SDValue, 8> permclMask; + for (unsigned i = 0; i != 8; ++i) { + permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MVT::i32)); + } + SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, + &permclMask[0], 8); + // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32 + return DAG.getNode(X86ISD::VPERMV, dl, VT, + DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1); + } + + if (V2IsUndef && HasAVX2 && (VT == MVT::v4i64 || VT == MVT::v4f64)) + return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, + getShuffleCLImmediate(SVOp), DAG); + + //===--------------------------------------------------------------------===// // Since no target specific shuffle was selected for this generic one, // lower it into other known shuffles. 
FIXME: this isn't true yet, but @@ -7182,8 +7326,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) GV = GA->resolveAliasedGlobal(false); - TLSModel::Model model - = getTLSModel(GV, getTargetMachine().getRelocationModel()); + TLSModel::Model model = getTargetMachine().getTLSModel(GV); switch (model) { case TLSModel::GeneralDynamic: @@ -8099,8 +8242,8 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, unsigned BitWidth = Op0.getValueSizeInBits(); unsigned AndBitWidth = And.getValueSizeInBits(); if (BitWidth > AndBitWidth) { - APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones; - DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones); + APInt Zeros, Ones; + DAG.ComputeMaskedBits(Op0, Zeros, Ones); if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) return SDValue(); } @@ -9449,12 +9592,12 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const case Intrinsic::x86_avx2_vperm2i128: return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - case Intrinsic::x86_avx_vpermil_ps: - case Intrinsic::x86_avx_vpermil_pd: - case Intrinsic::x86_avx_vpermil_ps_256: - case Intrinsic::x86_avx_vpermil_pd_256: - return DAG.getNode(X86ISD::VPERMILP, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_avx2_permd: + case Intrinsic::x86_avx2_permps: + // Operands intentionally swapped. Mask is last operand to intrinsic, + // but second operand for node/intruction. + return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(1)); // ptest and testp intrinsics. The intrinsic these come from are designed to // return an integer value, not just an instruction so lower it to the ptest @@ -10963,6 +11106,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::ANDNP: return "X86ISD::ANDNP"; case X86ISD::PSIGN: return "X86ISD::PSIGN"; case X86ISD::BLENDV: return "X86ISD::BLENDV"; + case X86ISD::BLENDPW: return "X86ISD::BLENDPW"; + case X86ISD::BLENDPS: return "X86ISD::BLENDPS"; + case X86ISD::BLENDPD: return "X86ISD::BLENDPD"; case X86ISD::HADD: return "X86ISD::HADD"; case X86ISD::HSUB: return "X86ISD::HSUB"; case X86ISD::FHADD: return "X86ISD::FHADD"; @@ -11035,6 +11181,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; case X86ISD::VPERMILP: return "X86ISD::VPERMILP"; case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; + case X86ISD::VPERMV: return "X86ISD::VPERMV"; + case X86ISD::VPERMI: return "X86ISD::VPERMI"; case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; @@ -11192,14 +11340,15 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, unsigned notOpc, unsigned EAXreg, const TargetRegisterClass *RC, - bool invSrc) const { + bool Invert) const { // For the atomic bitwise operator, we generate // thisMBB: // newMBB: // ld t1 = [bitinstr.addr] // op t2 = t1, [bitinstr.val] + // not t3 = t2 (if Invert) // mov EAX = t1 - // lcs dest = [bitinstr.addr], t2 [EAX is implicit] + // lcs dest = [bitinstr.addr], t3 [EAX is implicit] // bz newMBB // fallthrough -->nextMBB const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); @@ -11247,13 +11396,6 @@ 
X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, for (int i=0; i <= lastAddrIndx; ++i) (*MIB).addOperand(*argOpers[i]); - unsigned tt = F->getRegInfo().createVirtualRegister(RC); - if (invSrc) { - MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1); - } - else - tt = t1; - unsigned t2 = F->getRegInfo().createVirtualRegister(RC); assert((argOpers[valArgIndx]->isReg() || argOpers[valArgIndx]->isImm()) && @@ -11262,16 +11404,23 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2); else MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2); - MIB.addReg(tt); + MIB.addReg(t1); (*MIB).addOperand(*argOpers[valArgIndx]); + unsigned t3 = F->getRegInfo().createVirtualRegister(RC); + if (Invert) { + MIB = BuildMI(newMBB, dl, TII->get(notOpc), t3).addReg(t2); + } + else + t3 = t2; + MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg); MIB.addReg(t1); MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc)); for (int i=0; i <= lastAddrIndx; ++i) (*MIB).addOperand(*argOpers[i]); - MIB.addReg(t2); + MIB.addReg(t3); assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); (*MIB).setMemRefs(bInstr->memoperands_begin(), bInstr->memoperands_end()); @@ -11294,7 +11443,7 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, unsigned regOpcH, unsigned immOpcL, unsigned immOpcH, - bool invSrc) const { + bool Invert) const { // For the atomic bitwise operator, we generate // thisMBB (instructions are in pairs, except cmpxchg8b) // ld t1,t2 = [bitinstr.addr] @@ -11302,6 +11451,7 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4) // op t5, t6 <- out1, out2, [bitinstr.val] // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val]) + // neg t7, t8 < t5, t6 (if Invert) // mov ECX, EBX <- t5, t6 // mov EAX, EDX <- t1, t2 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit] @@ -11385,16 +11535,9 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); // The subsequent operations should be using the destination registers of - //the PHI instructions. - if (invSrc) { - t1 = F->getRegInfo().createVirtualRegister(RC); - t2 = F->getRegInfo().createVirtualRegister(RC); - MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg()); - MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg()); - } else { - t1 = dest1Oper.getReg(); - t2 = dest2Oper.getReg(); - } + // the PHI instructions. 
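// (Editorial note, not part of the patch: with the Invert rework, the
// optional NOT is applied to the result of the operation (t5/t6 feeding
// t7/t8 below) instead of to the loaded input, so atomic NAND computes
// ~(old & val), as the operation requires, rather than (~old) & val.)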
+ t1 = dest1Oper.getReg(); + t2 = dest2Oper.getReg(); int valArgIndx = lastAddrIndx + 1; assert((argOpers[valArgIndx]->isReg() || @@ -11421,15 +11564,26 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, MIB.addReg(t2); (*MIB).addOperand(*argOpers[valArgIndx + 1]); + unsigned t7, t8; + if (Invert) { + t7 = F->getRegInfo().createVirtualRegister(RC); + t8 = F->getRegInfo().createVirtualRegister(RC); + MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t7).addReg(t5); + MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t8).addReg(t6); + } else { + t7 = t5; + t8 = t6; + } + MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); MIB.addReg(t1); MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX); MIB.addReg(t2); MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX); - MIB.addReg(t5); + MIB.addReg(t7); MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX); - MIB.addReg(t6); + MIB.addReg(t8); MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); for (int i=0; i <= lastAddrIndx; ++i) @@ -12620,11 +12774,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, //===----------------------------------------------------------------------===// void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, - const APInt &Mask, APInt &KnownZero, APInt &KnownOne, const SelectionDAG &DAG, unsigned Depth) const { + unsigned BitWidth = KnownZero.getBitWidth(); unsigned Opc = Op.getOpcode(); assert((Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN || @@ -12633,7 +12787,7 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, "Should use MaskedValueIsZero if you don't know whether Op" " is a target node!"); - KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. + KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything. switch (Opc) { default: break; case X86ISD::ADD: @@ -12652,8 +12806,7 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, break; // Fallthrough case X86ISD::SETCC: - KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), - Mask.getBitWidth() - 1); + KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); break; case ISD::INTRINSIC_WO_CHAIN: { unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); @@ -12678,8 +12831,7 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break; case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break; } - KnownZero = APInt::getHighBitsSet(Mask.getBitWidth(), - Mask.getBitWidth() - NumLoBits); + KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits); break; } } @@ -14000,13 +14152,14 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them. - if (Mask.getOpcode() != ISD::BITCAST || - X.getOpcode() != ISD::BITCAST || - Y.getOpcode() != ISD::BITCAST) - return SDValue(); - // Look through mask bitcast. - Mask = Mask.getOperand(0); + if (Mask.getOpcode() == ISD::BITCAST) + Mask = Mask.getOperand(0); + if (X.getOpcode() == ISD::BITCAST) + X = X.getOperand(0); + if (Y.getOpcode() == ISD::BITCAST) + Y = Y.getOperand(0); + EVT MaskVT = Mask.getValueType(); // Validate that the Mask operand is a vector sra node. @@ -14027,8 +14180,6 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, // Now we know we at least have a plendvb with the mask val. See if // we can form a psignb/w/d. 
// psign = x.type == y.type == mask.type && y = sub(0, x); - X = X.getOperand(0); - Y = Y.getOperand(0); if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && X.getValueType() == MaskVT && Y.getValueType() == MaskVT) { diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 0327b1f..09116e8 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -175,9 +175,14 @@ namespace llvm { /// PSIGN - Copy integer sign. PSIGN, - /// BLEND family of opcodes + /// BLENDV - Blend where the selector is an XMM. BLENDV, + /// BLENDxx - Blend where the selector is an immediate. + BLENDPW, + BLENDPS, + BLENDPD, + /// HADD - Integer horizontal add. HADD, @@ -280,6 +285,8 @@ namespace llvm { UNPCKL, UNPCKH, VPERMILP, + VPERMV, + VPERMI, VPERM2X128, VBROADCAST, @@ -504,7 +511,6 @@ namespace llvm { /// in Mask are known to be either zero or one and return them in the /// KnownZero/KnownOne bitsets. virtual void computeMaskedBitsForTargetNode(const SDValue Op, - const APInt &Mask, APInt &KnownZero, APInt &KnownOne, const SelectionDAG &DAG, @@ -781,6 +787,8 @@ namespace llvm { // Utility functions to help LowerVECTOR_SHUFFLE SDValue LowerVECTOR_SHUFFLEv8i16(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const; + SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const; virtual SDValue LowerFormalArguments(SDValue Chain, @@ -804,7 +812,7 @@ namespace llvm { const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl, SelectionDAG &DAG) const; - virtual bool isUsedByReturnOnly(SDNode *N) const; + virtual bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const; virtual bool mayBeEmittedAsTailCall(CallInst *CI) const; @@ -849,7 +857,7 @@ namespace llvm { unsigned notOpc, unsigned EAXreg, const TargetRegisterClass *RC, - bool invSrc = false) const; + bool Invert = false) const; MachineBasicBlock *EmitAtomicBit6432WithCustomInserter( MachineInstr *BInstr, @@ -858,7 +866,7 @@ namespace llvm { unsigned regOpcH, unsigned immOpcL, unsigned immOpcH, - bool invSrc = false) const; + bool Invert = false) const; /// Utility function to emit atomic min and max. It takes the min/max /// instruction to expand, the associated basic block, and the associated diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index 7fa7499..0eee083 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -53,7 +53,7 @@ def MUL8r : I<0xF6, MRM4r, (outs), (ins GR8:$src), "mul{b}\t$src", // This probably ought to be moved to a def : Pat<> if the // syntax can be accepted. 
[(set AL, (mul AL, GR8:$src)), - (implicit EFLAGS)]>; // AL,AH = AL*GR8 + (implicit EFLAGS)], IIC_MUL8>; // AL,AH = AL*GR8 let Defs = [AX,DX,EFLAGS], Uses = [AX], neverHasSideEffects = 1 in def MUL16r : I<0xF7, MRM4r, (outs), (ins GR16:$src), @@ -97,31 +97,32 @@ def MUL64m : RI<0xF7, MRM4m, (outs), (ins i64mem:$src), let neverHasSideEffects = 1 in { let Defs = [AL,EFLAGS,AX], Uses = [AL] in -def IMUL8r : I<0xF6, MRM5r, (outs), (ins GR8:$src), "imul{b}\t$src", []>; - // AL,AH = AL*GR8 +def IMUL8r : I<0xF6, MRM5r, (outs), (ins GR8:$src), "imul{b}\t$src", [], + IIC_IMUL8>; // AL,AH = AL*GR8 let Defs = [AX,DX,EFLAGS], Uses = [AX] in -def IMUL16r : I<0xF7, MRM5r, (outs), (ins GR16:$src), "imul{w}\t$src", []>, - OpSize; // AX,DX = AX*GR16 +def IMUL16r : I<0xF7, MRM5r, (outs), (ins GR16:$src), "imul{w}\t$src", [], + IIC_IMUL16_RR>, OpSize; // AX,DX = AX*GR16 let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in -def IMUL32r : I<0xF7, MRM5r, (outs), (ins GR32:$src), "imul{l}\t$src", []>; - // EAX,EDX = EAX*GR32 +def IMUL32r : I<0xF7, MRM5r, (outs), (ins GR32:$src), "imul{l}\t$src", [], + IIC_IMUL32_RR>; // EAX,EDX = EAX*GR32 let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in -def IMUL64r : RI<0xF7, MRM5r, (outs), (ins GR64:$src), "imul{q}\t$src", []>; - // RAX,RDX = RAX*GR64 +def IMUL64r : RI<0xF7, MRM5r, (outs), (ins GR64:$src), "imul{q}\t$src", [], + IIC_IMUL64_RR>; // RAX,RDX = RAX*GR64 let mayLoad = 1 in { let Defs = [AL,EFLAGS,AX], Uses = [AL] in def IMUL8m : I<0xF6, MRM5m, (outs), (ins i8mem :$src), - "imul{b}\t$src", []>; // AL,AH = AL*[mem8] + "imul{b}\t$src", [], IIC_IMUL8>; // AL,AH = AL*[mem8] let Defs = [AX,DX,EFLAGS], Uses = [AX] in def IMUL16m : I<0xF7, MRM5m, (outs), (ins i16mem:$src), - "imul{w}\t$src", []>, OpSize; // AX,DX = AX*[mem16] + "imul{w}\t$src", [], IIC_IMUL16_MEM>, OpSize; + // AX,DX = AX*[mem16] let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in def IMUL32m : I<0xF7, MRM5m, (outs), (ins i32mem:$src), - "imul{l}\t$src", []>; // EAX,EDX = EAX*[mem32] + "imul{l}\t$src", [], IIC_IMUL32_MEM>; // EAX,EDX = EAX*[mem32] let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in def IMUL64m : RI<0xF7, MRM5m, (outs), (ins i64mem:$src), - "imul{q}\t$src", []>; // RAX,RDX = RAX*[mem64] + "imul{q}\t$src", [], IIC_IMUL64>; // RAX,RDX = RAX*[mem64] } } // neverHasSideEffects @@ -639,10 +640,11 @@ class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins, // BinOpRR - Instructions like "add reg, reg, reg". class BinOpRR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - dag outlist, list<dag> pattern, Format f = MRMDestReg> + dag outlist, list<dag> pattern, InstrItinClass itin, + Format f = MRMDestReg> : ITy<opcode, f, typeinfo, outlist, (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2), - mnemonic, "{$src2, $src1|$src1, $src2}", pattern>; + mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>; // BinOpRR_R - Instructions like "add reg, reg, reg", where the pattern has // just a regclass (no eflags) as a result. @@ -650,7 +652,8 @@ class BinOpRR_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, SDNode opnode> : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), [(set typeinfo.RegClass:$dst, - (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))]>; + (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))], + IIC_BIN_NONMEM>; // BinOpRR_F - Instructions like "cmp reg, Reg", where the pattern has // just a EFLAGS as a result. 
@@ -659,7 +662,7 @@ class BinOpRR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
   : BinOpRR<opcode, mnemonic, typeinfo, (outs),
             [(set EFLAGS,
                   (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))],
-            f>;
+            IIC_BIN_NONMEM, f>;
 // BinOpRR_RF - Instructions like "add reg, reg, reg", where the pattern has
 // both a regclass and EFLAGS as a result.
@@ -667,7 +670,8 @@ class BinOpRR_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
                  SDNode opnode>
   : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
             [(set typeinfo.RegClass:$dst, EFLAGS,
-                  (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))]>;
+                  (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))],
+            IIC_BIN_NONMEM>;
 // BinOpRR_RFF - Instructions like "adc reg, reg, reg", where the pattern has
 // both a regclass and EFLAGS as a result, and has EFLAGS as input.
@@ -676,14 +680,14 @@ class BinOpRR_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
   : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
             [(set typeinfo.RegClass:$dst, EFLAGS,
                   (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2,
-                          EFLAGS))]>;
+                          EFLAGS))], IIC_BIN_NONMEM>;
 // BinOpRR_Rev - Instructions like "add reg, reg, reg" (reversed encoding).
 class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
   : ITy<opcode, MRMSrcReg, typeinfo,
         (outs typeinfo.RegClass:$dst),
         (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
-        mnemonic, "{$src2, $dst|$dst, $src2}", []> {
+        mnemonic, "{$src2, $dst|$dst, $src2}", [], IIC_BIN_NONMEM> {
   // The disassembler should know about this, but not the asmparser.
   let isCodeGenOnly = 1;
 }
@@ -692,7 +696,7 @@ class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
   : ITy<opcode, MRMSrcReg, typeinfo, (outs),
         (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
-        mnemonic, "{$src2, $src1|$src1, $src2}", []> {
+        mnemonic, "{$src2, $src1|$src1, $src2}", [], IIC_BIN_NONMEM> {
   // The disassembler should know about this, but not the asmparser.
   let isCodeGenOnly = 1;
 }
@@ -702,7 +706,7 @@ class BinOpRM<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
               dag outlist, list<dag> pattern>
   : ITy<opcode, MRMSrcMem, typeinfo, outlist,
         (ins typeinfo.RegClass:$src1, typeinfo.MemOperand:$src2),
-        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_MEM>;
+        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM>;
 // BinOpRM_R - Instructions like "add reg, reg, [mem]".
 class BinOpRM_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
@@ -738,7 +742,7 @@ class BinOpRI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
               Format f, dag outlist, list<dag> pattern>
   : ITy<opcode, f, typeinfo, outlist,
         (ins typeinfo.RegClass:$src1, typeinfo.ImmOperand:$src2),
-        mnemonic, "{$src2, $src1|$src1, $src2}", pattern> {
+        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM> {
   let ImmT = typeinfo.ImmEncoding;
 }
@@ -762,7 +766,6 @@ class BinOpRI_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
   : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
             [(set typeinfo.RegClass:$dst, EFLAGS,
                   (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>;
-
 // BinOpRI_RFF - Instructions like "adc reg, reg, imm".
 class BinOpRI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
                   SDNode opnode, Format f>
@@ -776,7 +779,7 @@ class BinOpRI8<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
               Format f, dag outlist, list<dag> pattern>
   : ITy<opcode, f, typeinfo, outlist,
         (ins typeinfo.RegClass:$src1, typeinfo.Imm8Operand:$src2),
-        mnemonic, "{$src2, $src1|$src1, $src2}", pattern> {
+        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM> {
   let ImmT = Imm8; // Always 8-bit immediate.
 }
@@ -853,7 +856,6 @@ class BinOpMI_RMW<string mnemonic, X86TypeInfo typeinfo,
             [(store (opnode (typeinfo.VT (load addr:$dst)),
                             typeinfo.ImmOperator:$src), addr:$dst),
              (implicit EFLAGS)]>;
-
 // BinOpMI_RMW_FF - Instructions like "adc [mem], imm".
 class BinOpMI_RMW_FF<string mnemonic, X86TypeInfo typeinfo,
                      SDNode opnode, Format f>
@@ -1219,12 +1221,12 @@ let neverHasSideEffects = 1 in {
   let isCommutable = 1 in
   def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src),
              !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
-             []>, T8XD, VEX_4V;
+             [], IIC_MUL8>, T8XD, VEX_4V;
   let mayLoad = 1 in
   def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src),
              !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
-             []>, T8XD, VEX_4V;
+             [], IIC_MUL8>, T8XD, VEX_4V;
   }
 }
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 42a5014..6f9e849 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -301,34 +301,67 @@ def : Pat<(sub GR64:$op, (i64 (X86setcc_c X86_COND_B, EFLAGS))),
 // String Pseudo Instructions
 //
 let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI], isCodeGenOnly = 1 in {
-def REP_MOVSB : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
-                  [(X86rep_movs i8)], IIC_REP_MOVS>, REP;
-def REP_MOVSW : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
-                  [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize;
-def REP_MOVSD : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
-                  [(X86rep_movs i32)], IIC_REP_MOVS>, REP;
+def REP_MOVSB_32 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
+                     [(X86rep_movs i8)], IIC_REP_MOVS>, REP,
+                    Requires<[In32BitMode]>;
+def REP_MOVSW_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
+                     [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize,
+                    Requires<[In32BitMode]>;
+def REP_MOVSD_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
+                     [(X86rep_movs i32)], IIC_REP_MOVS>, REP,
+                    Requires<[In32BitMode]>;
 }
-let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI], isCodeGenOnly = 1 in
-def REP_MOVSQ : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq|rep movsq}",
-                   [(X86rep_movs i64)], IIC_REP_MOVS>, REP;
-
+let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI], isCodeGenOnly = 1 in {
+def REP_MOVSB_64 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
+                     [(X86rep_movs i8)], IIC_REP_MOVS>, REP,
+                    Requires<[In64BitMode]>;
+def REP_MOVSW_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
+                     [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize,
+                    Requires<[In64BitMode]>;
+def REP_MOVSD_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
+                     [(X86rep_movs i32)], IIC_REP_MOVS>, REP,
+                    Requires<[In64BitMode]>;
+def REP_MOVSQ_64 : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq|rep movsq}",
+                      [(X86rep_movs i64)], IIC_REP_MOVS>, REP,
+                     Requires<[In64BitMode]>;
+}
 // FIXME: Should use "(X86rep_stos AL)" as the pattern.
-let Defs = [ECX,EDI], Uses = [AL,ECX,EDI], isCodeGenOnly = 1 in
-def REP_STOSB : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}",
-                  [(X86rep_stos i8)], IIC_REP_STOS>, REP;
-let Defs = [ECX,EDI], Uses = [AX,ECX,EDI], isCodeGenOnly = 1 in
-def REP_STOSW : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}",
-                  [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize;
-let Defs = [ECX,EDI], Uses = [EAX,ECX,EDI], isCodeGenOnly = 1 in
-def REP_STOSD : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
-                  [(X86rep_stos i32)], IIC_REP_STOS>, REP;
-
-let Defs = [RCX,RDI], Uses = [RAX,RCX,RDI], isCodeGenOnly = 1 in
-def REP_STOSQ : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}",
-                   [(X86rep_stos i64)], IIC_REP_STOS>, REP;
+let Defs = [ECX,EDI], isCodeGenOnly = 1 in {
+  let Uses = [AL,ECX,EDI] in
+  def REP_STOSB_32 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}",
+                       [(X86rep_stos i8)], IIC_REP_STOS>, REP,
+                      Requires<[In32BitMode]>;
+  let Uses = [AX,ECX,EDI] in
+  def REP_STOSW_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}",
+                       [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize,
+                      Requires<[In32BitMode]>;
+  let Uses = [EAX,ECX,EDI] in
+  def REP_STOSD_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
+                       [(X86rep_stos i32)], IIC_REP_STOS>, REP,
+                      Requires<[In32BitMode]>;
+}
+let Defs = [RCX,RDI], isCodeGenOnly = 1 in {
+  let Uses = [AL,RCX,RDI] in
+  def REP_STOSB_64 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}",
+                       [(X86rep_stos i8)], IIC_REP_STOS>, REP,
+                      Requires<[In64BitMode]>;
+  let Uses = [AX,RCX,RDI] in
+  def REP_STOSW_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}",
+                       [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize,
+                      Requires<[In64BitMode]>;
+  let Uses = [RAX,RCX,RDI] in
+  def REP_STOSD_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
+                       [(X86rep_stos i32)], IIC_REP_STOS>, REP,
+                      Requires<[In64BitMode]>;
+
+  let Uses = [RAX,RCX,RDI] in
+  def REP_STOSQ_64 : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}",
+                        [(X86rep_stos i64)], IIC_REP_STOS>, REP,
+                       Requires<[In64BitMode]>;
+}
 //===----------------------------------------------------------------------===//
 // Thread Local Storage Instructions
@@ -1134,12 +1167,10 @@ def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
   if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))
     return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue());
-  unsigned BitWidth = N->getValueType(0).getScalarType().getSizeInBits();
-  APInt Mask = APInt::getAllOnesValue(BitWidth);
   APInt KnownZero0, KnownOne0;
-  CurDAG->ComputeMaskedBits(N->getOperand(0), Mask, KnownZero0, KnownOne0, 0);
+  CurDAG->ComputeMaskedBits(N->getOperand(0), KnownZero0, KnownOne0, 0);
   APInt KnownZero1, KnownOne1;
-  CurDAG->ComputeMaskedBits(N->getOperand(1), Mask, KnownZero1, KnownOne1, 0);
+  CurDAG->ComputeMaskedBits(N->getOperand(1), KnownZero1, KnownOne1, 0);
   return (~KnownZero0 & ~KnownZero1) == 0;
 }]>;
diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td
index ba86098..bf11fde 100644
--- a/lib/Target/X86/X86InstrControl.td
+++ b/lib/Target/X86/X86InstrControl.td
@@ -21,20 +21,25 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1,
   def RET    : I <0xC3, RawFrm, (outs), (ins variable_ops),
                   "ret",
                   [(X86retflag 0)], IIC_RET>;
+  def RETW   : I <0xC3, RawFrm, (outs), (ins variable_ops),
+                  "ret{w}",
+                  [], IIC_RET>, OpSize;
   def RETI   : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
                     "ret\t$amt",
                     [(X86retflag timm:$amt)], IIC_RET_IMM>;
   def RETIW  : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
-                    "retw\t$amt",
+                    "ret{w}\t$amt",
                     [], IIC_RET_IMM>, OpSize;
   def LRETL  : I <0xCB, RawFrm, (outs), (ins),
-                  "lretl", [], IIC_RET>;
+                  "{l}ret{l|f}", [], IIC_RET>;
+  def LRETW  : I <0xCB, RawFrm, (outs), (ins),
+                  "{l}ret{w|f}", [], IIC_RET>, OpSize;
   def LRETQ  : RI <0xCB, RawFrm, (outs), (ins),
-                   "lretq", [], IIC_RET>;
+                   "{l}ret{q|f}", [], IIC_RET>;
   def LRETI  : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
-                    "lret\t$amt", [], IIC_RET>;
+                    "{l}ret{l|f}\t$amt", [], IIC_RET>;
   def LRETIW : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
-                    "lretw\t$amt", [], IIC_RET>, OpSize;
+                    "{l}ret{w|f}\t$amt", [], IIC_RET>, OpSize;
 }
 // Unconditional branches.
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index ae3ed1b..35801e4 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -126,6 +126,8 @@ def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                                 SDTCisSameAs<0,2>, SDTCisInt<3>]>;
 def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
+def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+SDTCisSameAs<1,2>, SDTCisVT<3, i32>]>;
 def X86PAlign : SDNode<"X86ISD::PALIGN", SDTShuff3OpI>;
@@ -153,11 +155,17 @@ def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>;
 def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>;
 def X86VPermilp  : SDNode<"X86ISD::VPERMILP", SDTShuff2OpI>;
+def X86VPermv    : SDNode<"X86ISD::VPERMV", SDTShuff2Op>;
+def X86VPermi    : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>;
 def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
 def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
+def X86Blendpw : SDNode<"X86ISD::BLENDPW", SDTBlend>;
+def X86Blendps : SDNode<"X86ISD::BLENDPS", SDTBlend>;
+def X86Blendpd : SDNode<"X86ISD::BLENDPD", SDTBlend>;
+
 //===----------------------------------------------------------------------===//
 // SSE Complex Patterns
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 307c96b..b12c1db 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -1049,9 +1049,9 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     { X86::VPCMPGTWYrr,       X86::VPCMPGTWYrm,        TB_ALIGN_32 },
     { X86::VPERM2I128rr,      X86::VPERM2I128rm,       TB_ALIGN_32 },
     { X86::VPERMDYrr,         X86::VPERMDYrm,          TB_ALIGN_32 },
-    { X86::VPERMPDYrr,        X86::VPERMPDYrm,         TB_ALIGN_32 },
+    { X86::VPERMPDYri,        X86::VPERMPDYmi,         TB_ALIGN_32 },
     { X86::VPERMPSYrr,        X86::VPERMPSYrm,         TB_ALIGN_32 },
-    { X86::VPERMQYrr,         X86::VPERMQYrm,          TB_ALIGN_32 },
+    { X86::VPERMQYri,         X86::VPERMQYmi,          TB_ALIGN_32 },
     { X86::VPHADDDYrr,        X86::VPHADDDYrm,         TB_ALIGN_32 },
     { X86::VPHADDSWrr256,     X86::VPHADDSWrm256,      TB_ALIGN_32 },
     { X86::VPHADDWYrr,        X86::VPHADDWYrm,         TB_ALIGN_32 },
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index dd7cf50..6a25312 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -374,6 +374,11 @@ def SSECC : Operand<i8> {
   let OperandType = "OPERAND_IMMEDIATE";
 }
+def AVXCC : Operand<i8> {
+  let PrintMethod = "printSSECC";
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
 class ImmSExtAsmOperandClass : AsmOperandClass {
   let SuperClasses = [ImmAsmOperand];
   let RenderMethod = "addImmOperands";
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index df42627..65e3c1e 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -2162,15 +2162,15 @@ def : Pat<(v4f64 (fextend (loadv4f32 addr:$src))),
 // sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
 multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
-                            SDNode OpNode, ValueType VT, PatFrag ld_frag,
-                            string asm, string asm_alt,
+                            Operand CC, SDNode OpNode, ValueType VT,
+                            PatFrag ld_frag, string asm, string asm_alt,
                             OpndItins itins> {
   def rr : SIi8<0xC2, MRMSrcReg,
-                (outs RC:$dst), (ins RC:$src1, RC:$src2, SSECC:$cc), asm,
+                (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
                 [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))],
                 itins.rr>;
   def rm : SIi8<0xC2, MRMSrcMem,
-                (outs RC:$dst), (ins RC:$src1, x86memop:$src2, SSECC:$cc), asm,
+                (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
                 [(set RC:$dst, (OpNode (VT RC:$src1),
                                        (ld_frag addr:$src2), imm:$cc))],
                 itins.rm>;
@@ -2187,57 +2187,57 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
   }
 }
-defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmpss, f32, loadf32,
+defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmpss, f32, loadf32,
                  "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                  SSE_ALU_F32S>,
                 XS, VEX_4V, VEX_LIG;
-defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmpsd, f64, loadf64,
+defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmpsd, f64, loadf64,
                  "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                  SSE_ALU_F32S>, // same latency as 32 bit compare
                 XD, VEX_4V, VEX_LIG;
 let Constraints = "$src1 = $dst" in {
-  defm CMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmpss, f32, loadf32,
+  defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmpss, f32, loadf32,
                   "cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
                   "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S>,
                   XS;
-  defm CMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmpsd, f64, loadf64,
+  defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmpsd, f64, loadf64,
                   "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
                   "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                   SSE_ALU_F32S>, // same latency as 32 bit compare
                   XD;
 }
-multiclass sse12_cmp_scalar_int<RegisterClass RC, X86MemOperand x86memop,
+multiclass sse12_cmp_scalar_int<X86MemOperand x86memop, Operand CC,
                          Intrinsic Int, string asm, OpndItins itins> {
   def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
-                      (ins VR128:$src1, VR128:$src, SSECC:$cc), asm,
+                      (ins VR128:$src1, VR128:$src, CC:$cc), asm,
                       [(set VR128:$dst, (Int VR128:$src1,
                                                VR128:$src, imm:$cc))],
                       itins.rr>;
   def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
-                      (ins VR128:$src1, x86memop:$src, SSECC:$cc), asm,
+                      (ins VR128:$src1, x86memop:$src, CC:$cc), asm,
                       [(set VR128:$dst, (Int VR128:$src1,
                                                (load addr:$src), imm:$cc))],
                       itins.rm>;
 }
 // Aliases to match intrinsics which expect XMM operand(s).
-defm Int_VCMPSS  : sse12_cmp_scalar_int<VR128, f32mem, int_x86_sse_cmp_ss,
+defm Int_VCMPSS  : sse12_cmp_scalar_int<f32mem, AVXCC, int_x86_sse_cmp_ss,
                        "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                        SSE_ALU_F32S>,
                    XS, VEX_4V;
-defm Int_VCMPSD  : sse12_cmp_scalar_int<VR128, f64mem, int_x86_sse2_cmp_sd,
+defm Int_VCMPSD  : sse12_cmp_scalar_int<f64mem, AVXCC, int_x86_sse2_cmp_sd,
                        "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                        SSE_ALU_F32S>, // same latency as f32
                    XD, VEX_4V;
 let Constraints = "$src1 = $dst" in {
-  defm Int_CMPSS  : sse12_cmp_scalar_int<VR128, f32mem, int_x86_sse_cmp_ss,
+  defm Int_CMPSS  : sse12_cmp_scalar_int<f32mem, SSECC, int_x86_sse_cmp_ss,
                        "cmp${cc}ss\t{$src, $dst|$dst, $src}",
                        SSE_ALU_F32S>, XS;
-  defm Int_CMPSD  : sse12_cmp_scalar_int<VR128, f64mem, int_x86_sse2_cmp_sd,
+  defm Int_CMPSD  : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd,
                        "cmp${cc}sd\t{$src, $dst|$dst, $src}",
                        SSE_ALU_F32S>, // same latency as f32
                        XD;
@@ -2308,50 +2308,50 @@ let Defs = [EFLAGS] in {
 // sse12_cmp_packed - sse 1 & 2 compare packed instructions
 multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
-                            Intrinsic Int, string asm, string asm_alt,
-                            Domain d> {
-  let isAsmParserOnly = 1 in {
-    def rri : PIi8<0xC2, MRMSrcReg,
-               (outs RC:$dst), (ins RC:$src1, RC:$src2, SSECC:$cc), asm,
-               [(set RC:$dst, (Int RC:$src1, RC:$src2, imm:$cc))],
-               IIC_SSE_CMPP_RR, d>;
-    def rmi : PIi8<0xC2, MRMSrcMem,
-               (outs RC:$dst), (ins RC:$src1, x86memop:$src2, SSECC:$cc), asm,
-               [(set RC:$dst, (Int RC:$src1, (memop addr:$src2), imm:$cc))],
-               IIC_SSE_CMPP_RM, d>;
-  }
+                            Operand CC, Intrinsic Int, string asm,
+                            string asm_alt, Domain d> {
+  def rri : PIi8<0xC2, MRMSrcReg,
+             (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
+             [(set RC:$dst, (Int RC:$src1, RC:$src2, imm:$cc))],
+             IIC_SSE_CMPP_RR, d>;
+  def rmi : PIi8<0xC2, MRMSrcMem,
+             (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+             [(set RC:$dst, (Int RC:$src1, (memop addr:$src2), imm:$cc))],
+             IIC_SSE_CMPP_RM, d>;
   // Accept explicit immediate argument form instead of comparison code.
-  def rri_alt : PIi8<0xC2, MRMSrcReg,
-             (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
-             asm_alt, [], IIC_SSE_CMPP_RR, d>;
-  def rmi_alt : PIi8<0xC2, MRMSrcMem,
-             (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
-             asm_alt, [], IIC_SSE_CMPP_RM, d>;
+  let neverHasSideEffects = 1 in {
+    def rri_alt : PIi8<0xC2, MRMSrcReg,
+               (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
+               asm_alt, [], IIC_SSE_CMPP_RR, d>;
+    def rmi_alt : PIi8<0xC2, MRMSrcMem,
+               (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
+               asm_alt, [], IIC_SSE_CMPP_RM, d>;
+  }
 }
-defm VCMPPS : sse12_cmp_packed<VR128, f128mem, int_x86_sse_cmp_ps,
+defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse_cmp_ps,
                "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                SSEPackedSingle>, TB, VEX_4V;
-defm VCMPPD : sse12_cmp_packed<VR128, f128mem, int_x86_sse2_cmp_pd,
+defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd,
                "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                SSEPackedDouble>, TB, OpSize, VEX_4V;
-defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_ps_256,
+defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256,
                "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                SSEPackedSingle>, TB, VEX_4V;
-defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_pd_256,
+defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256,
                "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                SSEPackedDouble>, TB, OpSize, VEX_4V;
 let Constraints = "$src1 = $dst" in {
-  defm CMPPS : sse12_cmp_packed<VR128, f128mem, int_x86_sse_cmp_ps,
+  defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps,
                "cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
                "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                SSEPackedSingle>, TB;
-  defm CMPPD : sse12_cmp_packed<VR128, f128mem, int_x86_sse2_cmp_pd,
+  defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd,
                "cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
                "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                SSEPackedDouble>, TB, OpSize;
@@ -6331,11 +6331,11 @@ def : Pat<(f64 (ftrunc FR64:$src)),
 let Defs = [EFLAGS], Predicates = [HasAVX] in {
 def VPTESTrr  : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                 "vptest\t{$src2, $src1|$src1, $src2}",
-                [(set EFLAGS, (X86ptest VR128:$src1, (v4f32 VR128:$src2)))]>,
+                [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
                 OpSize, VEX;
 def VPTESTrm  : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
                 "vptest\t{$src2, $src1|$src1, $src2}",
-                [(set EFLAGS,(X86ptest VR128:$src1, (memopv4f32 addr:$src2)))]>,
+                [(set EFLAGS,(X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
                 OpSize, VEX;
 def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
@@ -6351,11 +6351,11 @@ def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
 let Defs = [EFLAGS] in {
 def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
               "ptest\t{$src2, $src1|$src1, $src2}",
-              [(set EFLAGS, (X86ptest VR128:$src1, (v4f32 VR128:$src2)))]>,
+              [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
               OpSize;
 def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
               "ptest\t{$src2, $src1|$src1, $src2}",
-              [(set EFLAGS, (X86ptest VR128:$src1, (memopv4f32 addr:$src2)))]>,
+              [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
               OpSize;
 }
@@ -6735,12 +6735,32 @@ let Predicates = [HasAVX] in {
   def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1),
                             (v4f64 VR256:$src2))),
             (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+
+  def : Pat<(v8f32 (X86Blendps (v8f32 VR256:$src1), (v8f32 VR256:$src2),
+                               (imm:$mask))),
+            (VBLENDPSYrri VR256:$src2, VR256:$src1, imm:$mask)>;
+  def : Pat<(v4f64 (X86Blendpd (v4f64 VR256:$src1), (v4f64 VR256:$src2),
+                               (imm:$mask))),
+            (VBLENDPDYrri VR256:$src2, VR256:$src1, imm:$mask)>;
+
+  def : Pat<(v8i16 (X86Blendpw (v8i16 VR128:$src1), (v8i16 VR128:$src2),
+                               (imm:$mask))),
+            (VPBLENDWrri VR128:$src2, VR128:$src1, imm:$mask)>;
+  def : Pat<(v4f32 (X86Blendps (v4f32 VR128:$src1), (v4f32 VR128:$src2),
+                               (imm:$mask))),
+            (VBLENDPSrri VR128:$src2, VR128:$src1, imm:$mask)>;
+  def : Pat<(v2f64 (X86Blendpd (v2f64 VR128:$src1), (v2f64 VR128:$src2),
+                               (imm:$mask))),
+            (VBLENDPDrri VR128:$src2, VR128:$src1, imm:$mask)>;
 }
 let Predicates = [HasAVX2] in {
   def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
                             (v32i8 VR256:$src2))),
             (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+  def : Pat<(v16i16 (X86Blendpw (v16i16 VR256:$src1), (v16i16 VR256:$src2),
+                                (imm:$mask))),
+            (VPBLENDWYrri VR256:$src2, VR256:$src1, imm:$mask)>;
 }
 /// SS41I_ternary_int - SSE 4.1 ternary operator
@@ -6789,6 +6809,17 @@ let Predicates = [HasSSE41] in {
   def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
                             (v2f64 VR128:$src2))),
             (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
+
+  def : Pat<(v8i16 (X86Blendpw (v8i16 VR128:$src1), (v8i16 VR128:$src2),
+                               (imm:$mask))),
+            (PBLENDWrri VR128:$src2, VR128:$src1, imm:$mask)>;
+  def : Pat<(v4f32 (X86Blendps (v4f32 VR128:$src1), (v4f32 VR128:$src2),
+                               (imm:$mask))),
+            (BLENDPSrri VR128:$src2, VR128:$src1, imm:$mask)>;
+  def : Pat<(v2f64 (X86Blendpd (v2f64 VR128:$src1), (v2f64 VR128:$src2),
+                               (imm:$mask))),
+            (BLENDPDrri VR128:$src2, VR128:$src1, imm:$mask)>;
+
 }
 let Predicates = [HasAVX] in
@@ -7294,6 +7325,46 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
                             []>, VEX_4V;
 }
+let Predicates = [HasAVX] in {
+def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
+                                   (i32 imm)),
+          (VINSERTF128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
+                                   (i32 imm)),
+          (VINSERTF128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
+                                   (i32 imm)),
+          (VINSERTF128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
+                                   (i32 imm)),
+          (VINSERTF128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
+                                   (i32 imm)),
+          (VINSERTF128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
+                                   (i32 imm)),
+          (VINSERTF128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+
+def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (loadv4f32 addr:$src2),
+                                   (i32 imm)),
+          (VINSERTF128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2),
+                                   (i32 imm)),
+          (VINSERTF128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
+                                   (i32 imm)),
+          (VINSERTF128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+}
+
 //===----------------------------------------------------------------------===//
 // VEXTRACTF128 - Extract packed floating-point values
 //
@@ -7664,45 +7735,47 @@ def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
 //
 multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
-                     Intrinsic Int> {
+                     ValueType OpVT> {
   def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
                    (ins VR256:$src1, VR256:$src2),
                    !strconcat(OpcodeStr,
                        "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                   [(set VR256:$dst, (Int VR256:$src1, VR256:$src2))]>, VEX_4V;
+                   [(set VR256:$dst,
+                     (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>, VEX_4V;
   def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
                    (ins VR256:$src1, i256mem:$src2),
                    !strconcat(OpcodeStr,
                        "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                   [(set VR256:$dst, (Int VR256:$src1,
-                                      (bitconvert (mem_frag addr:$src2))))]>,
+                   [(set VR256:$dst,
+                     (OpVT (X86VPermv VR256:$src1,
+                            (bitconvert (mem_frag addr:$src2)))))]>,
                    VEX_4V;
 }
-defm VPERMD : avx2_perm<0x36, "vpermd", memopv4i64, int_x86_avx2_permd>;
+defm VPERMD : avx2_perm<0x36, "vpermd", memopv4i64, v8i32>;
 let ExeDomain = SSEPackedSingle in
-defm VPERMPS : avx2_perm<0x16, "vpermps", memopv8f32, int_x86_avx2_permps>;
+defm VPERMPS : avx2_perm<0x16, "vpermps", memopv8f32, v8f32>;
 multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
-                         Intrinsic Int> {
-  def Yrr : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
+                         ValueType OpVT> {
+  def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
                      (ins VR256:$src1, i8imm:$src2),
                      !strconcat(OpcodeStr,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                     [(set VR256:$dst, (Int VR256:$src1, imm:$src2))]>, VEX;
-  def Yrm : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
+                     [(set VR256:$dst,
+                       (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>, VEX;
+  def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
                      (ins i256mem:$src1, i8imm:$src2),
                      !strconcat(OpcodeStr,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                     [(set VR256:$dst, (Int (mem_frag addr:$src1), imm:$src2))]>,
-                     VEX;
+                     [(set VR256:$dst,
+                       (OpVT (X86VPermi (mem_frag addr:$src1),
+                              (i8 imm:$src2))))]>, VEX;
 }
-defm VPERMQ : avx2_perm_imm<0x00, "vpermq", memopv4i64, int_x86_avx2_permq>,
-              VEX_W;
+defm VPERMQ : avx2_perm_imm<0x00, "vpermq", memopv4i64, v4i64>, VEX_W;
 let ExeDomain = SSEPackedDouble in
-defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", memopv4f64, int_x86_avx2_permpd>,
-               VEX_W;
+defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", memopv4f64, v4f64>, VEX_W;
 //===----------------------------------------------------------------------===//
 // VPERM2I128 - Permute Floating-Point Values in 128-bit chunks
@@ -7743,18 +7816,17 @@ def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)),
 //===----------------------------------------------------------------------===//
 // VINSERTI128 - Insert packed integer values
 //
+let neverHasSideEffects = 1 in {
 def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
                              (ins VR256:$src1, VR128:$src2, i8imm:$src3),
                              "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-                             [(set VR256:$dst,
-                               (int_x86_avx2_vinserti128 VR256:$src1, VR128:$src2, imm:$src3))]>,
+                             []>,
                              VEX_4V;
 def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
                              (ins VR256:$src1, i128mem:$src2, i8imm:$src3),
                              "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-                             [(set VR256:$dst,
-                               (int_x86_avx2_vinserti128 VR256:$src1, (memopv2i64 addr:$src2),
-                                imm:$src3))]>, VEX_4V;
+                             []>, VEX_4V;
+}
 let Predicates = [HasAVX2], AddedComplexity = 1 in {
 def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
@@ -7775,47 +7847,6 @@ def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
                          (INSERT_get_vinsertf128_imm VR256:$ins))>;
 }
-// AVX1 patterns
-let Predicates = [HasAVX] in {
-def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
-                                   (i32 imm)),
-          (VINSERTF128rr VR256:$src1, VR128:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
-                                   (i32 imm)),
-          (VINSERTF128rr VR256:$src1, VR128:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
-                                   (i32 imm)),
-          (VINSERTF128rr VR256:$src1, VR128:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
-                                   (i32 imm)),
-          (VINSERTF128rr VR256:$src1, VR128:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
-                                   (i32 imm)),
-          (VINSERTF128rr VR256:$src1, VR128:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
-                                   (i32 imm)),
-          (VINSERTF128rr VR256:$src1, VR128:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
-
-def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (loadv4f32 addr:$src2),
-                                   (i32 imm)),
-          (VINSERTF128rm VR256:$src1, addr:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2),
-                                   (i32 imm)),
-          (VINSERTF128rm VR256:$src1, addr:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
-                                   (i32 imm)),
-          (VINSERTF128rm VR256:$src1, addr:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
-}
-
 //===----------------------------------------------------------------------===//
 // VEXTRACTI128 - Extract packed integer values
 //
@@ -7830,7 +7861,7 @@ def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
                               (ins i128mem:$dst, VR256:$src1, i8imm:$src2),
                               "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                               []>, VEX;
-let Predicates = [HasAVX2] in {
+let Predicates = [HasAVX2], AddedComplexity = 1 in {
 def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
           (v2i64 (VEXTRACTI128rr
                     (v4i64 VR256:$src1),
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 3eb9441..ed1a409 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -21,7 +21,6 @@
 #include "llvm/Support/Host.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
-#include "llvm/ADT/SmallVector.h"
 #define GET_SUBTARGETINFO_TARGET_DESC
 #define GET_SUBTARGETINFO_CTOR
@@ -425,7 +424,9 @@ bool X86Subtarget::enablePostRAScheduler(
            CodeGenOpt::Level OptLevel,
            TargetSubtargetInfo::AntiDepBreakMode& Mode,
            RegClassVector& CriticalPathRCs) const {
-  Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL;
+  //TODO: change back to ANTIDEP_CRITICAL when the
+  // X86 subtarget properly sets up post RA liveness.
+  Mode = TargetSubtargetInfo::ANTIDEP_NONE;
   CriticalPathRCs.clear();
   return PostRAScheduler && OptLevel >= CodeGenOpt::Default;
 }
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index c0d2a9c..718f35e 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -14,7 +14,6 @@
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCSectionMachO.h"
 #include "llvm/Target/Mangler.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/Support/Dwarf.h"
 using namespace llvm;
 using namespace dwarf;
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
index 64f1a8e..1cfdbda 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//
 #include "XCoreMCAsmInfo.h"
+#include "llvm/ADT/StringRef.h"
 using namespace llvm;
 void XCoreMCAsmInfo::anchor() { }
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
index 24e170a..0767775 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
@@ -14,10 +14,10 @@
 #ifndef XCORETARGETASMINFO_H
 #define XCORETARGETASMINFO_H
-#include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCAsmInfo.h"
 namespace llvm {
+  class StringRef;
   class Target;
   class XCoreMCAsmInfo : public MCAsmInfo {
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
index 3cfc376..a255adb 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
@@ -15,9 +15,7 @@
 #define XCOREMCTARGETDESC_H
 namespace llvm {
-class MCSubtargetInfo;
 class Target;
-class StringRef;
 extern Target TheXCoreTarget;
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index 593cebc..fdf2b78 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -1363,8 +1363,8 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
       APInt KnownZero, KnownOne;
       APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
                                          VT.getSizeInBits() - 1);
-      DAG.ComputeMaskedBits(N2, Mask, KnownZero, KnownOne);
-      if (KnownZero == Mask) {
+      DAG.ComputeMaskedBits(N2, KnownZero, KnownOne);
+      if ((KnownZero & Mask) == Mask) {
         SDValue Carry = DAG.getConstant(0, VT);
         SDValue Result = DAG.getNode(ISD::ADD, dl, VT, N0, N2);
         SDValue Ops [] = { Carry, Result };
@@ -1386,8 +1386,8 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
       APInt KnownZero, KnownOne;
       APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
                                          VT.getSizeInBits() - 1);
-      DAG.ComputeMaskedBits(N2, Mask, KnownZero, KnownOne);
-      if (KnownZero == Mask) {
+      DAG.ComputeMaskedBits(N2, KnownZero, KnownOne);
+      if ((KnownZero & Mask) == Mask) {
         SDValue Borrow = N2;
         SDValue Result = DAG.getNode(ISD::SUB, dl, VT,
                                      DAG.getConstant(0, VT), N2);
@@ -1402,8 +1402,8 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
       APInt KnownZero, KnownOne;
       APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
                                          VT.getSizeInBits() - 1);
-      DAG.ComputeMaskedBits(N2, Mask, KnownZero, KnownOne);
-      if (KnownZero == Mask) {
+      DAG.ComputeMaskedBits(N2, KnownZero, KnownOne);
+      if ((KnownZero & Mask) == Mask) {
         SDValue Borrow = DAG.getConstant(0, VT);
         SDValue Result = DAG.getNode(ISD::SUB, dl, VT, N0, N2);
         SDValue Ops [] = { Borrow, Result };
@@ -1521,21 +1521,19 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
 }
 void XCoreTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
-                                                         const APInt &Mask,
                                                          APInt &KnownZero,
                                                          APInt &KnownOne,
                                                          const SelectionDAG &DAG,
                                                          unsigned Depth) const {
-  KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);
+  KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0);
   switch (Op.getOpcode()) {
   default: break;
   case XCoreISD::LADD:
   case XCoreISD::LSUB:
     if (Op.getResNo() == 0) {
       // Top bits of carry / borrow are clear.
-      KnownZero = APInt::getHighBitsSet(Mask.getBitWidth(),
-                                        Mask.getBitWidth() - 1);
-      KnownZero &= Mask;
+      KnownZero = APInt::getHighBitsSet(KnownZero.getBitWidth(),
+                                        KnownZero.getBitWidth() - 1);
     }
     break;
   }
diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h
index 5cd3e67..0b63ecd 100644
--- a/lib/Target/XCore/XCoreISelLowering.h
+++ b/lib/Target/XCore/XCoreISelLowering.h
@@ -160,7 +160,6 @@ namespace llvm {
     virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
     virtual void computeMaskedBitsForTargetNode(const SDValue Op,
-                                                const APInt &Mask,
                                                 APInt &KnownZero,
                                                 APInt &KnownOne,
                                                 const SelectionDAG &DAG,
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index a32e550..1522aa4 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -341,6 +341,12 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
         dyn_cast_or_null<ConstantExpr>(ConstantFoldInstruction(GEP, TD, TLI));
       if (Init && CE && CE->getOpcode() == Instruction::GetElementPtr)
         SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE);
+
+      // If the initializer is an all-null value and we have an inbounds GEP,
+      // we already know what the result of any load from that GEP is.
+      // TODO: Handle splats.
+      if (Init && isa<ConstantAggregateZero>(Init) && GEP->isInBounds())
+        SubInit = Constant::getNullValue(GEP->getType()->getElementType());
     }
     Changed |= CleanupConstantGlobalUsers(GEP, SubInit, TD, TLI);
diff --git a/lib/Transforms/IPO/InlineAlways.cpp b/lib/Transforms/IPO/InlineAlways.cpp
index 3c7fac6..664ddf6 100644
--- a/lib/Transforms/IPO/InlineAlways.cpp
+++ b/lib/Transforms/IPO/InlineAlways.cpp
@@ -32,7 +32,6 @@ namespace {
   // AlwaysInliner only inlines functions that are marked as "always inline".
   class AlwaysInliner : public Inliner {
-    InlineCostAnalyzer CA;
   public:
     // Use extremely low threshold.
     AlwaysInliner() : Inliner(ID, -2000000000, /*InsertLifetime*/true) {
@@ -43,40 +42,11 @@ namespace {
       initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry());
     }
     static char ID; // Pass identification, replacement for typeid
-    InlineCost getInlineCost(CallSite CS) {
-      Function *Callee = CS.getCalledFunction();
-      // We assume indirect calls aren't calling an always-inline function.
-      if (!Callee) return InlineCost::getNever();
-
-      // We can't inline calls to external functions.
-      // FIXME: We shouldn't even get here.
-      if (Callee->isDeclaration()) return InlineCost::getNever();
-
-      // Return never for anything not marked as always inline.
-      if (!Callee->hasFnAttr(Attribute::AlwaysInline))
-        return InlineCost::getNever();
-
-      // We still have to check the inline cost in case there are reasons to
-      // not inline which trump the always-inline attribute such as setjmp and
-      // indirectbr.
-      return CA.getInlineCost(CS);
-    }
-    float getInlineFudgeFactor(CallSite CS) {
-      return CA.getInlineFudgeFactor(CS);
-    }
-    void resetCachedCostInfo(Function *Caller) {
-      CA.resetCachedCostInfo(Caller);
-    }
-    void growCachedCostInfo(Function* Caller, Function* Callee) {
-      CA.growCachedCostInfo(Caller, Callee);
-    }
+    virtual InlineCost getInlineCost(CallSite CS);
     virtual bool doFinalization(CallGraph &CG) {
       return removeDeadFunctions(CG, /*AlwaysInlineOnly=*/true);
     }
     virtual bool doInitialization(CallGraph &CG);
-    void releaseMemory() {
-      CA.clear();
-    }
   };
 }
@@ -93,9 +63,70 @@ Pass *llvm::createAlwaysInlinerPass(bool InsertLifetime) {
   return new AlwaysInliner(InsertLifetime);
 }
+/// \brief Minimal filter to detect invalid constructs for inlining.
+static bool isInlineViable(Function &F) {
+  bool ReturnsTwice = F.hasFnAttr(Attribute::ReturnsTwice);
+  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
+    // Disallow inlining of functions which contain an indirect branch.
+    if (isa<IndirectBrInst>(BI->getTerminator()))
+      return false;
+
+    for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;
+         ++II) {
+      CallSite CS(II);
+      if (!CS)
+        continue;
+
+      // Disallow recursive calls.
+      if (&F == CS.getCalledFunction())
+        return false;
+
+      // Disallow calls which expose returns-twice to a function not previously
+      // attributed as such.
+      if (!ReturnsTwice && CS.isCall() &&
+          cast<CallInst>(CS.getInstruction())->canReturnTwice())
+        return false;
+    }
+  }
+
+  return true;
+}
+
+/// \brief Get the inline cost for the always-inliner.
+///
+/// The always inliner *only* handles functions which are marked with the
+/// attribute to force inlining. As such, it is dramatically simpler and avoids
+/// using the powerful (but expensive) inline cost analysis. Instead it uses
+/// a very simple and boring direct walk of the instructions looking for
+/// impossible-to-inline constructs.
+///
+/// Note, it would be possible to go to some lengths to cache the information
+/// computed here, but as we only expect to do this for relatively few and
+/// small functions which have the explicit attribute to force inlining, it is
+/// likely not worth it in practice.
+InlineCost AlwaysInliner::getInlineCost(CallSite CS) {
+  Function *Callee = CS.getCalledFunction();
+  // We assume indirect calls aren't calling an always-inline function.
+  if (!Callee) return InlineCost::getNever();
+
+  // We can't inline calls to external functions.
+  // FIXME: We shouldn't even get here.
+  if (Callee->isDeclaration()) return InlineCost::getNever();
+
+  // Return never for anything not marked as always inline.
+  if (!Callee->hasFnAttr(Attribute::AlwaysInline))
+    return InlineCost::getNever();
+
+  // Do some minimal analysis to preclude non-viable functions.
+  if (!isInlineViable(*Callee))
+    return InlineCost::getNever();
+
+  // Otherwise, force inlining.
+  return InlineCost::getAlways();
+}
+
 // doInitialization - Initializes the vector of functions that have not
 // been annotated with the "always inline" attribute.
 bool AlwaysInliner::doInitialization(CallGraph &CG) {
-  CA.setTargetData(getAnalysisIfAvailable<TargetData>());
   return false;
 }
diff --git a/lib/Transforms/IPO/InlineSimple.cpp b/lib/Transforms/IPO/InlineSimple.cpp
index 03032e6..50038d8 100644
--- a/lib/Transforms/IPO/InlineSimple.cpp
+++ b/lib/Transforms/IPO/InlineSimple.cpp
@@ -40,21 +40,9 @@ namespace {
     }
     static char ID; // Pass identification, replacement for typeid
     InlineCost getInlineCost(CallSite CS) {
-      return CA.getInlineCost(CS);
-    }
-    float getInlineFudgeFactor(CallSite CS) {
-      return CA.getInlineFudgeFactor(CS);
-    }
-    void resetCachedCostInfo(Function *Caller) {
-      CA.resetCachedCostInfo(Caller);
-    }
-    void growCachedCostInfo(Function* Caller, Function* Callee) {
-      CA.growCachedCostInfo(Caller, Callee);
+      return CA.getInlineCost(CS, getInlineThreshold(CS));
     }
     virtual bool doInitialization(CallGraph &CG);
-    void releaseMemory() {
-      CA.clear();
-    }
   };
 }
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index 9975333..dc9cbfb 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -19,7 +19,6 @@
 #include "llvm/IntrinsicInst.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/InlineCost.h"
-#include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Target/TargetData.h"
 #include "llvm/Transforms/IPO/InlinerPass.h"
 #include "llvm/Transforms/Utils/Cloning.h"
@@ -37,6 +36,11 @@ STATISTIC(NumCallsDeleted, "Number of call sites deleted, not inlined");
 STATISTIC(NumDeleted, "Number of functions deleted because all callers found");
 STATISTIC(NumMergedAllocas, "Number of allocas merged together");
+// This weirdly named statistic tracks the number of times that, when attempting
+// to inline a function A into B, we analyze the callers of B in order to see
+// if those would be more profitable and blocked inline steps.
+STATISTIC(NumCallerCallersAnalyzed, "Number of caller-callers analyzed");
+
 static cl::opt<int>
 InlineLimit("inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
         cl::desc("Control the amount of inlining to perform (default = 225)"));
@@ -232,14 +236,10 @@ bool Inliner::shouldInline(CallSite CS) {
     return false;
   }
-  int Cost = IC.getValue();
   Function *Caller = CS.getCaller();
-  int CurrentThreshold = getInlineThreshold(CS);
-  float FudgeFactor = getInlineFudgeFactor(CS);
-  int AdjThreshold = (int)(CurrentThreshold * FudgeFactor);
-  if (Cost >= AdjThreshold) {
-    DEBUG(dbgs() << "    NOT Inlining: cost=" << Cost
-          << ", thres=" << AdjThreshold
+  if (!IC) {
+    DEBUG(dbgs() << "    NOT Inlining: cost=" << IC.getCost()
+          << ", thres=" << (IC.getCostDelta() + IC.getCost())
           << ", Call: " << *CS.getInstruction() << "\n");
     return false;
   }
@@ -256,12 +256,17 @@ bool Inliner::shouldInline(CallSite CS) {
   // are used. Thus we will always have the opportunity to make local inlining
   // decisions. Importantly the linkonce-ODR linkage covers inline functions
   // and templates in C++.
+  //
+  // FIXME: All of this logic should be sunk into getInlineCost. It relies on
+  // the internal implementation of the inline cost metrics rather than
+  // treating them as truly abstract units etc.
   if (Caller->hasLocalLinkage() ||
       Caller->getLinkage() == GlobalValue::LinkOnceODRLinkage) {
     int TotalSecondaryCost = 0;
-    bool outerCallsFound = false;
+    // The candidate cost to be imposed upon the current function.
+    int CandidateCost = IC.getCost() - (InlineConstants::CallPenalty + 1);
     // This bool tracks what happens if we do NOT inline C into B.
-    bool callerWillBeRemoved = true;
+    bool callerWillBeRemoved = Caller->hasLocalLinkage();
     // This bool tracks what happens if we DO inline C into B.
     bool inliningPreventsSomeOuterInline = false;
     for (Value::use_iterator I = Caller->use_begin(), E = Caller->use_end();
@@ -277,26 +282,20 @@ bool Inliner::shouldInline(CallSite CS) {
       }
       InlineCost IC2 = getInlineCost(CS2);
-      if (IC2.isNever())
+      ++NumCallerCallersAnalyzed;
+      if (!IC2) {
         callerWillBeRemoved = false;
-      if (IC2.isAlways() || IC2.isNever())
+        continue;
+      }
+      if (IC2.isAlways())
         continue;
-      outerCallsFound = true;
-      int Cost2 = IC2.getValue();
-      int CurrentThreshold2 = getInlineThreshold(CS2);
-      float FudgeFactor2 = getInlineFudgeFactor(CS2);
-
-      if (Cost2 >= (int)(CurrentThreshold2 * FudgeFactor2))
-        callerWillBeRemoved = false;
-
-      // See if we have this case.  We subtract off the penalty
-      // for the call instruction, which we would be deleting.
-      if (Cost2 < (int)(CurrentThreshold2 * FudgeFactor2) &&
-          Cost2 + Cost - (InlineConstants::CallPenalty + 1) >=
-            (int)(CurrentThreshold2 * FudgeFactor2)) {
+      // See if inlining or original callsite would erase the cost delta of
+      // this callsite. We subtract off the penalty for the call instruction,
+      // which we would be deleting.
+      if (IC2.getCostDelta() <= CandidateCost) {
         inliningPreventsSomeOuterInline = true;
-        TotalSecondaryCost += Cost2;
+        TotalSecondaryCost += IC2.getCost();
       }
     }
     // If all outer calls to Caller would get inlined, the cost for the last
@@ -306,17 +305,16 @@ bool Inliner::shouldInline(CallSite CS) {
     if (callerWillBeRemoved && Caller->use_begin() != Caller->use_end())
       TotalSecondaryCost += InlineConstants::LastCallToStaticBonus;
-    if (outerCallsFound && inliningPreventsSomeOuterInline &&
-        TotalSecondaryCost < Cost) {
-      DEBUG(dbgs() << "    NOT Inlining: " << *CS.getInstruction() <<
-           " Cost = " << Cost <<
+    if (inliningPreventsSomeOuterInline && TotalSecondaryCost < IC.getCost()) {
+      DEBUG(dbgs() << "    NOT Inlining: " << *CS.getInstruction() <<
+           " Cost = " << IC.getCost() <<
            ", outer Cost = " << TotalSecondaryCost << '\n');
       return false;
     }
   }
-  DEBUG(dbgs() << "    Inlining: cost=" << Cost
-        << ", thres=" << AdjThreshold
+  DEBUG(dbgs() << "    Inlining: cost=" << IC.getCost()
+        << ", thres=" << (IC.getCostDelta() + IC.getCost())
        << ", Call: " << *CS.getInstruction() << '\n');
   return true;
 }
@@ -335,38 +333,6 @@ static bool InlineHistoryIncludes(Function *F, int InlineHistoryID,
   return false;
 }
-/// \brief Simplify arguments going into a particular callsite.
-///
-/// This is important to do each time we add a callsite due to inlining so that
-/// constants and other entities which feed into inline cost estimation are
-/// properly recognized when analyzing the new callsite. Consider:
-///   void outer(int x) {
-///     if (x < 42)
-///       return inner(42 - x);
-///     ...
-///   }
-///   void inner(int x) {
-///     ...
-///   }
-///
-/// The inliner gives calls to 'outer' with a constant argument a bonus because
-/// it will delete one side of a branch. But the resulting call to 'inner'
-/// will, after inlining, also have a constant operand. We need to do just
-/// enough constant folding to expose this for callsite arguments. The rest
-/// will be taken care of after the inliner finishes running.
-static void simplifyCallSiteArguments(const TargetData *TD, CallSite CS) {
-  // FIXME: It would be nice to avoid this smallvector if RAUW doesn't
-  // invalidate operand iterators in any cases.
-  SmallVector<std::pair<Value *, Value*>, 4> SimplifiedArgs;
-  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
-       I != E; ++I)
-    if (Instruction *Inst = dyn_cast<Instruction>(*I))
-      if (Value *SimpleArg = SimplifyInstruction(Inst, TD))
-        SimplifiedArgs.push_back(std::make_pair(Inst, SimpleArg));
-  for (unsigned Idx = 0, Size = SimplifiedArgs.size(); Idx != Size; ++Idx)
-    SimplifiedArgs[Idx].first->replaceAllUsesWith(SimplifiedArgs[Idx].second);
-}
-
 bool Inliner::runOnSCC(CallGraphSCC &SCC) {
   CallGraph &CG = getAnalysis<CallGraph>();
   const TargetData *TD = getAnalysisIfAvailable<TargetData>();
@@ -455,8 +421,6 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {
         CG[Caller]->removeCallEdgeFor(CS);
         CS.getInstruction()->eraseFromParent();
         ++NumCallsDeleted;
-        // Update the cached cost info with the missing call
-        growCachedCostInfo(Caller, NULL);
       } else {
         // We can only inline direct calls to non-declarations.
        if (Callee == 0 || Callee->isDeclaration()) continue;
@@ -494,14 +458,9 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {
           for (unsigned i = 0, e = InlineInfo.InlinedCalls.size(); i != e; ++i) {
             Value *Ptr = InlineInfo.InlinedCalls[i];
-            CallSite NewCS = Ptr;
-            simplifyCallSiteArguments(TD, NewCS);
-            CallSites.push_back(std::make_pair(NewCS, NewHistoryID));
+            CallSites.push_back(std::make_pair(CallSite(Ptr), NewHistoryID));
           }
         }
-
-        // Update the cached cost info with the inlined call.
-        growCachedCostInfo(Caller, Callee);
       }
       // If we inlined or deleted the last possible call site to the function,
@@ -521,8 +480,6 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {
       // Remove any call graph edges from the callee to its callees.
       CalleeNode->removeAllCalledFunctions();
-      resetCachedCostInfo(Callee);
-
       // Removing the node for callee from the call graph and delete it.
       delete CG.removeFunctionFromModule(CalleeNode);
       ++NumDeleted;
@@ -601,14 +558,13 @@ bool Inliner::removeDeadFunctions(CallGraph &CG, bool AlwaysInlineOnly) {
   // Note that it doesn't matter that we are iterating over a non-stable order
   // here to do this, it doesn't matter which order the functions are deleted
   // in.
-  std::sort(FunctionsToRemove.begin(), FunctionsToRemove.end());
+  array_pod_sort(FunctionsToRemove.begin(), FunctionsToRemove.end());
   FunctionsToRemove.erase(std::unique(FunctionsToRemove.begin(),
                                       FunctionsToRemove.end()),
                           FunctionsToRemove.end());
   for (SmallVectorImpl<CallGraphNode *>::iterator I = FunctionsToRemove.begin(),
                                                   E = FunctionsToRemove.end();
        I != E; ++I) {
-    resetCachedCostInfo((*I)->getFunction());
     delete CG.removeFunctionFromModule(*I);
     ++NumDeleted;
   }
diff --git a/lib/Transforms/IPO/Internalize.cpp b/lib/Transforms/IPO/Internalize.cpp
index 7cb1d18..fb5869e 100644
--- a/lib/Transforms/IPO/Internalize.cpp
+++ b/lib/Transforms/IPO/Internalize.cpp
@@ -122,6 +122,11 @@ bool InternalizePass::runOnModule(Module &M) {
   bool Changed = false;
+  // Never internalize functions which code-gen might insert.
+  // FIXME: We should probably add this (and the __stack_chk_guard) via some
+  // type of call-back in CodeGen.
+  ExternalNames.insert("__stack_chk_fail");
+
   // Mark all functions not in the api as internal.
   // FIXME: maybe use private linkage?
   for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
@@ -148,9 +153,11 @@ bool InternalizePass::runOnModule(Module &M) {
   // won't find them. (see MachineModuleInfo.)
ExternalNames.insert("llvm.global_ctors"); ExternalNames.insert("llvm.global_dtors"); - ExternalNames.insert("llvm.noinline"); ExternalNames.insert("llvm.global.annotations"); + // Never internalize symbols code-gen inserts. + ExternalNames.insert("__stack_chk_guard"); + // Mark all global variables with initializers that are not in the api as // internal as well. // FIXME: maybe use private linkage? diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index 8408437..43b4ab5 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -35,6 +35,11 @@ using namespace llvm; static cl::opt<bool> RunVectorization("vectorize", cl::desc("Run vectorization passes")); +static cl::opt<bool> +UseGVNAfterVectorization("use-gvn-after-vectorization", + cl::init(false), cl::Hidden, + cl::desc("Run GVN instead of Early CSE after vectorization passes")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; @@ -182,8 +187,10 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { if (Vectorize) { MPM.add(createBBVectorizePass()); MPM.add(createInstructionCombiningPass()); - if (OptLevel > 1) - MPM.add(createGVNPass()); // Remove redundancies + if (OptLevel > 1 && UseGVNAfterVectorization) + MPM.add(createGVNPass()); // Remove redundancies + else + MPM.add(createEarlyCSEPass()); // Catch trivial redundancies } MPM.add(createAggressiveDCEPass()); // Delete dead instructions @@ -202,11 +209,13 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { if (OptLevel > 1) MPM.add(createConstantMergePass()); // Merge dup global constants } + addExtensionsToPM(EP_OptimizerLast, MPM); } void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM, bool Internalize, - bool RunInliner) { + bool RunInliner, + bool DisableGVNLoadPRE) { // Provide AliasAnalysis services for optimizations. addInitialAliasAnalysisPasses(PM); @@ -262,9 +271,9 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM, PM.add(createFunctionAttrsPass()); // Add nocapture. PM.add(createGlobalsModRefPass()); // IP alias analysis. - PM.add(createLICMPass()); // Hoist loop invariants. - PM.add(createGVNPass()); // Remove redundancies. - PM.add(createMemCpyOptPass()); // Remove dead memcpys. + PM.add(createLICMPass()); // Hoist loop invariants. + PM.add(createGVNPass(DisableGVNLoadPRE)); // Remove redundancies. + PM.add(createMemCpyOptPass()); // Remove dead memcpys. // Nuke dead stores. 
   PM.add(createDeadStoreEliminationPass());
diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h
index 464e9d0..199df51 100644
--- a/lib/Transforms/InstCombine/InstCombine.h
+++ b/lib/Transforms/InstCombine/InstCombine.h
@@ -291,9 +291,9 @@ public:
     return 0;  // Don't do anything with FI
   }
-  void ComputeMaskedBits(Value *V, const APInt &Mask, APInt &KnownZero,
+  void ComputeMaskedBits(Value *V, APInt &KnownZero,
                          APInt &KnownOne, unsigned Depth = 0) const {
-    return llvm::ComputeMaskedBits(V, Mask, KnownZero, KnownOne, TD, Depth);
+    return llvm::ComputeMaskedBits(V, KnownZero, KnownOne, TD, Depth);
   }
   bool MaskedValueIsZero(Value *V, const APInt &Mask,
diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 908038b..05e702f 100644
--- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -141,10 +141,9 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
       // a sub and fuse this add with it.
       if (LHS->hasOneUse() && (XorRHS->getValue()+1).isPowerOf2()) {
         IntegerType *IT = cast<IntegerType>(I.getType());
-        APInt Mask = APInt::getAllOnesValue(IT->getBitWidth());
         APInt LHSKnownOne(IT->getBitWidth(), 0);
         APInt LHSKnownZero(IT->getBitWidth(), 0);
-        ComputeMaskedBits(XorLHS, Mask, LHSKnownZero, LHSKnownOne);
+        ComputeMaskedBits(XorLHS, LHSKnownZero, LHSKnownOne);
         if ((XorRHS->getValue() | LHSKnownZero).isAllOnesValue())
           return BinaryOperator::CreateSub(ConstantExpr::getAdd(XorRHS, CI),
                                            XorLHS);
@@ -202,14 +201,13 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
   // A+B --> A|B iff A and B have no bits set in common.
   if (IntegerType *IT = dyn_cast<IntegerType>(I.getType())) {
-    APInt Mask = APInt::getAllOnesValue(IT->getBitWidth());
     APInt LHSKnownOne(IT->getBitWidth(), 0);
     APInt LHSKnownZero(IT->getBitWidth(), 0);
-    ComputeMaskedBits(LHS, Mask, LHSKnownZero, LHSKnownOne);
+    ComputeMaskedBits(LHS, LHSKnownZero, LHSKnownOne);
     if (LHSKnownZero != 0) {
       APInt RHSKnownOne(IT->getBitWidth(), 0);
       APInt RHSKnownZero(IT->getBitWidth(), 0);
-      ComputeMaskedBits(RHS, Mask, RHSKnownZero, RHSKnownOne);
+      ComputeMaskedBits(RHS, RHSKnownZero, RHSKnownOne);
       // No bits in common -> bitwise or.
       if ((LHSKnownZero|RHSKnownZero).isAllOnesValue())
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 1165660..0dbe11d 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -1362,13 +1362,8 @@ static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask,
   // part of the value (e.g. byte 3) then it must be shifted right.  If from the
   // low part, it must be shifted left.
   unsigned DestByteNo = InputByteNo + OverallLeftShift;
-  if (InputByteNo < ByteValues.size()/2) {
-    if (ByteValues.size()-1-DestByteNo != InputByteNo)
-      return true;
-  } else {
-    if (ByteValues.size()-1-DestByteNo != InputByteNo)
-      return true;
-  }
+  if (ByteValues.size()-1-DestByteNo != InputByteNo)
+    return true;
   // If the destination byte value is already defined, the values are or'd
   // together, which isn't a bswap (unless it's an or of the same bits).
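A pattern repeated throughout this commit, in the InstCombine hunks here and in the target hunks earlier: ComputeMaskedBits no longer takes a leading all-ones (or sign-bit) Mask argument, and the bit width instead travels in the caller-sized KnownZero/KnownOne APInts. (The InlineAlways.cpp and Inliner.cpp hunks above make the same kind of simplification for inlining, folding thresholds and fudge factors into the InlineCost value itself.) A minimal before/after sketch against the updated ValueTracking interface; the helper and the fixed 32-bit width are illustrative assumptions, not part of the patch:

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/ADT/APInt.h"

using namespace llvm;

// Illustrative helper: true when the sign bit of a 32-bit value is
// provably clear.
//   Old call: ComputeMaskedBits(V, APInt::getAllOnesValue(32), KZ, KO);
//   New call (as in the hunks above): the Mask parameter is gone.
static bool signBitKnownZero32(Value *V) {
  APInt KnownZero(32, 0), KnownOne(32, 0);
  ComputeMaskedBits(V, KnownZero, KnownOne); // TargetData/Depth defaulted
  return KnownZero[31];
}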
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index b550fe8..77e4727 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -361,8 +361,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { uint32_t BitWidth = IT->getBitWidth(); APInt KnownZero(BitWidth, 0); APInt KnownOne(BitWidth, 0); - ComputeMaskedBits(II->getArgOperand(0), APInt::getAllOnesValue(BitWidth), - KnownZero, KnownOne); + ComputeMaskedBits(II->getArgOperand(0), KnownZero, KnownOne); unsigned TrailingZeros = KnownOne.countTrailingZeros(); APInt Mask(APInt::getLowBitsSet(BitWidth, TrailingZeros)); if ((Mask & KnownZero) == Mask) @@ -380,8 +379,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { uint32_t BitWidth = IT->getBitWidth(); APInt KnownZero(BitWidth, 0); APInt KnownOne(BitWidth, 0); - ComputeMaskedBits(II->getArgOperand(0), APInt::getAllOnesValue(BitWidth), - KnownZero, KnownOne); + ComputeMaskedBits(II->getArgOperand(0), KnownZero, KnownOne); unsigned LeadingZeros = KnownOne.countLeadingZeros(); APInt Mask(APInt::getHighBitsSet(BitWidth, LeadingZeros)); if ((Mask & KnownZero) == Mask) @@ -394,17 +392,16 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1); IntegerType *IT = cast<IntegerType>(II->getArgOperand(0)->getType()); uint32_t BitWidth = IT->getBitWidth(); - APInt Mask = APInt::getSignBit(BitWidth); APInt LHSKnownZero(BitWidth, 0); APInt LHSKnownOne(BitWidth, 0); - ComputeMaskedBits(LHS, Mask, LHSKnownZero, LHSKnownOne); + ComputeMaskedBits(LHS, LHSKnownZero, LHSKnownOne); bool LHSKnownNegative = LHSKnownOne[BitWidth - 1]; bool LHSKnownPositive = LHSKnownZero[BitWidth - 1]; if (LHSKnownNegative || LHSKnownPositive) { APInt RHSKnownZero(BitWidth, 0); APInt RHSKnownOne(BitWidth, 0); - ComputeMaskedBits(RHS, Mask, RHSKnownZero, RHSKnownOne); + ComputeMaskedBits(RHS, RHSKnownZero, RHSKnownOne); bool RHSKnownNegative = RHSKnownOne[BitWidth - 1]; bool RHSKnownPositive = RHSKnownZero[BitWidth - 1]; if (LHSKnownNegative && RHSKnownNegative) { @@ -488,14 +485,13 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::umul_with_overflow: { Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1); unsigned BitWidth = cast<IntegerType>(LHS->getType())->getBitWidth(); - APInt Mask = APInt::getAllOnesValue(BitWidth); APInt LHSKnownZero(BitWidth, 0); APInt LHSKnownOne(BitWidth, 0); - ComputeMaskedBits(LHS, Mask, LHSKnownZero, LHSKnownOne); + ComputeMaskedBits(LHS, LHSKnownZero, LHSKnownOne); APInt RHSKnownZero(BitWidth, 0); APInt RHSKnownOne(BitWidth, 0); - ComputeMaskedBits(RHS, Mask, RHSKnownZero, RHSKnownOne); + ComputeMaskedBits(RHS, RHSKnownZero, RHSKnownOne); // Get the largest possible values for each operand. 
APInt LHSMax = ~LHSKnownZero; diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index c5ddb75..39279f4 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -541,8 +541,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, // If Op1C is some other power of two, convert: uint32_t BitWidth = Op1C->getType()->getBitWidth(); APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - APInt TypeMask(APInt::getAllOnesValue(BitWidth)); - ComputeMaskedBits(ICI->getOperand(0), TypeMask, KnownZero, KnownOne); + ComputeMaskedBits(ICI->getOperand(0), KnownZero, KnownOne); APInt KnownZeroMask(~KnownZero); if (KnownZeroMask.isPowerOf2()) { // Exactly 1 possible 1? @@ -590,9 +589,8 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, APInt KnownZeroLHS(BitWidth, 0), KnownOneLHS(BitWidth, 0); APInt KnownZeroRHS(BitWidth, 0), KnownOneRHS(BitWidth, 0); - APInt TypeMask(APInt::getAllOnesValue(BitWidth)); - ComputeMaskedBits(LHS, TypeMask, KnownZeroLHS, KnownOneLHS); - ComputeMaskedBits(RHS, TypeMask, KnownZeroRHS, KnownOneRHS); + ComputeMaskedBits(LHS, KnownZeroLHS, KnownOneLHS); + ComputeMaskedBits(RHS, KnownZeroRHS, KnownOneRHS); if (KnownZeroLHS == KnownZeroRHS && KnownOneLHS == KnownOneRHS) { APInt KnownBits = KnownZeroLHS | KnownOneLHS; @@ -911,8 +909,7 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) { ICI->isEquality() && (Op1C->isZero() || Op1C->getValue().isPowerOf2())){ unsigned BitWidth = Op1C->getType()->getBitWidth(); APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - APInt TypeMask(APInt::getAllOnesValue(BitWidth)); - ComputeMaskedBits(Op0, TypeMask, KnownZero, KnownOne); + ComputeMaskedBits(Op0, KnownZero, KnownOne); APInt KnownZeroMask(~KnownZero); if (KnownZeroMask.isPowerOf2()) { diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 5308992..ab2987f 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1028,9 +1028,8 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, // of the high bits truncated out of x are known. unsigned DstBits = LHSI->getType()->getPrimitiveSizeInBits(), SrcBits = LHSI->getOperand(0)->getType()->getPrimitiveSizeInBits(); - APInt Mask(APInt::getHighBitsSet(SrcBits, SrcBits-DstBits)); APInt KnownZero(SrcBits, 0), KnownOne(SrcBits, 0); - ComputeMaskedBits(LHSI->getOperand(0), Mask, KnownZero, KnownOne); + ComputeMaskedBits(LHSI->getOperand(0), KnownZero, KnownOne); // If all the high bits are known, we can do this xform. if ((KnownZero|KnownOne).countLeadingOnes() >= SrcBits-DstBits) { diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 7446a51..b2f2e24 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -22,6 +22,72 @@ using namespace llvm; STATISTIC(NumDeadStore, "Number of dead stores eliminated"); +// Try to kill a dead alloca by walking through its uses until we see some use +// that could escape. This is a conservative analysis which tries to handle +// GEPs, bitcasts, stores, and no-op intrinsics. These tend to be the things +// left after inlining and SROA finish chewing on an alloca.
+static Instruction *removeDeadAlloca(InstCombiner &IC, AllocaInst &AI) { + SmallVector<Instruction *, 4> Worklist, DeadStores; + Worklist.push_back(&AI); + do { + Instruction *PI = Worklist.pop_back_val(); + for (Value::use_iterator UI = PI->use_begin(), UE = PI->use_end(); + UI != UE; ++UI) { + Instruction *I = cast<Instruction>(*UI); + switch (I->getOpcode()) { + default: + // Give up the moment we see something we can't handle. + return 0; + + case Instruction::GetElementPtr: + case Instruction::BitCast: + Worklist.push_back(I); + continue; + + case Instruction::Call: + // We can handle a limited subset of calls to no-op intrinsics. + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + case Intrinsic::dbg_declare: + case Intrinsic::dbg_value: + case Intrinsic::invariant_start: + case Intrinsic::invariant_end: + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + continue; + default: + return 0; + } + } + // Reject everything else. + return 0; + + case Instruction::Store: { + // Stores into the alloca are only live if the alloca is live. + StoreInst *SI = cast<StoreInst>(I); + // We can eliminate atomic stores, but not volatile. + if (SI->isVolatile()) + return 0; + // The store is only trivially safe if the pointer is the destination + // as opposed to the value. We're conservative here and don't check for + // the case where we store the address of a dead alloca into a dead + // alloca. + if (SI->getPointerOperand() != PI) + return 0; + DeadStores.push_back(I); + continue; + } + } + } + } while (!Worklist.empty()); + + // The alloca is dead. Kill off all the stores to it, and then replace it + // with undef. + while (!DeadStores.empty()) + IC.EraseInstFromFunction(*DeadStores.pop_back_val()); + return IC.ReplaceInstUsesWith(AI, UndefValue::get(AI.getType())); +} + Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { // Ensure that the alloca array size argument has type intptr_t, so that // any casting is exposed early. @@ -81,7 +147,10 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { AI.setAlignment(TD->getPrefTypeAlignment(AI.getAllocatedType())); } - return 0; + // Try to aggressively remove allocas which are only used for GEPs, lifetime + // markers, and stores. This happens when SROA iteratively promotes stores + // out of the alloca, and we need to clean up after it. + return removeDeadAlloca(*this, AI); } diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 61a8e5b..125c74a 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -142,7 +142,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, Instruction *I = dyn_cast<Instruction>(V); if (!I) { - ComputeMaskedBits(V, DemandedMask, KnownZero, KnownOne, Depth); + ComputeMaskedBits(V, KnownZero, KnownOne, Depth); return 0; // Only analyze instructions. } @@ -156,10 +156,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // this instruction has a simpler value in that context. if (I->getOpcode() == Instruction::And) { // If either the LHS or the RHS are Zero, the result is zero.
- ComputeMaskedBits(I->getOperand(1), DemandedMask, - RHSKnownZero, RHSKnownOne, Depth+1); - ComputeMaskedBits(I->getOperand(0), DemandedMask & ~RHSKnownZero, - LHSKnownZero, LHSKnownOne, Depth+1); + ComputeMaskedBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1); + ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1); // If all of the demanded bits are known 1 on one side, return the other. // These bits cannot contribute to the result of the 'and' in this @@ -180,10 +178,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // only bits from X or Y are demanded. // If either the LHS or the RHS are One, the result is One. - ComputeMaskedBits(I->getOperand(1), DemandedMask, - RHSKnownZero, RHSKnownOne, Depth+1); - ComputeMaskedBits(I->getOperand(0), DemandedMask & ~RHSKnownOne, - LHSKnownZero, LHSKnownOne, Depth+1); + ComputeMaskedBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1); + ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1); // If all of the demanded bits are known zero on one side, return the // other. These bits cannot contribute to the result of the 'or' in this @@ -206,7 +202,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, } // Compute the KnownZero/KnownOne bits to simplify things downstream. - ComputeMaskedBits(I, DemandedMask, KnownZero, KnownOne, Depth); + ComputeMaskedBits(I, KnownZero, KnownOne, Depth); return 0; } @@ -219,7 +215,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, switch (I->getOpcode()) { default: - ComputeMaskedBits(I, DemandedMask, KnownZero, KnownOne, Depth); + ComputeMaskedBits(I, KnownZero, KnownOne, Depth); break; case Instruction::And: // If either the LHS or the RHS are Zero, the result is zero. @@ -570,7 +566,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // Otherwise just hand the sub off to ComputeMaskedBits to fill in // the known zeros and ones. - ComputeMaskedBits(V, DemandedMask, KnownZero, KnownOne, Depth); + ComputeMaskedBits(V, KnownZero, KnownOne, Depth); // Turn this into a xor if LHS is 2^n-1 and the remaining bits are known // zero. @@ -729,10 +725,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, // The sign bit is the LHS's sign bit, except when the result of the // remainder is zero. if (DemandedMask.isNegative() && KnownZero.isNonNegative()) { - APInt Mask2 = APInt::getSignBit(BitWidth); APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0); - ComputeMaskedBits(I->getOperand(0), Mask2, LHSKnownZero, LHSKnownOne, - Depth+1); + ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1); // If it's known zero, our sign bit is also zero. if (LHSKnownZero.isNegative()) KnownZero |= LHSKnownZero; @@ -795,7 +789,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, return 0; } } - ComputeMaskedBits(V, DemandedMask, KnownZero, KnownOne, Depth); + ComputeMaskedBits(V, KnownZero, KnownOne, Depth); break; } diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 349ba83..066b2ec 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -916,8 +916,9 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // Handle gep(bitcast x) and gep(gep x, 0, 0, 0). 
Value *StrippedPtr = PtrOp->stripPointerCasts(); PointerType *StrippedPtrTy = dyn_cast<PointerType>(StrippedPtr->getType()); - // We do not handle pointer-vector geps here - if (!StrippedPtr) + + // We do not handle pointer-vector geps here. + if (!StrippedPtrTy) return 0; if (StrippedPtr != PtrOp && diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 85fda30..8bb337e 100644 --- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -22,16 +22,20 @@ #define DEBUG_TYPE "tsan" #include "FunctionBlackList.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Intrinsics.h" #include "llvm/Function.h" +#include "llvm/LLVMContext.h" +#include "llvm/Metadata.h" #include "llvm/Module.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/IRBuilder.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetData.h" #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/ModuleUtils.h" @@ -42,16 +46,36 @@ using namespace llvm; static cl::opt<std::string> ClBlackListFile("tsan-blacklist", cl::desc("Blacklist file"), cl::Hidden); +static cl::opt<bool> ClPrintStats("tsan-print-stats", + cl::desc("Print ThreadSanitizer instrumentation stats"), cl::Hidden); + namespace { + +// Stats counters for ThreadSanitizer instrumentation. +struct ThreadSanitizerStats { + size_t NumInstrumentedReads; + size_t NumInstrumentedWrites; + size_t NumOmittedReadsBeforeWrite; + size_t NumAccessesWithBadSize; + size_t NumInstrumentedVtableWrites; + size_t NumOmittedReadsFromConstantGlobals; + size_t NumOmittedReadsFromVtable; +}; + /// ThreadSanitizer: instrument the code in module to find races. struct ThreadSanitizer : public FunctionPass { ThreadSanitizer(); bool runOnFunction(Function &F); bool doInitialization(Module &M); + bool doFinalization(Module &M); bool instrumentLoadOrStore(Instruction *I); static char ID; // Pass identification, replacement for typeid. private: + void choseInstructionsToInstrument(SmallVectorImpl<Instruction*> &Local, + SmallVectorImpl<Instruction*> &All); + bool addrPointsToConstantData(Value *Addr); + TargetData *TD; OwningPtr<FunctionBlackList> BL; // Callbacks to run-time library are computed in doInitialization. @@ -61,6 +85,10 @@ struct ThreadSanitizer : public FunctionPass { static const size_t kNumberOfAccessSizes = 5; Value *TsanRead[kNumberOfAccessSizes]; Value *TsanWrite[kNumberOfAccessSizes]; + Value *TsanVptrUpdate; + + // Stats are modified w/o synchronization. + ThreadSanitizerStats stats; }; } // namespace @@ -83,6 +111,7 @@ bool ThreadSanitizer::doInitialization(Module &M) { if (!TD) return false; BL.reset(new FunctionBlackList(ClBlackListFile)); + memset(&stats, 0, sizeof(stats)); // Always insert a call to __tsan_init into the module's CTORs. 
IRBuilder<> IRB(M.getContext()); @@ -105,14 +134,103 @@ bool ThreadSanitizer::doInitialization(Module &M) { TsanWrite[i] = M.getOrInsertFunction(WriteName, IRB.getVoidTy(), IRB.getInt8PtrTy(), NULL); } + TsanVptrUpdate = M.getOrInsertFunction("__tsan_vptr_update", IRB.getVoidTy(), + IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), + NULL); + return true; +} + +bool ThreadSanitizer::doFinalization(Module &M) { + if (ClPrintStats) { + errs() << "ThreadSanitizerStats " << M.getModuleIdentifier() + << ": wr " << stats.NumInstrumentedWrites + << "; rd " << stats.NumInstrumentedReads + << "; vt " << stats.NumInstrumentedVtableWrites + << "; bs " << stats.NumAccessesWithBadSize + << "; rbw " << stats.NumOmittedReadsBeforeWrite + << "; rcg " << stats.NumOmittedReadsFromConstantGlobals + << "; rvt " << stats.NumOmittedReadsFromVtable + << "\n"; + } return true; } +static bool isVtableAccess(Instruction *I) { + if (MDNode *Tag = I->getMetadata(LLVMContext::MD_tbaa)) { + if (Tag->getNumOperands() < 1) return false; + if (MDString *Tag1 = dyn_cast<MDString>(Tag->getOperand(0))) { + if (Tag1->getString() == "vtable pointer") return true; + } + } + return false; +} + +bool ThreadSanitizer::addrPointsToConstantData(Value *Addr) { + // If this is a GEP, just analyze its pointer operand. + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Addr)) + Addr = GEP->getPointerOperand(); + + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) { + if (GV->isConstant()) { + // Reads from constant globals can not race with any writes. + stats.NumOmittedReadsFromConstantGlobals++; + return true; + } + } else if(LoadInst *L = dyn_cast<LoadInst>(Addr)) { + if (isVtableAccess(L)) { + // Reads from a vtable pointer can not race with any writes. + stats.NumOmittedReadsFromVtable++; + return true; + } + } + return false; +} + +// Instrumenting some of the accesses may be proven redundant. +// Currently handled: +// - read-before-write (within same BB, no calls between) +// +// We do not handle some of the patterns that should not survive +// after the classic compiler optimizations. +// E.g. two reads from the same temp should be eliminated by CSE, +// two writes should be eliminated by DSE, etc. +// +// 'Local' is a vector of insns within the same BB (no calls between). +// 'All' is a vector of insns that will be instrumented. +void ThreadSanitizer::choseInstructionsToInstrument( + SmallVectorImpl<Instruction*> &Local, + SmallVectorImpl<Instruction*> &All) { + SmallSet<Value*, 8> WriteTargets; + // Iterate from the end. + for (SmallVectorImpl<Instruction*>::reverse_iterator It = Local.rbegin(), + E = Local.rend(); It != E; ++It) { + Instruction *I = *It; + if (StoreInst *Store = dyn_cast<StoreInst>(I)) { + WriteTargets.insert(Store->getPointerOperand()); + } else { + LoadInst *Load = cast<LoadInst>(I); + Value *Addr = Load->getPointerOperand(); + if (WriteTargets.count(Addr)) { + // We will write to this temp, so no reason to analyze the read. + stats.NumOmittedReadsBeforeWrite++; + continue; + } + if (addrPointsToConstantData(Addr)) { + // Addr points to some constant data -- it can not race with any writes. 
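Before the function resumes below, the filtering idea is worth seeing in isolation: choseInstructionsToInstrument scans a block's accesses from the end, remembering written addresses, so a read whose address is written later in the same block (with no call in between) can be skipped. A standalone sketch with plain containers instead of LLVM types; Access is a hypothetical stand-in for a load or store:

#include <set>
#include <vector>

struct Access { void *Addr; bool IsWrite; };

std::vector<Access> chooseToInstrument(const std::vector<Access> &Local) {
  std::set<void *> WriteTargets;   // addresses written later in the block
  std::vector<Access> All;         // accesses that still need instrumentation
  for (std::vector<Access>::const_reverse_iterator It = Local.rbegin(),
       E = Local.rend(); It != E; ++It) {
    if (It->IsWrite)
      WriteTargets.insert(It->Addr);
    else if (WriteTargets.count(It->Addr))
      continue; // read-before-write: the racing write will be reported anyway
    All.push_back(*It);
  }
  return All;
}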
+ continue; + } + } + All.push_back(I); + } + Local.clear(); +} + bool ThreadSanitizer::runOnFunction(Function &F) { if (!TD) return false; if (BL->isIn(F)) return false; SmallVector<Instruction*, 8> RetVec; - SmallVector<Instruction*, 8> LoadsAndStores; + SmallVector<Instruction*, 8> AllLoadsAndStores; + SmallVector<Instruction*, 8> LocalLoadsAndStores; bool Res = false; bool HasCalls = false; @@ -123,12 +241,15 @@ bool ThreadSanitizer::runOnFunction(Function &F) { for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE; ++BI) { if (isa<LoadInst>(BI) || isa<StoreInst>(BI)) - LoadsAndStores.push_back(BI); + LocalLoadsAndStores.push_back(BI); else if (isa<ReturnInst>(BI)) RetVec.push_back(BI); - else if (isa<CallInst>(BI) || isa<InvokeInst>(BI)) + else if (isa<CallInst>(BI) || isa<InvokeInst>(BI)) { HasCalls = true; + choseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores); + } } + choseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores); } // We have collected all loads and stores. @@ -136,8 +257,8 @@ bool ThreadSanitizer::runOnFunction(Function &F) { // (e.g. variables that do not escape, etc). // Instrument memory accesses. - for (size_t i = 0, n = LoadsAndStores.size(); i < n; ++i) { - Res |= instrumentLoadOrStore(LoadsAndStores[i]); + for (size_t i = 0, n = AllLoadsAndStores.size(); i < n; ++i) { + Res |= instrumentLoadOrStore(AllLoadsAndStores[i]); } // Instrument function entry/exit points if there were instrumented accesses. @@ -151,6 +272,7 @@ bool ThreadSanitizer::runOnFunction(Function &F) { IRBuilder<> IRBRet(RetVec[i]); IRBRet.CreateCall(TsanFuncExit); } + Res = true; } return Res; } @@ -167,12 +289,23 @@ bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I) { uint32_t TypeSize = TD->getTypeStoreSizeInBits(OrigTy); if (TypeSize != 8 && TypeSize != 16 && TypeSize != 32 && TypeSize != 64 && TypeSize != 128) { + stats.NumAccessesWithBadSize++; // Ignore all unusual sizes. return false; } + if (IsWrite && isVtableAccess(I)) { + Value *StoredValue = cast<StoreInst>(I)->getValueOperand(); + IRB.CreateCall2(TsanVptrUpdate, + IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), + IRB.CreatePointerCast(StoredValue, IRB.getInt8PtrTy())); + stats.NumInstrumentedVtableWrites++; + return true; + } size_t Idx = CountTrailingZeros_32(TypeSize / 8); assert(Idx < kNumberOfAccessSizes); Value *OnAccessFunc = IsWrite ? TsanWrite[Idx] : TsanRead[Idx]; IRB.CreateCall(OnAccessFunc, IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy())); + if (IsWrite) stats.NumInstrumentedWrites++; + else stats.NumInstrumentedReads++; return true; } diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp index 020ec57..9a5423f 100644 --- a/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -567,8 +567,8 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { // happens. WeakVH IterHandle(CurInstIterator); - ReplaceAndSimplifyAllUses(CI, RetVal, TLI ? TLI->getTargetData() : 0, - TLInfo, ModifiedDT ? 0 : DT); + replaceAndRecursivelySimplify(CI, RetVal, TLI ? TLI->getTargetData() : 0, + TLInfo, ModifiedDT ? 0 : DT); // If the iterator instruction was recursively deleted, start over at the // start of the block. diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index ac80c48..fb733ad 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -1974,109 +1974,119 @@ unsigned GVN::replaceAllDominatedUsesWith(Value *From, Value *To, /// dominated by 'Root'. 
Exploit this, for example by replacing 'LHS' with /// 'RHS' everywhere in the scope. Returns whether a change was made. bool GVN::propagateEquality(Value *LHS, Value *RHS, BasicBlock *Root) { - if (LHS == RHS) return false; - assert(LHS->getType() == RHS->getType() && "Equal but types differ!"); + SmallVector<std::pair<Value*, Value*>, 4> Worklist; + Worklist.push_back(std::make_pair(LHS, RHS)); + bool Changed = false; - // Don't try to propagate equalities between constants. - if (isa<Constant>(LHS) && isa<Constant>(RHS)) - return false; + while (!Worklist.empty()) { + std::pair<Value*, Value*> Item = Worklist.pop_back_val(); + LHS = Item.first; RHS = Item.second; + + if (LHS == RHS) continue; + assert(LHS->getType() == RHS->getType() && "Equality but unequal types!"); + + // Don't try to propagate equalities between constants. + if (isa<Constant>(LHS) && isa<Constant>(RHS)) continue; - // Prefer a constant on the right-hand side, or an Argument if no constants. - if (isa<Constant>(LHS) || (isa<Argument>(LHS) && !isa<Constant>(RHS))) - std::swap(LHS, RHS); - assert((isa<Argument>(LHS) || isa<Instruction>(LHS)) && "Unexpected value!"); - - // If there is no obvious reason to prefer the left-hand side over the right- - // hand side, ensure the longest lived term is on the right-hand side, so the - // shortest lived term will be replaced by the longest lived. This tends to - // expose more simplifications. - uint32_t LVN = VN.lookup_or_add(LHS); - if ((isa<Argument>(LHS) && isa<Argument>(RHS)) || - (isa<Instruction>(LHS) && isa<Instruction>(RHS))) { - // Move the 'oldest' value to the right-hand side, using the value number as - // a proxy for age. - uint32_t RVN = VN.lookup_or_add(RHS); - if (LVN < RVN) { + // Prefer a constant on the right-hand side, or an Argument if no constants. + if (isa<Constant>(LHS) || (isa<Argument>(LHS) && !isa<Constant>(RHS))) std::swap(LHS, RHS); - LVN = RVN; + assert((isa<Argument>(LHS) || isa<Instruction>(LHS)) && "Unexpected value!"); + + // If there is no obvious reason to prefer the left-hand side over the right- + // hand side, ensure the longest lived term is on the right-hand side, so the + // shortest lived term will be replaced by the longest lived. This tends to + // expose more simplifications. + uint32_t LVN = VN.lookup_or_add(LHS); + if ((isa<Argument>(LHS) && isa<Argument>(RHS)) || + (isa<Instruction>(LHS) && isa<Instruction>(RHS))) { + // Move the 'oldest' value to the right-hand side, using the value number as + // a proxy for age. + uint32_t RVN = VN.lookup_or_add(RHS); + if (LVN < RVN) { + std::swap(LHS, RHS); + LVN = RVN; + } + } + assert((!isa<Instruction>(RHS) || + DT->properlyDominates(cast<Instruction>(RHS)->getParent(), Root)) && + "Instruction doesn't dominate scope!"); + + // If value numbering later deduces that an instruction in the scope is equal + // to 'LHS' then ensure it will be turned into 'RHS'. + addToLeaderTable(LVN, RHS, Root); + + // Replace all occurrences of 'LHS' with 'RHS' everywhere in the scope. As + // LHS always has at least one use that is not dominated by Root, this will + // never do anything if LHS has only one use. + if (!LHS->hasOneUse()) { + unsigned NumReplacements = replaceAllDominatedUsesWith(LHS, RHS, Root); + Changed |= NumReplacements > 0; + NumGVNEqProp += NumReplacements; } - } - - // If value numbering later deduces that an instruction in the scope is equal - // to 'LHS' then ensure it will be turned into 'RHS'. 
- addToLeaderTable(LVN, RHS, Root); - // Replace all occurrences of 'LHS' with 'RHS' everywhere in the scope. As - // LHS always has at least one use that is not dominated by Root, this will - // never do anything if LHS has only one use. - bool Changed = false; - if (!LHS->hasOneUse()) { - unsigned NumReplacements = replaceAllDominatedUsesWith(LHS, RHS, Root); - Changed |= NumReplacements > 0; - NumGVNEqProp += NumReplacements; - } - - // Now try to deduce additional equalities from this one. For example, if the - // known equality was "(A != B)" == "false" then it follows that A and B are - // equal in the scope. Only boolean equalities with an explicit true or false - // RHS are currently supported. - if (!RHS->getType()->isIntegerTy(1)) - // Not a boolean equality - bail out. - return Changed; - ConstantInt *CI = dyn_cast<ConstantInt>(RHS); - if (!CI) - // RHS neither 'true' nor 'false' - bail out. - return Changed; - // Whether RHS equals 'true'. Otherwise it equals 'false'. - bool isKnownTrue = CI->isAllOnesValue(); - bool isKnownFalse = !isKnownTrue; - - // If "A && B" is known true then both A and B are known true. If "A || B" - // is known false then both A and B are known false. - Value *A, *B; - if ((isKnownTrue && match(LHS, m_And(m_Value(A), m_Value(B)))) || - (isKnownFalse && match(LHS, m_Or(m_Value(A), m_Value(B))))) { - Changed |= propagateEquality(A, RHS, Root); - Changed |= propagateEquality(B, RHS, Root); - return Changed; - } + // Now try to deduce additional equalities from this one. For example, if the + // known equality was "(A != B)" == "false" then it follows that A and B are + // equal in the scope. Only boolean equalities with an explicit true or false + // RHS are currently supported. + if (!RHS->getType()->isIntegerTy(1)) + // Not a boolean equality - bail out. + continue; + ConstantInt *CI = dyn_cast<ConstantInt>(RHS); + if (!CI) + // RHS neither 'true' nor 'false' - bail out. + continue; + // Whether RHS equals 'true'. Otherwise it equals 'false'. + bool isKnownTrue = CI->isAllOnesValue(); + bool isKnownFalse = !isKnownTrue; + + // If "A && B" is known true then both A and B are known true. If "A || B" + // is known false then both A and B are known false. + Value *A, *B; + if ((isKnownTrue && match(LHS, m_And(m_Value(A), m_Value(B)))) || + (isKnownFalse && match(LHS, m_Or(m_Value(A), m_Value(B))))) { + Worklist.push_back(std::make_pair(A, RHS)); + Worklist.push_back(std::make_pair(B, RHS)); + continue; + } - // If we are propagating an equality like "(A == B)" == "true" then also - // propagate the equality A == B. When propagating a comparison such as - // "(A >= B)" == "true", replace all instances of "A < B" with "false". - if (ICmpInst *Cmp = dyn_cast<ICmpInst>(LHS)) { - Value *Op0 = Cmp->getOperand(0), *Op1 = Cmp->getOperand(1); - - // If "A == B" is known true, or "A != B" is known false, then replace - // A with B everywhere in the scope. - if ((isKnownTrue && Cmp->getPredicate() == CmpInst::ICMP_EQ) || - (isKnownFalse && Cmp->getPredicate() == CmpInst::ICMP_NE)) - Changed |= propagateEquality(Op0, Op1, Root); - - // If "A >= B" is known true, replace "A < B" with false everywhere. - CmpInst::Predicate NotPred = Cmp->getInversePredicate(); - Constant *NotVal = ConstantInt::get(Cmp->getType(), isKnownFalse); - // Since we don't have the instruction "A < B" immediately to hand, work out - // the value number that it would have and use that to find an appropriate - // instruction (if any). 
- uint32_t NextNum = VN.getNextUnusedValueNumber(); - uint32_t Num = VN.lookup_or_add_cmp(Cmp->getOpcode(), NotPred, Op0, Op1); - // If the number we were assigned was brand new then there is no point in - // looking for an instruction realizing it: there cannot be one! - if (Num < NextNum) { - Value *NotCmp = findLeader(Root, Num); - if (NotCmp && isa<Instruction>(NotCmp)) { - unsigned NumReplacements = - replaceAllDominatedUsesWith(NotCmp, NotVal, Root); - Changed |= NumReplacements > 0; - NumGVNEqProp += NumReplacements; + // If we are propagating an equality like "(A == B)" == "true" then also + // propagate the equality A == B. When propagating a comparison such as + // "(A >= B)" == "true", replace all instances of "A < B" with "false". + if (ICmpInst *Cmp = dyn_cast<ICmpInst>(LHS)) { + Value *Op0 = Cmp->getOperand(0), *Op1 = Cmp->getOperand(1); + + // If "A == B" is known true, or "A != B" is known false, then replace + // A with B everywhere in the scope. + if ((isKnownTrue && Cmp->getPredicate() == CmpInst::ICMP_EQ) || + (isKnownFalse && Cmp->getPredicate() == CmpInst::ICMP_NE)) + Worklist.push_back(std::make_pair(Op0, Op1)); + + // If "A >= B" is known true, replace "A < B" with false everywhere. + CmpInst::Predicate NotPred = Cmp->getInversePredicate(); + Constant *NotVal = ConstantInt::get(Cmp->getType(), isKnownFalse); + // Since we don't have the instruction "A < B" immediately to hand, work out + // the value number that it would have and use that to find an appropriate + // instruction (if any). + uint32_t NextNum = VN.getNextUnusedValueNumber(); + uint32_t Num = VN.lookup_or_add_cmp(Cmp->getOpcode(), NotPred, Op0, Op1); + // If the number we were assigned was brand new then there is no point in + // looking for an instruction realizing it: there cannot be one! + if (Num < NextNum) { + Value *NotCmp = findLeader(Root, Num); + if (NotCmp && isa<Instruction>(NotCmp)) { + unsigned NumReplacements = + replaceAllDominatedUsesWith(NotCmp, NotVal, Root); + Changed |= NumReplacements > 0; + NumGVNEqProp += NumReplacements; + } } - } - // Ensure that any instruction in scope that gets the "A < B" value number - // is replaced with false. - addToLeaderTable(Num, NotVal, Root); + // Ensure that any instruction in scope that gets the "A < B" value number + // is replaced with false. + addToLeaderTable(Num, NotVal, Root); - return Changed; + continue; + } } return Changed; @@ -2325,7 +2335,14 @@ bool GVN::performPRE(Function &F) { CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() || isa<DbgInfoIntrinsic>(CurInst)) continue; - + + // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from + // sinking the compare again, and it would force the code generator to + // move the i1 from processor flags or predicate registers into a general + // purpose register. + if (isa<CmpInst>(CurInst)) + continue; + // We don't currently value number ANY inline asm calls. 
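The propagateEquality rewrite above is a mechanical recursion-to-worklist conversion: each recursive call becomes a push, each early return becomes a continue, and Changed accumulates across iterations, so deep chains of derived equalities no longer grow the call stack. The control-flow skeleton, reduced to a sketch with a hypothetical Value type:

#include <utility>
#include <vector>

struct Value;

bool propagateEquality(Value *LHS, Value *RHS) {
  std::vector<std::pair<Value*, Value*> > Worklist;
  Worklist.push_back(std::make_pair(LHS, RHS));
  bool Changed = false;
  while (!Worklist.empty()) {
    std::pair<Value*, Value*> Item = Worklist.back();
    Worklist.pop_back();
    (void)Item; // process Item.first/Item.second exactly as the old body did;
    // where the old code recursed, push the derived pair instead:
    //   Worklist.push_back(std::make_pair(A, Item.second));
    // and where it returned early, fall through to the next item.
  }
  return Changed;
}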
if (CallInst *CallI = dyn_cast<CallInst>(CurInst)) if (CallI->isInlineAsm()) diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 490617a..a9ba657 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -33,7 +33,6 @@ #include "llvm/LLVMContext.h" #include "llvm/Type.h" #include "llvm/Analysis/Dominators.h" -#include "llvm/Analysis/IVUsers.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" @@ -50,18 +49,12 @@ #include "llvm/ADT/Statistic.h" using namespace llvm; -STATISTIC(NumRemoved , "Number of aux indvars removed"); STATISTIC(NumWidened , "Number of indvars widened"); -STATISTIC(NumInserted , "Number of canonical indvars added"); STATISTIC(NumReplaced , "Number of exit values replaced"); STATISTIC(NumLFTR , "Number of loop exit tests replaced"); STATISTIC(NumElimExt , "Number of IV sign/zero extends eliminated"); STATISTIC(NumElimIV , "Number of congruent IVs eliminated"); -static cl::opt<bool> EnableIVRewrite( - "enable-iv-rewrite", cl::Hidden, - cl::desc("Enable canonical induction variable rewriting")); - // Trip count verification can be enabled by default under NDEBUG if we // implement a strong expression equivalence checker in SCEV. Until then, we // use the verify-indvars flag, which may assert in some cases. @@ -71,7 +64,6 @@ static cl::opt<bool> VerifyIndvars( namespace { class IndVarSimplify : public LoopPass { - IVUsers *IU; LoopInfo *LI; ScalarEvolution *SE; DominatorTree *DT; @@ -82,7 +74,7 @@ namespace { public: static char ID; // Pass identification, replacement for typeid - IndVarSimplify() : LoopPass(ID), IU(0), LI(0), SE(0), DT(0), TD(0), + IndVarSimplify() : LoopPass(ID), LI(0), SE(0), DT(0), TD(0), Changed(false) { initializeIndVarSimplifyPass(*PassRegistry::getPassRegistry()); } @@ -95,13 +87,9 @@ namespace { AU.addRequired<ScalarEvolution>(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); - if (EnableIVRewrite) - AU.addRequired<IVUsers>(); AU.addPreserved<ScalarEvolution>(); AU.addPreservedID(LoopSimplifyID); AU.addPreservedID(LCSSAID); - if (EnableIVRewrite) - AU.addPreserved<IVUsers>(); AU.setPreservesCFG(); } @@ -119,8 +107,6 @@ namespace { void RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter); - void RewriteIVExpressions(Loop *L, SCEVExpander &Rewriter); - Value *LinearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, PHINode *IndVar, SCEVExpander &Rewriter); @@ -136,7 +122,6 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_DEPENDENCY(IVUsers) INITIALIZE_PASS_END(IndVarSimplify, "indvars", "Induction Variable Simplification", false, false) @@ -448,13 +433,6 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { PN->replaceAllUsesWith(Conv); RecursivelyDeleteTriviallyDeadInstructions(PN); } - - // Add a new IVUsers entry for the newly-created integer PHI. - if (IU) { - SmallPtrSet<Loop*, 16> SimplifiedLoopNests; - IU->AddUsersIfInteresting(NewPHI, SimplifiedLoopNests); - } - Changed = true; } @@ -600,124 +578,6 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { } //===----------------------------------------------------------------------===// -// Rewrite IV users based on a canonical IV. -// Only for use with -enable-iv-rewrite. 
-//===----------------------------------------------------------------------===// - -/// FIXME: It is an extremely bad idea to indvar substitute anything more -/// complex than affine induction variables. Doing so will put expensive -/// polynomial evaluations inside of the loop, and the str reduction pass -/// currently can only reduce affine polynomials. For now just disable -/// indvar subst on anything more complex than an affine addrec, unless -/// it can be expanded to a trivial value. -static bool isSafe(const SCEV *S, const Loop *L, ScalarEvolution *SE) { - // Loop-invariant values are safe. - if (SE->isLoopInvariant(S, L)) return true; - - // Affine addrecs are safe. Non-affine are not, because LSR doesn't know how - // to transform them into efficient code. - if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) - return AR->isAffine(); - - // An add is safe it all its operands are safe. - if (const SCEVCommutativeExpr *Commutative - = dyn_cast<SCEVCommutativeExpr>(S)) { - for (SCEVCommutativeExpr::op_iterator I = Commutative->op_begin(), - E = Commutative->op_end(); I != E; ++I) - if (!isSafe(*I, L, SE)) return false; - return true; - } - - // A cast is safe if its operand is. - if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S)) - return isSafe(C->getOperand(), L, SE); - - // A udiv is safe if its operands are. - if (const SCEVUDivExpr *UD = dyn_cast<SCEVUDivExpr>(S)) - return isSafe(UD->getLHS(), L, SE) && - isSafe(UD->getRHS(), L, SE); - - // SCEVUnknown is always safe. - if (isa<SCEVUnknown>(S)) - return true; - - // Nothing else is safe. - return false; -} - -void IndVarSimplify::RewriteIVExpressions(Loop *L, SCEVExpander &Rewriter) { - // Rewrite all induction variable expressions in terms of the canonical - // induction variable. - // - // If there were induction variables of other sizes or offsets, manually - // add the offsets to the primary induction variable and cast, avoiding - // the need for the code evaluation methods to insert induction variables - // of different sizes. - for (IVUsers::iterator UI = IU->begin(), E = IU->end(); UI != E; ++UI) { - Value *Op = UI->getOperandValToReplace(); - Type *UseTy = Op->getType(); - Instruction *User = UI->getUser(); - - // Compute the final addrec to expand into code. - const SCEV *AR = IU->getReplacementExpr(*UI); - - // Evaluate the expression out of the loop, if possible. - if (!L->contains(UI->getUser())) { - const SCEV *ExitVal = SE->getSCEVAtScope(AR, L->getParentLoop()); - if (SE->isLoopInvariant(ExitVal, L)) - AR = ExitVal; - } - - // FIXME: It is an extremely bad idea to indvar substitute anything more - // complex than affine induction variables. Doing so will put expensive - // polynomial evaluations inside of the loop, and the str reduction pass - // currently can only reduce affine polynomials. For now just disable - // indvar subst on anything more complex than an affine addrec, unless - // it can be expanded to a trivial value. - if (!isSafe(AR, L, SE)) - continue; - - // Determine the insertion point for this user. By default, insert - // immediately before the user. The SCEVExpander class will automatically - // hoist loop invariants out of the loop. For PHI nodes, there may be - // multiple uses, so compute the nearest common dominator for the - // incoming blocks. - Instruction *InsertPt = getInsertPointForUses(User, Op, DT); - - // Now expand it into actual Instructions and patch it into place. 
- Value *NewVal = Rewriter.expandCodeFor(AR, UseTy, InsertPt); - - DEBUG(dbgs() << "INDVARS: Rewrote IV '" << *AR << "' " << *Op << '\n' - << " into = " << *NewVal << "\n"); - - if (!isValidRewrite(Op, NewVal)) { - DeadInsts.push_back(NewVal); - continue; - } - // Inform ScalarEvolution that this value is changing. The change doesn't - // affect its value, but it does potentially affect which use lists the - // value will be on after the replacement, which affects ScalarEvolution's - // ability to walk use lists and drop dangling pointers when a value is - // deleted. - SE->forgetValue(User); - - // Patch the new value into place. - if (Op->hasName()) - NewVal->takeName(Op); - if (Instruction *NewValI = dyn_cast<Instruction>(NewVal)) - NewValI->setDebugLoc(User->getDebugLoc()); - User->replaceUsesOfWith(Op, NewVal); - UI->setOperandValToReplace(NewVal); - - ++NumRemoved; - Changed = true; - - // The old value may be dead now. - DeadInsts.push_back(Op); - } -} - -//===----------------------------------------------------------------------===// // IV Widening - Extend the width of an IV to cover its widest uses. //===----------------------------------------------------------------------===// @@ -1262,9 +1122,6 @@ static bool isHighCostExpansion(const SCEV *S, BranchInst *BI, } } - if (EnableIVRewrite) - return false; - // Recurse past add expressions, which commonly occur in the // BackedgeTakenCount. They may already exist in program code, and if not, // they are not too expensive rematerialize. @@ -1321,36 +1178,6 @@ static bool canExpandBackedgeTakenCount(Loop *L, ScalarEvolution *SE) { return true; } -/// getBackedgeIVType - Get the widest type used by the loop test after peeking -/// through Truncs. -/// -/// TODO: Unnecessary when ForceLFTR is removed. -static Type *getBackedgeIVType(Loop *L) { - if (!L->getExitingBlock()) - return 0; - - // Can't rewrite non-branch yet. - BranchInst *BI = dyn_cast<BranchInst>(L->getExitingBlock()->getTerminator()); - if (!BI) - return 0; - - ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition()); - if (!Cond) - return 0; - - Type *Ty = 0; - for(User::op_iterator OI = Cond->op_begin(), OE = Cond->op_end(); - OI != OE; ++OI) { - assert((!Ty || Ty == (*OI)->getType()) && "bad icmp operand types"); - TruncInst *Trunc = dyn_cast<TruncInst>(*OI); - if (!Trunc) - continue; - - return Trunc->getSrcTy(); - } - return Ty; -} - /// getLoopPhiForCounter - Return the loop header phi IFF IncV adds a loop /// invariant value to the phi. static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) { @@ -1619,8 +1446,7 @@ LinearFunctionTestReplace(Loop *L, // LFTR can ignore IV overflow and truncate to the width of // BECount. This avoids materializing the add(zext(add)) expression. - Type *CntTy = !EnableIVRewrite ? - BackedgeTakenCount->getType() : IndVar->getType(); + Type *CntTy = BackedgeTakenCount->getType(); const SCEV *IVCount = BackedgeTakenCount; @@ -1805,8 +1631,6 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { if (!L->isLoopSimplifyForm()) return false; - if (EnableIVRewrite) - IU = &getAnalysis<IVUsers>(); LI = &getAnalysis<LoopInfo>(); SE = &getAnalysis<ScalarEvolution>(); DT = &getAnalysis<DominatorTree>(); @@ -1833,10 +1657,8 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // attempt to avoid evaluating SCEVs for sign/zero extend operations until // other expressions involving loop IVs have been evaluated. This helps SCEV // set no-wrap flags before normalizing sign/zero extension. 
- if (!EnableIVRewrite) { - Rewriter.disableCanonicalMode(); - SimplifyAndExtend(L, Rewriter, LPM); - } + Rewriter.disableCanonicalMode(); + SimplifyAndExtend(L, Rewriter, LPM); // Check to see if this loop has a computable loop-invariant execution count. // If so, this means that we can compute the final value of any expressions @@ -1847,106 +1669,28 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { if (!isa<SCEVCouldNotCompute>(BackedgeTakenCount)) RewriteLoopExitValues(L, Rewriter); - // Eliminate redundant IV users. - if (EnableIVRewrite) - Changed |= simplifyIVUsers(IU, SE, &LPM, DeadInsts); - // Eliminate redundant IV cycles. - if (!EnableIVRewrite) - NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts); - - // Compute the type of the largest recurrence expression, and decide whether - // a canonical induction variable should be inserted. - Type *LargestType = 0; - bool NeedCannIV = false; - bool ExpandBECount = canExpandBackedgeTakenCount(L, SE); - if (EnableIVRewrite && ExpandBECount) { - // If we have a known trip count and a single exit block, we'll be - // rewriting the loop exit test condition below, which requires a - // canonical induction variable. - NeedCannIV = true; - Type *Ty = BackedgeTakenCount->getType(); - if (!EnableIVRewrite) { - // In this mode, SimplifyIVUsers may have already widened the IV used by - // the backedge test and inserted a Trunc on the compare's operand. Get - // the wider type to avoid creating a redundant narrow IV only used by the - // loop test. - LargestType = getBackedgeIVType(L); - } - if (!LargestType || - SE->getTypeSizeInBits(Ty) > - SE->getTypeSizeInBits(LargestType)) - LargestType = SE->getEffectiveSCEVType(Ty); - } - if (EnableIVRewrite) { - for (IVUsers::const_iterator I = IU->begin(), E = IU->end(); I != E; ++I) { - NeedCannIV = true; - Type *Ty = - SE->getEffectiveSCEVType(I->getOperandValToReplace()->getType()); - if (!LargestType || - SE->getTypeSizeInBits(Ty) > - SE->getTypeSizeInBits(LargestType)) - LargestType = Ty; - } - } - - // Now that we know the largest of the induction variable expressions - // in this loop, insert a canonical induction variable of the largest size. - PHINode *IndVar = 0; - if (NeedCannIV) { - // Check to see if the loop already has any canonical-looking induction - // variables. If any are present and wider than the planned canonical - // induction variable, temporarily remove them, so that the Rewriter - // doesn't attempt to reuse them. - SmallVector<PHINode *, 2> OldCannIVs; - while (PHINode *OldCannIV = L->getCanonicalInductionVariable()) { - if (SE->getTypeSizeInBits(OldCannIV->getType()) > - SE->getTypeSizeInBits(LargestType)) - OldCannIV->removeFromParent(); - else - break; - OldCannIVs.push_back(OldCannIV); - } - - IndVar = Rewriter.getOrInsertCanonicalInductionVariable(L, LargestType); - - ++NumInserted; - Changed = true; - DEBUG(dbgs() << "INDVARS: New CanIV: " << *IndVar << '\n'); + NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts); - // Now that the official induction variable is established, reinsert - // any old canonical-looking variables after it so that the IR remains - // consistent. They will be deleted as part of the dead-PHI deletion at - // the end of the pass. 
- while (!OldCannIVs.empty()) { - PHINode *OldCannIV = OldCannIVs.pop_back_val(); - OldCannIV->insertBefore(L->getHeader()->getFirstInsertionPt()); - } - } - else if (!EnableIVRewrite && ExpandBECount && needsLFTR(L, DT)) { - IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT, TD); - } // If we have a trip count expression, rewrite the loop's exit condition // using it. We can currently only handle loops with a single exit. - Value *NewICmp = 0; - if (ExpandBECount && IndVar) { - // Check preconditions for proper SCEVExpander operation. SCEV does not - // express SCEVExpander's dependencies, such as LoopSimplify. Instead any - // pass that uses the SCEVExpander must do it. This does not work well for - // loop passes because SCEVExpander makes assumptions about all loops, while - // LoopPassManager only forces the current loop to be simplified. - // - // FIXME: SCEV expansion has no way to bail out, so the caller must - // explicitly check any assumptions made by SCEV. Brittle. - const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(BackedgeTakenCount); - if (!AR || AR->getLoop()->getLoopPreheader()) - NewICmp = - LinearFunctionTestReplace(L, BackedgeTakenCount, IndVar, Rewriter); + if (canExpandBackedgeTakenCount(L, SE) && needsLFTR(L, DT)) { + PHINode *IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT, TD); + if (IndVar) { + // Check preconditions for proper SCEVExpander operation. SCEV does not + // express SCEVExpander's dependencies, such as LoopSimplify. Instead any + // pass that uses the SCEVExpander must do it. This does not work well for + // loop passes because SCEVExpander makes assumptions about all loops, while + // LoopPassManager only forces the current loop to be simplified. + // + // FIXME: SCEV expansion has no way to bail out, so the caller must + // explicitly check any assumptions made by SCEV. Brittle. + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(BackedgeTakenCount); + if (!AR || AR->getLoop()->getLoopPreheader()) + (void)LinearFunctionTestReplace(L, BackedgeTakenCount, IndVar, + Rewriter); + } } - // Rewrite IV-derived expressions. - if (EnableIVRewrite) - RewriteIVExpressions(L, Rewriter); - // Clear the rewriter cache, because values that are in the rewriter's cache // can be deleted in the loop below, causing the AssertingVH in the cache to // trigger. @@ -1965,16 +1709,6 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // loop may be sunk below the loop to reduce register pressure. SinkUnusedInvariants(L); - // For completeness, inform IVUsers of the IV use in the newly-created - // loop exit test instruction. - if (IU && NewICmp) { - ICmpInst *NewICmpInst = dyn_cast<ICmpInst>(NewICmp); - if (NewICmpInst) { - SmallPtrSet<Loop*, 16> SimplifiedLoopNests; - IU->AddUsersIfInteresting(cast<Instruction>(NewICmpInst->getOperand(0)), - SimplifiedLoopNests); - } - } // Clean up dead instructions. Changed |= DeleteDeadPHIs(L->getHeader()); // Check a post-condition. @@ -1984,8 +1718,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // Verify that LFTR, and any other change have not interfered with SCEV's // ability to compute trip count. 
#ifndef NDEBUG - if (!EnableIVRewrite && VerifyIndvars && - !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) { + if (VerifyIndvars && !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) { SE->forgetLoop(L); const SCEV *NewBECount = SE->getBackedgeTakenCount(L); if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) < diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 82d918e..fe4700b 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -77,11 +77,11 @@ #include <algorithm> using namespace llvm; -static cl::opt<bool> EnableNested( - "enable-lsr-nested", cl::Hidden, cl::desc("Enable LSR on nested loops")); - -static cl::opt<bool> EnableRetry( - "enable-lsr-retry", cl::Hidden, cl::desc("Enable LSR retry")); +/// MaxIVUsers is an arbitrary threshold that provides an early opportunity to +/// bail out. This threshold is far beyond the number of users that LSR can +/// conceivably solve, so it should not affect generated code, but catches the +/// worst cases before LSR burns too much compile time and stack space. +static const unsigned MaxIVUsers = 200; // Temporary flag to clean up congruent phis after LSR phi expansion. // It's currently disabled until we can determine whether it's truly useful or @@ -710,8 +710,9 @@ static bool isHighCostExpansion(const SCEV *S, Value *UVal = U->getValue(); for (Value::use_iterator UI = UVal->use_begin(), UE = UVal->use_end(); UI != UE; ++UI) { - Instruction *User = cast<Instruction>(*UI); - if (User->getOpcode() == Instruction::Mul + // If U is a constant, it may be used by a ConstantExpr. + Instruction *User = dyn_cast<Instruction>(*UI); + if (User && User->getOpcode() == Instruction::Mul && SE.isSCEVable(User->getType())) { return SE.getSCEV(User) == Mul; } @@ -824,36 +825,20 @@ void Cost::RateRegister(const SCEV *Reg, const Loop *L, ScalarEvolution &SE, DominatorTree &DT) { if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) { - if (AR->getLoop() == L) - AddRecCost += 1; /// TODO: This should be a function of the stride. - // If this is an addrec for another loop, don't second-guess its addrec phi // nodes. LSR isn't currently smart enough to reason about more than one - // loop at a time. LSR has either already run on inner loops, will not run - // on other loops, and cannot be expected to change sibling loops. If the - // AddRec exists, consider it's register free and leave it alone. Otherwise, - // do not consider this formula at all. - else if (!EnableNested || L->contains(AR->getLoop()) || - (!AR->getLoop()->contains(L) && - DT.dominates(L->getHeader(), AR->getLoop()->getHeader()))) { + // loop at a time. LSR has already run on inner loops, will not run on outer + // loops, and cannot be expected to change sibling loops. + if (AR->getLoop() != L) { + // If the AddRec exists, consider its register free and leave it alone. if (isExistingPhi(AR, SE)) return; - // For !EnableNested, never rewrite IVs in other loops. - if (!EnableNested) { - Loose(); - return; - } - // If this isn't one of the addrecs that the loop already has, it - // would require a costly new phi and add. TODO: This isn't - // precisely modeled right now. - ++NumBaseAdds; - if (!Regs.count(AR->getStart())) { - RateRegister(AR->getStart(), Regs, L, SE, DT); - if (isLoser()) - return; - } + // Otherwise, do not consider this formula at all. + Loose(); + return; } + AddRecCost += 1; /// TODO: This should be a function of the stride.
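One subtlety in the isHighCostExpansion hunk above is easy to miss: a use of a constant Value can be a ConstantExpr rather than an Instruction, so the old unconditional cast<Instruction> could assert. A small sketch of the corrected pattern, with era-appropriate headers assumed:

#include "llvm/Instruction.h"
#include "llvm/Value.h"
using namespace llvm;

void visitInstructionUsers(Value *UVal) {
  for (Value::use_iterator UI = UVal->use_begin(), UE = UVal->use_end();
       UI != UE; ++UI) {
    // dyn_cast yields null for non-Instruction users such as ConstantExpr,
    // where cast<Instruction> would have asserted.
    if (Instruction *User = dyn_cast<Instruction>(*UI)) {
      (void)User; // ... inspect the instruction here ...
    }
  }
}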
// Add the step value register, if it needs one. // TODO: The non-affine case isn't precisely modeled here. @@ -1303,10 +1288,19 @@ static bool isLegalUse(const TargetLowering::AddrMode &AM, // If we have low-level target information, ask the target if it can fold an // integer immediate on an icmp. if (AM.BaseOffs != 0) { - if (TLI) return TLI->isLegalICmpImmediate(-(uint64_t)AM.BaseOffs); - return false; + if (!TLI) + return false; + // We have one of: + // ICmpZero BaseReg + Offset => ICmp BaseReg, -Offset + // ICmpZero -1*ScaleReg + Offset => ICmp ScaleReg, Offset + // Offs is the ICmp immediate. + int64_t Offs = AM.BaseOffs; + if (AM.Scale == 0) + Offs = -(uint64_t)Offs; // The cast does the right thing with INT64_MIN. + return TLI->isLegalICmpImmediate(Offs); } + // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg return true; case LSRUse::Basic: @@ -2193,7 +2187,7 @@ void LSRInstance::CollectInterestingTypesAndFactors() { do { const SCEV *S = Worklist.pop_back_val(); if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { - if (EnableNested || AR->getLoop() == L) + if (AR->getLoop() == L) Strides.insert(AR->getStepRecurrence(SE)); Worklist.push_back(AR->getStart()); } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { @@ -2463,7 +2457,7 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, if (!isCompatibleIVType(PrevIV, NextIV)) continue; - // A phi nodes terminates a chain. + // A phi node terminates a chain. if (isa<PHINode>(UserInst) && isa<PHINode>(IVChainVec[ChainIdx].back().UserInst)) continue; @@ -2519,13 +2513,14 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, for (Value::use_iterator UseIter = IVOper->use_begin(), UseEnd = IVOper->use_end(); UseIter != UseEnd; ++UseIter) { Instruction *OtherUse = dyn_cast<Instruction>(*UseIter); + if (!OtherUse || OtherUse == UserInst) + continue; if (SE.isSCEVable(OtherUse->getType()) && !isa<SCEVUnknown>(SE.getSCEV(OtherUse)) && IU.isIVUserOrOperand(OtherUse)) { continue; } - if (OtherUse && OtherUse != UserInst) - NearUsers.insert(OtherUse); + NearUsers.insert(OtherUse); } // Since this user is part of the chain, it's no longer considered a use @@ -3986,24 +3981,29 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, if (LU.Regs.count(*I)) ReqRegs.insert(*I); - bool AnySatisfiedReqRegs = false; SmallPtrSet<const SCEV *, 16> NewRegs; Cost NewCost; -retry: for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(), E = LU.Formulae.end(); I != E; ++I) { const Formula &F = *I; // Ignore formulae which do not use any of the required registers. + bool SatisfiedReqReg = true; for (SmallSetVector<const SCEV *, 4>::const_iterator J = ReqRegs.begin(), JE = ReqRegs.end(); J != JE; ++J) { const SCEV *Reg = *J; if ((!F.ScaledReg || F.ScaledReg != Reg) && std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) == - F.BaseRegs.end()) - goto skip; + F.BaseRegs.end()) { + SatisfiedReqReg = false; + break; + } + } + if (!SatisfiedReqReg) { + // If none of the formulae satisfied the required registers, then we could + // clear ReqRegs and try again. Currently, we simply give up in this case. + continue; } - AnySatisfiedReqRegs = true; // Evaluate the cost of the current formula. If it's already worse than // the current best, prune the search at that point. 
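The comment in the isLegalUse hunk above ("The cast does the right thing with INT64_MIN") deserves unpacking: negating INT64_MIN as a signed value is undefined behavior in C++, while negating through uint64_t is well-defined modular arithmetic that wraps back to the same bit pattern. A standalone demonstration (the conversion back to int64_t assumes a two's-complement host):

#include <cstdint>
#include <cstdio>

int main() {
  int64_t Offs = INT64_MIN;
  // -Offs would be signed overflow; -(uint64_t)Offs wraps to
  // 0x8000000000000000, which converts back to INT64_MIN.
  int64_t Negated = (int64_t)(-(uint64_t)Offs);
  std::printf("%lld\n", (long long)Negated); // prints -9223372036854775808
  return 0;
}

isLegalICmpImmediate then judges the still-out-of-range immediate normally, instead of the computation itself invoking undefined behavior on the way there.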
@@ -4030,18 +4030,6 @@ retry: } Workspace.pop_back(); } - skip:; - } - - if (!EnableRetry && !AnySatisfiedReqRegs) - return; - - // If none of the formulae had all of the required registers, relax the - // constraint so that we don't exclude all formulae. - if (!AnySatisfiedReqRegs) { - assert(!ReqRegs.empty() && "Solver failed even without required registers"); - ReqRegs.clear(); - goto retry; } } @@ -4537,6 +4525,17 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) // If there's no interesting work to be done, bail early. if (IU.empty()) return; + // If there's too much analysis to be done, bail early. We won't be able to + // model the problem anyway. + unsigned NumUsers = 0; + for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) { + if (++NumUsers > MaxIVUsers) { + DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << *L + << "\n"); + return; + } + } + #ifndef NDEBUG // All dominating loops must have preheaders, or SCEVExpander may not be able // to materialize an AddRecExpr whose Start is an outer AddRecExpr. @@ -4566,7 +4565,7 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) if (IU.empty()) return; // Skip nested loops until we can model them better with formulae. - if (!EnableNested && !L->empty()) { + if (!L->empty()) { DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n"); return; } diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index 22dbfe3..09a186f 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -197,13 +197,13 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { } if (TripCount) { // Reduce unroll count to be modulo of TripCount for partial unrolling - Count = CurrentThreshold / LoopSize; + Count = Threshold / LoopSize; while (Count != 0 && TripCount%Count != 0) Count--; } else if (UnrollRuntime) { // Reduce unroll count to be a lower power-of-two value - while (Count != 0 && Size > CurrentThreshold) { + while (Count != 0 && Size > Threshold) { Count >>= 1; Size = LoopSize*Count; } diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index 053eb0c..00ecc74 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -64,63 +64,63 @@ STATISTIC(TotalInsts, "Total number of instructions analyzed"); static cl::opt<unsigned> Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"), cl::init(100), cl::Hidden); - + namespace { - + class LUAnalysisCache { typedef DenseMap<const SwitchInst*, SmallPtrSet<const Value *, 8> > UnswitchedValsMap; - + typedef UnswitchedValsMap::iterator UnswitchedValsIt; - + struct LoopProperties { unsigned CanBeUnswitchedCount; unsigned SizeEstimation; UnswitchedValsMap UnswitchedVals; }; - - // Here we use std::map instead of DenseMap, since we need to keep valid + + // Here we use std::map instead of DenseMap, since we need to keep valid // LoopProperties pointer for current loop for better performance. typedef std::map<const Loop*, LoopProperties> LoopPropsMap; typedef LoopPropsMap::iterator LoopPropsMapIt; - + LoopPropsMap LoopsProperties; UnswitchedValsMap* CurLoopInstructions; LoopProperties* CurrentLoopProperties; - + // Max size of code we can produce on remained iterations. unsigned MaxSize; - + public: - + LUAnalysisCache() : CurLoopInstructions(NULL), CurrentLoopProperties(NULL), MaxSize(Threshold) {} - + // Analyze loop. 
Check its size and determine whether it is possible to unswitch // it. Returns true if we can unswitch this loop. bool countLoop(const Loop* L); - + // Clean all data related to the given loop. void forgetLoop(const Loop* L); - + // Mark case value as unswitched. // Since SI instruction can be partly unswitched, in order to avoid // extra unswitching in cloned loops keep track of all unswitched values. void setUnswitched(const SwitchInst* SI, const Value* V); - + // Check whether this case value was unswitched before or not. bool isUnswitched(const SwitchInst* SI, const Value* V); - + // Clone all loop-unswitch related loop properties. // Redistribute unswitching quotas. // Note that new loop data is stored inside the VMap. void cloneData(const Loop* NewLoop, const Loop* OldLoop, const ValueToValueMapTy& VMap); }; - + class LoopUnswitch : public LoopPass { LoopInfo *LI; // Loop information LPPassManager *LPM; @@ -130,7 +130,7 @@ namespace { std::vector<Loop*> LoopProcessWorklist; LUAnalysisCache BranchesInfo; - + bool OptimizeForSize; bool redoLoop; @@ -138,9 +138,9 @@ namespace { DominatorTree *DT; BasicBlock *loopHeader; BasicBlock *loopPreheader; - + // LoopBlocks contains all of the basic blocks of the loop, including the - // preheader of the loop, the body of the loop, and the exit blocks of the + // preheader of the loop, the body of the loop, and the exit blocks of the // loop, in that order. std::vector<BasicBlock*> LoopBlocks; // NewBlocks contains cloned copies of the basic blocks from LoopBlocks. @@ -148,8 +148,8 @@ public: static char ID; // Pass ID, replacement for typeid - explicit LoopUnswitch(bool Os = false) : - LoopPass(ID), OptimizeForSize(Os), redoLoop(false), + explicit LoopUnswitch(bool Os = false) : + LoopPass(ID), OptimizeForSize(Os), redoLoop(false), currentLoop(NULL), DT(NULL), loopHeader(NULL), loopPreheader(NULL) { initializeLoopUnswitchPass(*PassRegistry::getPassRegistry()); @@ -186,7 +186,7 @@ namespace { if (I != LoopProcessWorklist.end()) LoopProcessWorklist.erase(I); } - + void initLoopData() { loopHeader = currentLoop->getHeader(); loopPreheader = currentLoop->getLoopPreheader(); @@ -205,7 +205,7 @@ namespace { Constant *Val, bool isEqual); void EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, - BasicBlock *TrueDest, + BasicBlock *TrueDest, BasicBlock *FalseDest, Instruction *InsertPt); @@ -222,12 +222,12 @@ namespace { // Analyze loop. Check its size and determine whether it is possible to unswitch // it. Returns true if we can unswitch this loop. bool LUAnalysisCache::countLoop(const Loop* L) { - + std::pair<LoopPropsMapIt, bool> InsertRes = LoopsProperties.insert(std::make_pair(L, LoopProperties())); - + LoopProperties& Props = InsertRes.first->second; - + if (InsertRes.second) { // New loop. @@ -235,39 +235,39 @@ bool LUAnalysisCache::countLoop(const Loop* L) { // expansion, and the number of basic blocks, to avoid loops with // large numbers of branches which cause loop unswitching to go crazy. // This is a very ad-hoc heuristic. - + // FIXME: This is overly conservative because it does not take into // consideration code simplification opportunities and code that can // be shared by the resultant unswitched loops.
CodeMetrics Metrics; - for (Loop::block_iterator I = L->block_begin(), + for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; ++I) - Metrics.analyzeBasicBlock(*I); + Metrics.analyzeBasicBlock(*I); Props.SizeEstimation = std::min(Metrics.NumInsts, Metrics.NumBlocks * 5); Props.CanBeUnswitchedCount = MaxSize / (Props.SizeEstimation); MaxSize -= Props.SizeEstimation * Props.CanBeUnswitchedCount; - } - + } + if (!Props.CanBeUnswitchedCount) { DEBUG(dbgs() << "NOT unswitching loop %" << L->getHeader()->getName() << ", cost too high: " << L->getBlocks().size() << "\n"); - + return false; } - + // Be careful. These links are good only before a new loop is added. CurrentLoopProperties = &Props; CurLoopInstructions = &Props.UnswitchedVals; - + return true; } // Clean all data related to the given loop. void LUAnalysisCache::forgetLoop(const Loop* L) { - + LoopPropsMapIt LIt = LoopsProperties.find(L); if (LIt != LoopsProperties.end()) { @@ -275,9 +275,9 @@ void LUAnalysisCache::forgetLoop(const Loop* L) { LoopProperties& Props = LIt->second; MaxSize += Props.CanBeUnswitchedCount * Props.SizeEstimation; LoopsProperties.erase(LIt); } - + CurrentLoopProperties = NULL; - CurLoopInstructions = NULL; + CurLoopInstructions = NULL; } // Mark case value as unswitched. @@ -289,7 +289,7 @@ void LUAnalysisCache::setUnswitched(const SwitchInst* SI, const Value* V) { // Check whether this case value was unswitched before or not. bool LUAnalysisCache::isUnswitched(const SwitchInst* SI, const Value* V) { - return (*CurLoopInstructions)[SI].count(V); + return (*CurLoopInstructions)[SI].count(V); } // Clone all loop-unswitch related loop properties. @@ -297,20 +297,20 @@ bool LUAnalysisCache::isUnswitched(const SwitchInst* SI, const Value* V) { // Note that new loop data is stored inside the VMap. void LUAnalysisCache::cloneData(const Loop* NewLoop, const Loop* OldLoop, const ValueToValueMapTy& VMap) { - + LoopProperties& NewLoopProps = LoopsProperties[NewLoop]; LoopProperties& OldLoopProps = *CurrentLoopProperties; UnswitchedValsMap& Insts = OldLoopProps.UnswitchedVals; - + // Reallocate "can-be-unswitched quota" --OldLoopProps.CanBeUnswitchedCount; unsigned Quota = OldLoopProps.CanBeUnswitchedCount; NewLoopProps.CanBeUnswitchedCount = Quota / 2; OldLoopProps.CanBeUnswitchedCount = Quota - Quota / 2; - + NewLoopProps.SizeEstimation = OldLoopProps.SizeEstimation; - + // Clone unswitched values info: // for new loop switches we clone info about values that were // already unswitched and have redundant successors. @@ -319,7 +319,7 @@ void LUAnalysisCache::cloneData(const Loop* NewLoop, const Loop* OldLoop, Value* NewI = VMap.lookup(OldInst); const SwitchInst* NewInst = cast_or_null<SwitchInst>(NewI); assert(NewInst && "All instructions that are in SrcBB must be in VMap."); - + NewLoopProps.UnswitchedVals[NewInst] = OldLoopProps.UnswitchedVals[OldInst]; } } @@ -333,18 +333,18 @@ INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops", false, false) -Pass *llvm::createLoopUnswitchPass(bool Os) { - return new LoopUnswitch(Os); +Pass *llvm::createLoopUnswitchPass(bool Os) { + return new LoopUnswitch(Os); } /// FindLIVLoopCondition - Cond is a condition that occurs in L. If it is /// invariant in the loop, or has an invariant piece, return the invariant. /// Otherwise, return null. static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) { - + // We started analyzing a new instruction; increment the scanned instruction counter. ++TotalInsts; - + // We can never unswitch on vector conditions.
if (Cond->getType()->isVectorTy()) return 0; @@ -369,7 +369,7 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) { if (Value *RHS = FindLIVLoopCondition(BO->getOperand(1), L, Changed)) return RHS; } - + return 0; } @@ -394,19 +394,36 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) { return Changed; } -/// processCurrentLoop - Do actual work and unswitch loop if possible +/// processCurrentLoop - Do actual work and unswitch loop if possible /// and profitable. bool LoopUnswitch::processCurrentLoop() { bool Changed = false; initLoopData(); - + // If LoopSimplify was unable to form a preheader, don't do any unswitching. if (!loopPreheader) return false; - + + // Loops with indirectbr cannot be cloned. + if (!currentLoop->isSafeToClone()) + return false; + + // Loops with invokes, whose unwind edge escapes the loop, cannot be + // unswitched because splitting their edges is non-trivial and doesn't preserve + // loop simplify information. + for (Loop::block_iterator I = currentLoop->block_begin(), + E = currentLoop->block_end(); I != E; ++I) + if (const InvokeInst *II = dyn_cast<InvokeInst>((*I)->getTerminator())) + if (!currentLoop->contains(II->getUnwindDest())) + return false; + + // Without dedicated exits, splitting the exit edge may fail. + if (!currentLoop->hasDedicatedExits()) + return false; + LLVMContext &Context = loopHeader->getContext(); - + // We may have reached the quota of branches for this loop. If so, // stop unswitching. if (!BranchesInfo.countLoop(currentLoop)) @@ -415,7 +432,7 @@ bool LoopUnswitch::processCurrentLoop() { // Loop over all of the basic blocks in the loop. If we find an interior // block that is branching on a loop-invariant condition, we can unswitch this // loop. - for (Loop::block_iterator I = currentLoop->block_begin(), + for (Loop::block_iterator I = currentLoop->block_begin(), E = currentLoop->block_end(); I != E; ++I) { TerminatorInst *TI = (*I)->getTerminator(); if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { @@ -424,24 +441,24 @@ bool LoopUnswitch::processCurrentLoop() { if (BI->isConditional()) { // See if this, or some part of it, is loop invariant. If so, we can // unswitch on it if we desire. - Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), + Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), currentLoop, Changed); - if (LoopCond && UnswitchIfProfitable(LoopCond, + if (LoopCond && UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) { ++NumBranches; return true; } - } + } } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { - Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), + Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), currentLoop, Changed); - unsigned NumCases = SI->getNumCases(); + unsigned NumCases = SI->getNumCases(); if (LoopCond && NumCases) { // Find a value to unswitch on: // FIXME: this should choose the most expensive case! // FIXME: scan for a case with a non-critical edge? Constant *UnswitchVal = NULL; - + // Do not process the same value again and again. // At this point we have some cases already unswitched and // some not yet unswitched. Let's find the first not yet unswitched one. @@ -453,7 +470,7 @@ bool LoopUnswitch::processCurrentLoop() { break; } } - + if (!UnswitchVal) continue; @@ -463,14 +480,14 @@ bool LoopUnswitch::processCurrentLoop() { } } } - + // Scan the instructions to check for unswitchable values.
- for (BasicBlock::iterator BBI = (*I)->begin(), E = (*I)->end(); + for (BasicBlock::iterator BBI = (*I)->begin(), E = (*I)->end(); BBI != E; ++BBI) if (SelectInst *SI = dyn_cast<SelectInst>(BBI)) { - Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), + Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), currentLoop, Changed); - if (LoopCond && UnswitchIfProfitable(LoopCond, + if (LoopCond && UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) { ++NumSelects; return true; @@ -500,7 +517,7 @@ static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB, ExitBB = BB; return true; } - + // Otherwise, this is an unvisited intra-loop node. Check all successors. for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI) { // Check to see if the successor is a trivial loop exit. @@ -513,12 +530,12 @@ static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB, for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) if (I->mayHaveSideEffects()) return false; - + return true; } /// isTrivialLoopExitBlock - Return true if the specified block unconditionally -/// leads to an exit from the specified loop, and has no side-effects in the +/// leads to an exit from the specified loop, and has no side-effects in the /// process. If so, return the block that is exited to, otherwise return null. static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) { std::set<BasicBlock*> Visited; @@ -546,39 +563,39 @@ bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val, BasicBlock *Header = currentLoop->getHeader(); TerminatorInst *HeaderTerm = Header->getTerminator(); LLVMContext &Context = Header->getContext(); - + BasicBlock *LoopExitBB = 0; if (BranchInst *BI = dyn_cast<BranchInst>(HeaderTerm)) { // If the header block doesn't end with a conditional branch on Cond, we // can't handle it. if (!BI->isConditional() || BI->getCondition() != Cond) return false; - - // Check to see if a successor of the branch is guaranteed to - // exit through a unique exit block without having any + + // Check to see if a successor of the branch is guaranteed to + // exit through a unique exit block without having any // side-effects. If so, determine the value of Cond that causes it to do // this. - if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, + if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, BI->getSuccessor(0)))) { if (Val) *Val = ConstantInt::getTrue(Context); - } else if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, + } else if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, BI->getSuccessor(1)))) { if (Val) *Val = ConstantInt::getFalse(Context); } } else if (SwitchInst *SI = dyn_cast<SwitchInst>(HeaderTerm)) { // If this isn't a switch on Cond, we can't handle it. if (SI->getCondition() != Cond) return false; - + // Check to see if a successor of the switch is guaranteed to go to the - // latch block or exit through a one exit block without having any + // latch block or exit through a one exit block without having any // side-effects. If so, determine the value of Cond that causes it to do - // this. + // this. // Note that we can't trivially unswitch on the default case or // on already unswitched cases. 
for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) { BasicBlock* LoopExitCandidate; - if ((LoopExitCandidate = isTrivialLoopExitBlock(currentLoop, + if ((LoopExitCandidate = isTrivialLoopExitBlock(currentLoop, i.getCaseSuccessor()))) { // Okay, we found a trivial case, remember the value that is trivial. ConstantInt* CaseVal = i.getCaseValue(); @@ -598,9 +615,9 @@ bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val, // contains phi nodes, this isn't trivial. if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin())) return false; // Can't handle this. - + if (LoopExit) *LoopExit = LoopExitBB; - + // We already know that nothing uses any scalar values defined inside of this // loop. As such, we just have to check to see if this loop will execute any // side-effecting instructions (e.g. stores, calls, volatile loads) in the @@ -686,17 +703,17 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, /// UnswitchTrivialCondition - Given a loop that has a trivial unswitchable /// condition in it (a cond branch from its header block to its latch block, -/// where the path through the loop that doesn't execute its body has no +/// where the path through the loop that doesn't execute its body has no /// side-effects), unswitch it. This doesn't involve any code duplication, just /// moving the conditional branch outside of the loop and updating loop info. -void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, - Constant *Val, +void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, + Constant *Val, BasicBlock *ExitBlock) { DEBUG(dbgs() << "loop-unswitch: Trivial-Unswitch loop %" << loopHeader->getName() << " [" << L->getBlocks().size() << " blocks] in Function " << L->getHeader()->getParent()->getName() << " on cond: " << *Val << " == " << *Cond << "\n"); - + // First step, split the preheader, so that we know that there is a safe place // to insert the conditional branch. We will change loopPreheader to have a // conditional branch on Cond. @@ -705,24 +722,24 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, // Now that we have a place to insert the conditional branch, create a place // to branch to: this is the exit block out of the loop that we should // short-circuit to. - + // Split this block now, so that the loop maintains its exit block, and so // that the jump from the preheader can execute the contents of the exit block // without actually branching to it (the exit block should be dominated by the // loop header, not the preheader). assert(!L->contains(ExitBlock) && "Exit block is in the loop?"); BasicBlock *NewExit = SplitBlock(ExitBlock, ExitBlock->begin(), this); - - // Okay, now we have a position to branch from and a position to branch to, + + // Okay, now we have a position to branch from and a position to branch to, // insert the new conditional branch. - EmitPreheaderBranchOnCondition(Cond, Val, NewExit, NewPH, + EmitPreheaderBranchOnCondition(Cond, Val, NewExit, NewPH, loopPreheader->getTerminator()); LPM->deleteSimpleAnalysisValue(loopPreheader->getTerminator(), L); loopPreheader->getTerminator()->eraseFromParent(); // We need to reprocess this loop, it could be unswitched again. redoLoop = true; - + // Now that we know that the loop is never entered when this condition is a // particular value, rewrite the loop with this info. We know that this will // at least eliminate the old branch. 
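For intuition about the trivial case handled above: trivial unswitching only moves a loop-invariant exit test from the loop header into the preheader, with no code duplication. A source-level sketch of the effect (illustrative C++ only, not code from this patch):

    // Before: the header re-tests the invariant condition every iteration.
    void before(bool Cond, int *A, int N) {
      for (int i = 0; i < N; ++i) {
        if (Cond)
          break;      // loop-invariant: exits on the first iteration or never
        A[i] = 0;
      }
    }

    // After trivial unswitching: the branch runs once in the preheader and
    // the loop body keeps only the non-exiting path.
    void after(bool Cond, int *A, int N) {
      if (!Cond)
        for (int i = 0; i < N; ++i)
          A[i] = 0;
    }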
@@ -732,7 +749,7 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, /// SplitExitEdges - Split all of the edges from inside the loop to their exit /// blocks. Update the appropriate Phi nodes as we do so. -void LoopUnswitch::SplitExitEdges(Loop *L, +void LoopUnswitch::SplitExitEdges(Loop *L, const SmallVector<BasicBlock *, 8> &ExitBlocks){ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { @@ -752,10 +769,10 @@ void LoopUnswitch::SplitExitEdges(Loop *L, } } -/// UnswitchNontrivialCondition - We determined that the loop is profitable -/// to unswitch when LIC equal Val. Split it into loop versions and test the +/// UnswitchNontrivialCondition - We determined that the loop is profitable +/// to unswitch when LIC equal Val. Split it into loop versions and test the /// condition outside of either loop. Return the loops created as Out1/Out2. -void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, +void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, Loop *L) { Function *F = loopHeader->getParent(); DEBUG(dbgs() << "loop-unswitch: Unswitching loop %" @@ -798,7 +815,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, ValueToValueMapTy VMap; for (unsigned i = 0, e = LoopBlocks.size(); i != e; ++i) { BasicBlock *NewBB = CloneBasicBlock(LoopBlocks[i], VMap, ".us", F); - + NewBlocks.push_back(NewBB); VMap[LoopBlocks[i]] = NewBB; // Keep the BB mapping. LPM->cloneBasicBlockSimpleAnalysis(LoopBlocks[i], NewBB, L); @@ -828,7 +845,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, // The new exit block should be in the same loop as the old one. if (Loop *ExitBBLoop = LI->getLoopFor(ExitBlocks[i])) ExitBBLoop->addBasicBlockToLoop(NewExit, LI->getBase()); - + assert(NewExit->getTerminator()->getNumSuccessors() == 1 && "Exit block should have been split to have one successor!"); BasicBlock *ExitSucc = NewExit->getTerminator()->getSuccessor(0); @@ -863,7 +880,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, for (BasicBlock::iterator I = NewBlocks[i]->begin(), E = NewBlocks[i]->end(); I != E; ++I) RemapInstruction(I, VMap,RF_NoModuleLevelChanges|RF_IgnoreMissingEntries); - + // Rewrite the original preheader to select between versions of the loop. BranchInst *OldBR = cast<BranchInst>(loopPreheader->getTerminator()); assert(OldBR->isUnconditional() && OldBR->getSuccessor(0) == LoopBlocks[0] && @@ -882,7 +899,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, // the condition that we're unswitching on), we don't rewrite the second // iteration. WeakVH LICHandle(LIC); - + // Now we rewrite the original code to know that the condition is true and the // new code to know that the condition is false. RewriteLoopBodyWithConditionConstant(L, LIC, Val, false); @@ -897,7 +914,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, /// RemoveFromWorklist - Remove all instances of I from the worklist vector /// specified. -static void RemoveFromWorklist(Instruction *I, +static void RemoveFromWorklist(Instruction *I, std::vector<Instruction*> &Worklist) { std::vector<Instruction*>::iterator WI = std::find(Worklist.begin(), Worklist.end(), I); @@ -910,7 +927,7 @@ static void RemoveFromWorklist(Instruction *I, /// ReplaceUsesOfWith - When we find that I really equals V, remove I from the /// program, replacing all uses with V and update the worklist. 
-static void ReplaceUsesOfWith(Instruction *I, Value *V, +static void ReplaceUsesOfWith(Instruction *I, Value *V, std::vector<Instruction*> &Worklist, Loop *L, LPPassManager *LPM) { DEBUG(dbgs() << "Replace with '" << *V << "': " << *I); @@ -943,10 +960,10 @@ void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB, if (BasicBlock *Pred = BB->getSinglePredecessor()) { // If it has one pred, fold phi nodes in BB. while (isa<PHINode>(BB->begin())) - ReplaceUsesOfWith(BB->begin(), - cast<PHINode>(BB->begin())->getIncomingValue(0), + ReplaceUsesOfWith(BB->begin(), + cast<PHINode>(BB->begin())->getIncomingValue(0), Worklist, L, LPM); - + // If this is the header of a loop and the only pred is the latch, we now // have an unreachable loop. if (Loop *L = LI->getLoopFor(BB)) @@ -957,15 +974,15 @@ void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB, LPM->deleteSimpleAnalysisValue(Pred->getTerminator(), L); Pred->getTerminator()->eraseFromParent(); new UnreachableInst(BB->getContext(), Pred); - + // The loop is now broken, remove it from LI. RemoveLoopFromHierarchy(L); - + // Reprocess the header, which now IS dead. RemoveBlockIfDead(BB, Worklist, L); return; } - + // If pred ends in a uncond branch, add uncond branch to worklist so that // the two blocks will get merged. if (BranchInst *BI = dyn_cast<BranchInst>(Pred->getTerminator())) @@ -976,11 +993,11 @@ void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB, } DEBUG(dbgs() << "Nuking dead block: " << *BB); - + // Remove the instructions in the basic block from the worklist. for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { RemoveFromWorklist(I, Worklist); - + // Anything that uses the instructions in this basic block should have their // uses replaced with undefs. // If I is not void type then replaceAllUsesWith undef. @@ -988,7 +1005,7 @@ void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB, if (!I->getType()->isVoidTy()) I->replaceAllUsesWith(UndefValue::get(I->getType())); } - + // If this is the edge to the header block for a loop, remove the loop and // promote all subloops. if (Loop *BBLoop = LI->getLoopFor(BB)) { @@ -1004,8 +1021,8 @@ void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB, // Remove the block from the loop info, which removes it from any loops it // was in. LI->removeBlock(BB); - - + + // Remove phi node entries in successors for this block. TerminatorInst *TI = BB->getTerminator(); SmallVector<BasicBlock*, 4> Succs; @@ -1013,13 +1030,13 @@ void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB, Succs.push_back(TI->getSuccessor(i)); TI->getSuccessor(i)->removePredecessor(BB); } - + // Unique the successors, remove anything with multiple uses. array_pod_sort(Succs.begin(), Succs.end()); Succs.erase(std::unique(Succs.begin(), Succs.end()), Succs.end()); - + // Remove the basic block, including all of the instructions contained in it. - LPM->deleteSimpleAnalysisValue(BB, L); + LPM->deleteSimpleAnalysisValue(BB, L); BB->eraseFromParent(); // Remove successor blocks here that are not dead, so that we know we only // have dead blocks in this list. 
Nondead blocks have a way of becoming dead, @@ -1037,7 +1054,7 @@ void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB, --i; } } - + for (unsigned i = 0, e = Succs.size(); i != e; ++i) RemoveBlockIfDead(Succs[i], Worklist, L); } @@ -1060,14 +1077,14 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, Constant *Val, bool IsEqual) { assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?"); - + // FIXME: Support correlated properties, like: // for (...) // if (li1 < li2) // ... // if (li1 > li2) // ... - + // FOLD boolean conditions (X|LIC), (X&LIC). Fold conditional branches, // selects, switches. std::vector<Instruction*> Worklist; @@ -1082,9 +1099,9 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, if (IsEqual) Replacement = Val; else - Replacement = ConstantInt::get(Type::getInt1Ty(Val->getContext()), + Replacement = ConstantInt::get(Type::getInt1Ty(Val->getContext()), !cast<ConstantInt>(Val)->getZExtValue()); - + for (Value::use_iterator UI = LIC->use_begin(), E = LIC->use_end(); UI != E; ++UI) { Instruction *U = dyn_cast<Instruction>(*UI); @@ -1092,15 +1109,15 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, continue; Worklist.push_back(U); } - + for (std::vector<Instruction*>::iterator UI = Worklist.begin(); UI != Worklist.end(); ++UI) - (*UI)->replaceUsesOfWith(LIC, Replacement); - + (*UI)->replaceUsesOfWith(LIC, Replacement); + SimplifyCode(Worklist, L); return; } - + // Otherwise, we don't know the precise value of LIC, but we do know that it // is certainly NOT "Val". As such, simplify any uses in the loop that we // can. This case occurs when we unswitch switch statements. @@ -1112,27 +1129,27 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, Worklist.push_back(U); - // TODO: We could do other simplifications, for example, turning + // TODO: We could do other simplifications, for example, turning // 'icmp eq LIC, Val' -> false. // If we know that LIC is not Val, use this info to simplify code. SwitchInst *SI = dyn_cast<SwitchInst>(U); if (SI == 0 || !isa<ConstantInt>(Val)) continue; - + SwitchInst::CaseIt DeadCase = SI->findCaseValue(cast<ConstantInt>(Val)); // Default case is live for multiple values. if (DeadCase == SI->case_default()) continue; - - // Found a dead case value. Don't remove PHI nodes in the + + // Found a dead case value. Don't remove PHI nodes in the // successor if they become single-entry, those PHI nodes may // be in the Users list. BasicBlock *Switch = SI->getParent(); BasicBlock *SISucc = DeadCase.getCaseSuccessor(); BasicBlock *Latch = L->getLoopLatch(); - + BranchesInfo.setUnswitched(SI, Val); - + if (!SI->findCaseDest(SISucc)) continue; // Edge is critical. // If the DeadCase successor dominates the loop latch, then the // transformation isn't safe since it will delete the sole predecessor edge @@ -1172,7 +1189,7 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, if (DT) DT->addNewBlock(Abort, NewSISucc); } - + SimplifyCode(Worklist, L); } @@ -1193,7 +1210,7 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { // Simple DCE. if (isInstructionTriviallyDead(I)) { DEBUG(dbgs() << "Remove dead instruction '" << *I); - + // Add uses to the worklist, which may be dead now. 
for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i))) @@ -1225,24 +1242,24 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { if (!SinglePred) continue; // Nothing to do. assert(SinglePred == Pred && "CFG broken"); - DEBUG(dbgs() << "Merging blocks: " << Pred->getName() << " <- " + DEBUG(dbgs() << "Merging blocks: " << Pred->getName() << " <- " << Succ->getName() << "\n"); - + // Resolve any single entry PHI nodes in Succ. while (PHINode *PN = dyn_cast<PHINode>(Succ->begin())) ReplaceUsesOfWith(PN, PN->getIncomingValue(0), Worklist, L, LPM); - + // If Succ has any successors with PHI nodes, update them to have // entries coming from Pred instead of Succ. Succ->replaceAllUsesWith(Pred); - + // Move all of the successor contents from Succ to Pred. Pred->getInstList().splice(BI, Succ->getInstList(), Succ->begin(), Succ->end()); LPM->deleteSimpleAnalysisValue(BI, L); BI->eraseFromParent(); RemoveFromWorklist(BI, Worklist); - + // Remove Succ from the loop tree. LI->removeBlock(Succ); LPM->deleteSimpleAnalysisValue(Succ, L); @@ -1250,7 +1267,7 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { ++NumSimplify; continue; } - + if (ConstantInt *CB = dyn_cast<ConstantInt>(BI->getCondition())){ // Conditional branch. Turn it into an unconditional branch, then // remove dead blocks. diff --git a/lib/Transforms/Scalar/ObjCARC.cpp b/lib/Transforms/Scalar/ObjCARC.cpp index 9fdea8d..29234da 100644 --- a/lib/Transforms/Scalar/ObjCARC.cpp +++ b/lib/Transforms/Scalar/ObjCARC.cpp @@ -162,6 +162,7 @@ namespace { IC_MoveWeak, ///< objc_moveWeak (derived) IC_CopyWeak, ///< objc_copyWeak (derived) IC_DestroyWeak, ///< objc_destroyWeak (derived) + IC_StoreStrong, ///< objc_storeStrong (derived) IC_CallOrUser, ///< could call objc_release and/or "use" pointers IC_Call, ///< could call objc_release IC_User, ///< could "use" a pointer @@ -262,6 +263,7 @@ static InstructionClass GetFunctionClass(const Function *F) { return StringSwitch<InstructionClass>(F->getName()) .Case("objc_storeWeak", IC_StoreWeak) .Case("objc_initWeak", IC_InitWeak) + .Case("objc_storeStrong", IC_StoreStrong) .Default(IC_CallOrUser); // Second argument is i8**. if (PointerType *Pte1 = dyn_cast<PointerType>(ETy1)) @@ -618,22 +620,35 @@ static bool DoesObjCBlockEscape(const Value *BlockPtr) { const User *UUser = *UI; // Special - Use by a call (callee or argument) is not considered // to be an escape. - if (isa<CallInst>(UUser) || isa<InvokeInst>(UUser)) - continue; - // Use by an instruction which copies the value is an escape if the - // result is an escape. - if (isa<BitCastInst>(UUser) || isa<GetElementPtrInst>(UUser) || - isa<PHINode>(UUser) || isa<SelectInst>(UUser)) { - Worklist.push_back(UUser); + switch (GetBasicInstructionClass(UUser)) { + case IC_StoreWeak: + case IC_InitWeak: + case IC_StoreStrong: + case IC_Autorelease: + case IC_AutoreleaseRV: + // These special functions make copies of their pointer arguments. + return true; + case IC_User: + case IC_None: + // Use by an instruction which copies the value is an escape if the + // result is an escape. + if (isa<BitCastInst>(UUser) || isa<GetElementPtrInst>(UUser) || + isa<PHINode>(UUser) || isa<SelectInst>(UUser)) { + Worklist.push_back(UUser); + continue; + } + // Use by a load is not an escape. + if (isa<LoadInst>(UUser)) + continue; + // Use by a store is not an escape if the use is the address. 
+ if (const StoreInst *SI = dyn_cast<StoreInst>(UUser)) + if (V != SI->getValueOperand()) + continue; + break; + default: + // Regular calls and other stuff are not considered escapes. continue; } - // Use by a load is not an escape. - if (isa<LoadInst>(UUser)) - continue; - // Use by a store is not an escape if the use is the address. - if (const StoreInst *SI = dyn_cast<StoreInst>(UUser)) - if (V != SI->getValueOperand()) - continue; // Otherwise, conservatively assume an escape. return true; } @@ -883,7 +898,7 @@ bool ObjCARCExpand::runOnFunction(Function &F) { // These calls return their argument verbatim, as a low-level // optimization. However, this makes high-level optimizations // harder. Undo any uses of this optimization that the front-end - // emitted here. We'll redo them in a later pass. + // emitted here. We'll redo them in the contract pass. Changed = true; Inst->replaceAllUsesWith(cast<CallInst>(Inst)->getArgOperand(0)); break; @@ -997,7 +1012,11 @@ bool ObjCARCAPElim::runOnModule(Module &M) { return false; // Find the llvm.global_ctors variable, as the first step in - // identifying the global constructors. + // identifying the global constructors. In theory, unnecessary autorelease + // pools could occur anywhere, but in practice it's pretty rare. Global + // ctors are a place where autorelease pools get inserted automatically, + // so it's pretty common for them to be unnecessary, and it's pretty + // profitable to eliminate them. GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors"); if (!GV) return false; @@ -1014,7 +1033,11 @@ bool ObjCARCAPElim::runOnModule(Module &M) { Value *Op = *OI; // llvm.global_ctors is an array of pairs where the second members // are constructor functions. - Function *F = cast<Function>(cast<ConstantStruct>(Op)->getOperand(1)); + Function *F = dyn_cast<Function>(cast<ConstantStruct>(Op)->getOperand(1)); + // If the user used a constructor function with the wrong signature and + // it got bitcasted or whatever, look the other way. + if (!F) + continue; // Only look at function definitions. if (F->isDeclaration()) continue; @@ -1678,9 +1701,16 @@ namespace { void CheckForCFGHazards(const BasicBlock *BB, DenseMap<const BasicBlock *, BBState> &BBStates, BBState &MyStates) const; + bool VisitInstructionBottomUp(Instruction *Inst, + BasicBlock *BB, + MapVector<Value *, RRInfo> &Retains, + BBState &MyStates); bool VisitBottomUp(BasicBlock *BB, DenseMap<const BasicBlock *, BBState> &BBStates, MapVector<Value *, RRInfo> &Retains); + bool VisitInstructionTopDown(Instruction *Inst, + DenseMap<Value *, RRInfo> &Releases, + BBState &MyStates); bool VisitTopDown(BasicBlock *BB, DenseMap<const BasicBlock *, BBState> &BBStates, DenseMap<Value *, RRInfo> &Releases); @@ -1956,6 +1986,7 @@ namespace { /// use here. enum DependenceKind { NeedsPositiveRetainCount, + AutoreleasePoolBoundary, CanChangeRetainCount, RetainAutoreleaseDep, ///< Blocks objc_retainAutorelease. RetainAutoreleaseRVDep, ///< Blocks objc_retainAutoreleaseReturnValue. @@ -1985,6 +2016,19 @@ Depends(DependenceKind Flavor, Instruction *Inst, const Value *Arg, } } + case AutoreleasePoolBoundary: { + InstructionClass Class = GetInstructionClass(Inst); + switch (Class) { + case IC_AutoreleasepoolPop: + case IC_AutoreleasepoolPush: + // These mark the end and begin of an autorelease pool scope. + return true; + default: + // Nothing else does this. 
+ return false; + } + } + case CanChangeRetainCount: { InstructionClass Class = GetInstructionClass(Inst); switch (Class) { @@ -2002,6 +2046,7 @@ Depends(DependenceKind Flavor, Instruction *Inst, const Value *Arg, case RetainAutoreleaseDep: switch (GetBasicInstructionClass(Inst)) { case IC_AutoreleasepoolPop: + case IC_AutoreleasepoolPush: // Don't merge an objc_autorelease with an objc_retain inside a different // autoreleasepool scope. return true; @@ -2136,17 +2181,26 @@ ObjCARCOpt::OptimizeRetainCall(Function &F, Instruction *Retain) { /// return true. bool ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { - // Check for the argument being from an immediately preceding call. + // Check for the argument being from an immediately preceding call or invoke. Value *Arg = GetObjCArg(RetainRV); CallSite CS(Arg); - if (Instruction *Call = CS.getInstruction()) + if (Instruction *Call = CS.getInstruction()) { if (Call->getParent() == RetainRV->getParent()) { BasicBlock::iterator I = Call; ++I; while (isNoopInstruction(I)) ++I; if (&*I == RetainRV) return false; + } else if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { + BasicBlock *RetainRVParent = RetainRV->getParent(); + if (II->getNormalDest() == RetainRVParent) { + BasicBlock::iterator I = RetainRVParent->begin(); + while (isNoopInstruction(I)) ++I; + if (&*I == RetainRV) + return false; + } } + } // Check for being preceded by an objc_autoreleaseReturnValue on the same // pointer. In this case, we can delete the pair. @@ -2232,6 +2286,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { case IC_DestroyWeak: { CallInst *CI = cast<CallInst>(Inst); if (isNullOrUndef(CI->getArgOperand(0))) { + Changed = true; Type *Ty = CI->getArgOperand(0)->getType(); new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), Constant::getNullValue(Ty), @@ -2247,6 +2302,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { CallInst *CI = cast<CallInst>(Inst); if (isNullOrUndef(CI->getArgOperand(0)) || isNullOrUndef(CI->getArgOperand(1))) { + Changed = true; Type *Ty = CI->getArgOperand(0)->getType(); new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), Constant::getNullValue(Ty), @@ -2360,9 +2416,34 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { // Check that there is nothing that cares about the reference // count between the call and the phi. - FindDependencies(NeedsPositiveRetainCount, Arg, - Inst->getParent(), Inst, - DependingInstructions, Visited, PA); + switch (Class) { + case IC_Retain: + case IC_RetainBlock: + // These can always be moved up. + break; + case IC_Release: + // These can't be moved across things that care about the retain count. + FindDependencies(NeedsPositiveRetainCount, Arg, + Inst->getParent(), Inst, + DependingInstructions, Visited, PA); + break; + case IC_Autorelease: + // These can't be moved across autorelease pool scope boundaries. + FindDependencies(AutoreleasePoolBoundary, Arg, + Inst->getParent(), Inst, + DependingInstructions, Visited, PA); + break; + case IC_RetainRV: + case IC_AutoreleaseRV: + // Don't move these; the RV optimization depends on the autoreleaseRV + // being tail called, and the retainRV being immediately after a call + // (which might still happen if we get lucky with codegen layout, but + // it's not worth taking the chance). 
+ continue; + default: + llvm_unreachable("Invalid dependence flavor"); + } + if (DependingInstructions.size() == 1 && *DependingInstructions.begin() == PN) { Changed = true; @@ -2516,6 +2597,164 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, } bool +ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, + BasicBlock *BB, + MapVector<Value *, RRInfo> &Retains, + BBState &MyStates) { + bool NestingDetected = false; + InstructionClass Class = GetInstructionClass(Inst); + const Value *Arg = 0; + + switch (Class) { + case IC_Release: { + Arg = GetObjCArg(Inst); + + PtrState &S = MyStates.getPtrBottomUpState(Arg); + + // If we see two releases in a row on the same pointer, make + // a note, and we'll cycle back to revisit it after we've + // hopefully eliminated the second release, which may allow us to + // eliminate the first release too. + // Theoretically we could implement removal of nested retain+release + // pairs by making PtrState hold a stack of states, but this is + // simple and avoids adding overhead for the non-nested case. + if (S.GetSeq() == S_Release || S.GetSeq() == S_MovableRelease) + NestingDetected = true; + + S.RRI.clear(); + + MDNode *ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind); + S.SetSeq(ReleaseMetadata ? S_MovableRelease : S_Release); + S.RRI.ReleaseMetadata = ReleaseMetadata; + S.RRI.KnownSafe = S.IsKnownNested() || S.IsKnownIncremented(); + S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall(); + S.RRI.Calls.insert(Inst); + + S.IncrementRefCount(); + S.IncrementNestCount(); + break; + } + case IC_RetainBlock: + // An objc_retainBlock call with just a use may need to be kept, + // because it may be copying a block from the stack to the heap. + if (!IsRetainBlockOptimizable(Inst)) + break; + // FALLTHROUGH + case IC_Retain: + case IC_RetainRV: { + Arg = GetObjCArg(Inst); + + PtrState &S = MyStates.getPtrBottomUpState(Arg); + S.DecrementRefCount(); + S.SetAtLeastOneRefCount(); + S.DecrementNestCount(); + + switch (S.GetSeq()) { + case S_Stop: + case S_Release: + case S_MovableRelease: + case S_Use: + S.RRI.ReverseInsertPts.clear(); + // FALL THROUGH + case S_CanRelease: + // Don't do retain+release tracking for IC_RetainRV, because it's + // better to let it remain as the first instruction after a call. + if (Class != IC_RetainRV) { + S.RRI.IsRetainBlock = Class == IC_RetainBlock; + Retains[Inst] = S.RRI; + } + S.ClearSequenceProgress(); + break; + case S_None: + break; + case S_Retain: + llvm_unreachable("bottom-up pointer in retain state!"); + } + return NestingDetected; + } + case IC_AutoreleasepoolPop: + // Conservatively, clear MyStates for all known pointers. + MyStates.clearBottomUpPointers(); + return NestingDetected; + case IC_AutoreleasepoolPush: + case IC_None: + // These are irrelevant. + return NestingDetected; + default: + break; + } + + // Consider any other possible effects of this instruction on each + // pointer being tracked. + for (BBState::ptr_iterator MI = MyStates.bottom_up_ptr_begin(), + ME = MyStates.bottom_up_ptr_end(); MI != ME; ++MI) { + const Value *Ptr = MI->first; + if (Ptr == Arg) + continue; // Handled above. + PtrState &S = MI->second; + Sequence Seq = S.GetSeq(); + + // Check for possible releases.
+ if (CanAlterRefCount(Inst, Ptr, PA, Class)) { + S.DecrementRefCount(); + switch (Seq) { + case S_Use: + S.SetSeq(S_CanRelease); + continue; + case S_CanRelease: + case S_Release: + case S_MovableRelease: + case S_Stop: + case S_None: + break; + case S_Retain: + llvm_unreachable("bottom-up pointer in retain state!"); + } + } + + // Check for possible direct uses. + switch (Seq) { + case S_Release: + case S_MovableRelease: + if (CanUse(Inst, Ptr, PA, Class)) { + assert(S.RRI.ReverseInsertPts.empty()); + // If this is an invoke instruction, we're scanning it as part of + // one of its successor blocks, since we can't insert code after it + // in its own block, and we don't want to split critical edges. + if (isa<InvokeInst>(Inst)) + S.RRI.ReverseInsertPts.insert(BB->getFirstInsertionPt()); + else + S.RRI.ReverseInsertPts.insert(llvm::next(BasicBlock::iterator(Inst))); + S.SetSeq(S_Use); + } else if (Seq == S_Release && + (Class == IC_User || Class == IC_CallOrUser)) { + // Non-movable releases depend on any possible objc pointer use. + S.SetSeq(S_Stop); + assert(S.RRI.ReverseInsertPts.empty()); + // As above; handle invoke specially. + if (isa<InvokeInst>(Inst)) + S.RRI.ReverseInsertPts.insert(BB->getFirstInsertionPt()); + else + S.RRI.ReverseInsertPts.insert(llvm::next(BasicBlock::iterator(Inst))); + } + break; + case S_Stop: + if (CanUse(Inst, Ptr, PA, Class)) + S.SetSeq(S_Use); + break; + case S_CanRelease: + case S_Use: + case S_None: + break; + case S_Retain: + llvm_unreachable("bottom-up pointer in retain state!"); + } + } + + return NestingDetected; +} + +bool ObjCARCOpt::VisitBottomUp(BasicBlock *BB, DenseMap<const BasicBlock *, BBState> &BBStates, MapVector<Value *, RRInfo> &Retains) { @@ -2560,144 +2799,164 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, // Visit all the instructions, bottom-up. for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; --I) { Instruction *Inst = llvm::prior(I); - InstructionClass Class = GetInstructionClass(Inst); - const Value *Arg = 0; - switch (Class) { - case IC_Release: { - Arg = GetObjCArg(Inst); + // Invoke instructions are visited as part of their successors (below). + if (isa<InvokeInst>(Inst)) + continue; + + NestingDetected |= VisitInstructionBottomUp(Inst, BB, Retains, MyStates); + } + + // If there's a predecessor with an invoke, visit the invoke as + // if it were part of this block, since we can't insert code after + // an invoke in its own block, and we don't want to split critical + // edges. + for (pred_iterator PI(BB), PE(BB, false); PI != PE; ++PI) { + BasicBlock *Pred = *PI; + TerminatorInst *PredTI = cast<TerminatorInst>(&Pred->back()); + if (isa<InvokeInst>(PredTI)) + NestingDetected |= VisitInstructionBottomUp(PredTI, BB, Retains, MyStates); + } + + return NestingDetected; +} + +bool +ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst, + DenseMap<Value *, RRInfo> &Releases, + BBState &MyStates) { + bool NestingDetected = false; + InstructionClass Class = GetInstructionClass(Inst); + const Value *Arg = 0; + + switch (Class) { + case IC_RetainBlock: + // An objc_retainBlock call with just a use may need to be kept, + // because it may be copying a block from the stack to the heap. + if (!IsRetainBlockOptimizable(Inst)) + break; + // FALLTHROUGH + case IC_Retain: + case IC_RetainRV: { + Arg = GetObjCArg(Inst); - PtrState &S = MyStates.getPtrBottomUpState(Arg); + PtrState &S = MyStates.getPtrTopDownState(Arg); - // If we see two releases in a row on the same pointer. 
If so, make + // Don't do retain+release tracking for IC_RetainRV, because it's + // better to let it remain as the first instruction after a call. + if (Class != IC_RetainRV) { + // If we see two retains in a row on the same pointer, make // a note, and we'll cycle back to revisit it after we've - // hopefully eliminated the second release, which may allow us to - // eliminate the first release too. + // hopefully eliminated the second retain, which may allow us to + // eliminate the first retain too. // Theoretically we could implement removal of nested retain+release // pairs by making PtrState hold a stack of states, but this is // simple and avoids adding overhead for the non-nested case. - if (S.GetSeq() == S_Release || S.GetSeq() == S_MovableRelease) + if (S.GetSeq() == S_Retain) NestingDetected = true; + S.SetSeq(S_Retain); S.RRI.clear(); - - MDNode *ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind); - S.SetSeq(ReleaseMetadata ? S_MovableRelease : S_Release); - S.RRI.ReleaseMetadata = ReleaseMetadata; - S.RRI.KnownSafe = S.IsKnownNested() || S.IsKnownIncremented(); - S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall(); + S.RRI.IsRetainBlock = Class == IC_RetainBlock; + // Don't check S.IsKnownIncremented() here because it's not + // sufficient. + S.RRI.KnownSafe = S.IsKnownNested(); S.RRI.Calls.insert(Inst); + } - S.IncrementRefCount(); - S.IncrementNestCount(); + S.SetAtLeastOneRefCount(); + S.IncrementRefCount(); + S.IncrementNestCount(); + return NestingDetected; + } + case IC_Release: { + Arg = GetObjCArg(Inst); + + PtrState &S = MyStates.getPtrTopDownState(Arg); + S.DecrementRefCount(); + S.DecrementNestCount(); + + switch (S.GetSeq()) { + case S_Retain: + case S_CanRelease: + S.RRI.ReverseInsertPts.clear(); + // FALL THROUGH + case S_Use: + S.RRI.ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind); + S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall(); + Releases[Inst] = S.RRI; + S.ClearSequenceProgress(); + break; + case S_None: break; + case S_Stop: + case S_Release: + case S_MovableRelease: + llvm_unreachable("top-down pointer in release state!"); } + break; + } + case IC_AutoreleasepoolPop: + // Conservatively, clear MyStates for all known pointers. + MyStates.clearTopDownPointers(); + return NestingDetected; + case IC_AutoreleasepoolPush: + case IC_None: + // These are irrelevant. + return NestingDetected; + default: + break; + } + + // Consider any other possible effects of this instruction on each + // pointer being tracked. + for (BBState::ptr_iterator MI = MyStates.top_down_ptr_begin(), + ME = MyStates.top_down_ptr_end(); MI != ME; ++MI) { + const Value *Ptr = MI->first; + if (Ptr == Arg) + continue; // Handled above. + PtrState &S = MI->second; + Sequence Seq = S.GetSeq(); + + // Check for possible releases.
+ if (CanAlterRefCount(Inst, Ptr, PA, Class)) { S.DecrementRefCount(); - S.SetAtLeastOneRefCount(); - S.DecrementNestCount(); + switch (Seq) { + case S_Retain: + S.SetSeq(S_CanRelease); + assert(S.RRI.ReverseInsertPts.empty()); + S.RRI.ReverseInsertPts.insert(Inst); - switch (S.GetSeq()) { - case S_Stop: - case S_Release: - case S_MovableRelease: + // One call can't cause a transition from S_Retain to S_CanRelease + // and S_CanRelease to S_Use. If we've made the first transition, + // we're done. + continue; case S_Use: - S.RRI.ReverseInsertPts.clear(); - // FALL THROUGH case S_CanRelease: - // Don't do retain+release tracking for IC_RetainRV, because it's - // better to let it remain as the first instruction after a call. - if (Class != IC_RetainRV) { - S.RRI.IsRetainBlock = Class == IC_RetainBlock; - Retains[Inst] = S.RRI; - } - S.ClearSequenceProgress(); - break; case S_None: break; - case S_Retain: - llvm_unreachable("bottom-up pointer in retain state!"); - } - continue; - } - case IC_AutoreleasepoolPop: - // Conservatively, clear MyStates for all known pointers. - MyStates.clearBottomUpPointers(); - continue; - case IC_AutoreleasepoolPush: - case IC_None: - // These are irrelevant. - continue; - default: - break; - } - - // Consider any other possible effects of this instruction on each - // pointer being tracked. - for (BBState::ptr_iterator MI = MyStates.bottom_up_ptr_begin(), - ME = MyStates.bottom_up_ptr_end(); MI != ME; ++MI) { - const Value *Ptr = MI->first; - if (Ptr == Arg) - continue; // Handled above. - PtrState &S = MI->second; - Sequence Seq = S.GetSeq(); - - // Check for possible releases. - if (CanAlterRefCount(Inst, Ptr, PA, Class)) { - S.DecrementRefCount(); - switch (Seq) { - case S_Use: - S.SetSeq(S_CanRelease); - continue; - case S_CanRelease: - case S_Release: - case S_MovableRelease: - case S_Stop: - case S_None: - break; - case S_Retain: - llvm_unreachable("bottom-up pointer in retain state!"); - } - } - - // Check for possible direct uses. - switch (Seq) { + case S_Stop: case S_Release: case S_MovableRelease: - if (CanUse(Inst, Ptr, PA, Class)) { - assert(S.RRI.ReverseInsertPts.empty()); - S.RRI.ReverseInsertPts.insert(Inst); - S.SetSeq(S_Use); - } else if (Seq == S_Release && - (Class == IC_User || Class == IC_CallOrUser)) { - // Non-movable releases depend on any possible objc pointer use. - S.SetSeq(S_Stop); - assert(S.RRI.ReverseInsertPts.empty()); - S.RRI.ReverseInsertPts.insert(Inst); - } - break; - case S_Stop: - if (CanUse(Inst, Ptr, PA, Class)) - S.SetSeq(S_Use); - break; - case S_CanRelease: - case S_Use: - case S_None: - break; - case S_Retain: - llvm_unreachable("bottom-up pointer in retain state!"); + llvm_unreachable("top-down pointer in release state!"); } } + + // Check for possible direct uses. + switch (Seq) { + case S_CanRelease: + if (CanUse(Inst, Ptr, PA, Class)) + S.SetSeq(S_Use); + break; + case S_Retain: + case S_Use: + case S_None: + break; + case S_Stop: + case S_Release: + case S_MovableRelease: + llvm_unreachable("top-down pointer in release state!"); + } } return NestingDetected; @@ -2751,138 +3010,7 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, // Visit all the instructions, top-down. for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { Instruction *Inst = I; - InstructionClass Class = GetInstructionClass(Inst); - const Value *Arg = 0; - - switch (Class) { - case IC_RetainBlock: - // An objc_retainBlock call with just a use may need to be kept, - // because it may be copying a block from the stack to the heap. 
- if (!IsRetainBlockOptimizable(Inst)) - break; - // FALLTHROUGH - case IC_Retain: - case IC_RetainRV: { - Arg = GetObjCArg(Inst); - - PtrState &S = MyStates.getPtrTopDownState(Arg); - - // Don't do retain+release tracking for IC_RetainRV, because it's - // better to let it remain as the first instruction after a call. - if (Class != IC_RetainRV) { - // If we see two retains in a row on the same pointer. If so, make - // a note, and we'll cicle back to revisit it after we've - // hopefully eliminated the second retain, which may allow us to - // eliminate the first retain too. - // Theoretically we could implement removal of nested retain+release - // pairs by making PtrState hold a stack of states, but this is - // simple and avoids adding overhead for the non-nested case. - if (S.GetSeq() == S_Retain) - NestingDetected = true; - - S.SetSeq(S_Retain); - S.RRI.clear(); - S.RRI.IsRetainBlock = Class == IC_RetainBlock; - // Don't check S.IsKnownIncremented() here because it's not - // sufficient. - S.RRI.KnownSafe = S.IsKnownNested(); - S.RRI.Calls.insert(Inst); - } - - S.SetAtLeastOneRefCount(); - S.IncrementRefCount(); - S.IncrementNestCount(); - continue; - } - case IC_Release: { - Arg = GetObjCArg(Inst); - - PtrState &S = MyStates.getPtrTopDownState(Arg); - S.DecrementRefCount(); - S.DecrementNestCount(); - - switch (S.GetSeq()) { - case S_Retain: - case S_CanRelease: - S.RRI.ReverseInsertPts.clear(); - // FALL THROUGH - case S_Use: - S.RRI.ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind); - S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall(); - Releases[Inst] = S.RRI; - S.ClearSequenceProgress(); - break; - case S_None: - break; - case S_Stop: - case S_Release: - case S_MovableRelease: - llvm_unreachable("top-down pointer in release state!"); - } - break; - } - case IC_AutoreleasepoolPop: - // Conservatively, clear MyStates for all known pointers. - MyStates.clearTopDownPointers(); - continue; - case IC_AutoreleasepoolPush: - case IC_None: - // These are irrelevant. - continue; - default: - break; - } - - // Consider any other possible effects of this instruction on each - // pointer being tracked. - for (BBState::ptr_iterator MI = MyStates.top_down_ptr_begin(), - ME = MyStates.top_down_ptr_end(); MI != ME; ++MI) { - const Value *Ptr = MI->first; - if (Ptr == Arg) - continue; // Handled above. - PtrState &S = MI->second; - Sequence Seq = S.GetSeq(); - - // Check for possible releases. - if (CanAlterRefCount(Inst, Ptr, PA, Class)) { - S.DecrementRefCount(); - switch (Seq) { - case S_Retain: - S.SetSeq(S_CanRelease); - assert(S.RRI.ReverseInsertPts.empty()); - S.RRI.ReverseInsertPts.insert(Inst); - - // One call can't cause a transition from S_Retain to S_CanRelease - // and S_CanRelease to S_Use. If we've made the first transition, - // we're done. - continue; - case S_Use: - case S_CanRelease: - case S_None: - break; - case S_Stop: - case S_Release: - case S_MovableRelease: - llvm_unreachable("top-down pointer in release state!"); - } - } - - // Check for possible direct uses. 
- switch (Seq) { - case S_CanRelease: - if (CanUse(Inst, Ptr, PA, Class)) - S.SetSeq(S_Use); - break; - case S_Retain: - case S_Use: - case S_None: - break; - case S_Stop: - case S_Release: - case S_MovableRelease: - llvm_unreachable("top-down pointer in release state!"); - } - } + NestingDetected |= VisitInstructionTopDown(Inst, Releases, MyStates); } CheckForCFGHazards(BB, BBStates, MyStates); @@ -3032,35 +3160,17 @@ void ObjCARCOpt::MoveCalls(Value *Arg, for (SmallPtrSet<Instruction *, 2>::const_iterator PI = RetainsToMove.ReverseInsertPts.begin(), PE = RetainsToMove.ReverseInsertPts.end(); PI != PE; ++PI) { - Instruction *LastUse = *PI; - Instruction *InsertPts[] = { 0, 0, 0 }; - if (InvokeInst *II = dyn_cast<InvokeInst>(LastUse)) { - // We can't insert code immediately after an invoke instruction, so - // insert code at the beginning of both successor blocks instead. - // The invoke's return value isn't available in the unwind block, - // but our releases will never depend on it, because they must be - // paired with retains from before the invoke. - InsertPts[0] = II->getNormalDest()->getFirstInsertionPt(); - if (!II->getMetadata(NoObjCARCExceptionsMDKind)) - InsertPts[1] = II->getUnwindDest()->getFirstInsertionPt(); - } else { - // Insert code immediately after the last use. - InsertPts[0] = llvm::next(BasicBlock::iterator(LastUse)); - } - - for (Instruction **I = InsertPts; *I; ++I) { - Instruction *InsertPt = *I; - Value *MyArg = ArgTy == ParamTy ? Arg : - new BitCastInst(Arg, ParamTy, "", InsertPt); - CallInst *Call = CallInst::Create(getReleaseCallee(M), MyArg, - "", InsertPt); - // Attach a clang.imprecise_release metadata tag, if appropriate. - if (MDNode *M = ReleasesToMove.ReleaseMetadata) - Call->setMetadata(ImpreciseReleaseMDKind, M); - Call->setDoesNotThrow(); - if (ReleasesToMove.IsTailCallRelease) - Call->setTailCall(); - } + Instruction *InsertPt = *PI; + Value *MyArg = ArgTy == ParamTy ? Arg : + new BitCastInst(Arg, ParamTy, "", InsertPt); + CallInst *Call = CallInst::Create(getReleaseCallee(M), MyArg, + "", InsertPt); + // Attach a clang.imprecise_release metadata tag, if appropriate. + if (MDNode *M = ReleasesToMove.ReleaseMetadata) + Call->setMetadata(ImpreciseReleaseMDKind, M); + Call->setDoesNotThrow(); + if (ReleasesToMove.IsTailCallRelease) + Call->setTailCall(); } // Delete the original retain and release calls. @@ -3080,6 +3190,8 @@ void ObjCARCOpt::MoveCalls(Value *Arg, } } +/// PerformCodePlacement - Identify pairings between the retains and releases, +/// and delete and/or move them. bool ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> &BBStates, @@ -3093,6 +3205,7 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> SmallVector<Instruction *, 4> NewReleases; SmallVector<Instruction *, 8> DeadInsts; + // Visit each retain. for (MapVector<Value *, RRInfo>::const_iterator I = Retains.begin(), E = Retains.end(); I != E; ++I) { Value *V = I->first; @@ -3566,6 +3679,7 @@ bool ObjCARCOpt::doInitialization(Module &M) { if (!EnableARCOpts) return false; + // If nothing in the Module uses ARC, don't do anything. Run = ModuleHasARC(M); if (!Run) return false; @@ -3900,6 +4014,7 @@ void ObjCARCContract::ContractRelease(Instruction *Release, } bool ObjCARCContract::doInitialization(Module &M) { + // If nothing in the Module uses ARC, don't do anything. 
Run = ModuleHasARC(M); if (!Run) return false; @@ -3975,6 +4090,7 @@ bool ObjCARCContract::runOnFunction(Function &F) { --BBI; while (isNoopInstruction(BBI)) --BBI; if (&*BBI == GetObjCArg(Inst)) { + Changed = true; InlineAsm *IA = InlineAsm::get(FunctionType::get(Type::getVoidTy(Inst->getContext()), /*isVarArg=*/false), @@ -4024,16 +4140,19 @@ bool ObjCARCContract::runOnFunction(Function &F) { Use &U = UI.getUse(); unsigned OperandNo = UI.getOperandNo(); ++UI; // Increment UI now, because we may unlink its element. - Instruction *UserInst = dyn_cast<Instruction>(U.getUser()); - if (!UserInst) - continue; - // FIXME: dominates should return true for unreachable UserInst. - if (!DT->isReachableFromEntry(UserInst->getParent()) || - DT->dominates(Inst, UserInst)) { + + // If the call's return value dominates a use of the call's argument + // value, rewrite the use to use the return value. We check for + // reachability here because an unreachable call is considered to + // trivially dominate itself, which would lead us to rewriting its + // argument in terms of its return value, which would lead to + // infinite loops in GetObjCArg. + if (DT->isReachableFromEntry(U) && + DT->dominates(Inst, U)) { Changed = true; Instruction *Replacement = Inst; Type *UseTy = U.get()->getType(); - if (PHINode *PHI = dyn_cast<PHINode>(UserInst)) { + if (PHINode *PHI = dyn_cast<PHINode>(U.getUser())) { // For PHI nodes, insert the bitcast in the predecessor block. unsigned ValNo = PHINode::getIncomingValueNumForOperand(OperandNo); @@ -4042,6 +4161,9 @@ bool ObjCARCContract::runOnFunction(Function &F) { if (Replacement->getType() != UseTy) Replacement = new BitCastInst(Replacement, UseTy, "", &BB->back()); + // While we're here, rewrite all edges for this PHI, rather + // than just one use at a time, to minimize the number of + // bitcasts we emit. for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) if (PHI->getIncomingBlock(i) == BB) { @@ -4054,7 +4176,8 @@ bool ObjCARCContract::runOnFunction(Function &F) { } } else { if (Replacement->getType() != UseTy) - Replacement = new BitCastInst(Replacement, UseTy, "", UserInst); + Replacement = new BitCastInst(Replacement, UseTy, "", + cast<Instruction>(U.getUser())); U.set(Replacement); } } diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index 8f98a5b..cb408a1 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -74,7 +74,7 @@ static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) { namespace { class Reassociate : public FunctionPass { DenseMap<BasicBlock*, unsigned> RankMap; - DenseMap<AssertingVH<>, unsigned> ValueRankMap; + DenseMap<AssertingVH<Value>, unsigned> ValueRankMap; SmallVector<WeakVH, 8> RedoInsts; SmallVector<WeakVH, 8> DeadInsts; bool MadeChange; @@ -210,7 +210,7 @@ static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) { /// LowerNegateToMultiply - Replace 0-X with X*-1. /// static Instruction *LowerNegateToMultiply(Instruction *Neg, - DenseMap<AssertingVH<>, unsigned> &ValueRankMap) { + DenseMap<AssertingVH<Value>, unsigned> &ValueRankMap) { Constant *Cst = Constant::getAllOnesValue(Neg->getType()); Instruction *Res = BinaryOperator::CreateMul(Neg->getOperand(1), Cst, "",Neg); @@ -492,7 +492,7 @@ static bool ShouldBreakUpSubtract(Instruction *Sub) { /// only used by an add, transform this into (X+(0-Y)) to promote better /// reassociation. 
 static Instruction *BreakUpSubtract(Instruction *Sub,
-                         DenseMap<AssertingVH<>, unsigned> &ValueRankMap) {
+                         DenseMap<AssertingVH<Value>, unsigned> &ValueRankMap) {
   // Convert a subtract into an add and a neg instruction. This allows sub
   // instructions to be commuted with other add instructions.
   //
@@ -517,8 +517,8 @@ static Instruction *BreakUpSubtract(Instruction *Sub,
 /// ConvertShiftToMul - If this is a shift of a reassociable multiply or is used
 /// by one, change this into a multiply by a constant to assist with further
 /// reassociation.
-static Instruction *ConvertShiftToMul(Instruction *Shl,
-                         DenseMap<AssertingVH<>, unsigned> &ValueRankMap) {
+static Instruction *ConvertShiftToMul(Instruction *Shl,
+                         DenseMap<AssertingVH<Value>, unsigned> &ValueRankMap) {
   // If an operand of this shift is a reassociable multiply, or if the shift
   // is used by a reassociable multiply or add, turn into a multiply.
   if (isReassociableOp(Shl->getOperand(0), Instruction::Mul) ||
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index 5ce82b9..16b64a5 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -1925,8 +1925,8 @@ bool IPSCCP::runOnModule(Module &M) {
       ReturnsToZap[i]->setOperand(0, UndefValue::get(F->getReturnType()));
   }
 
-  // If we inferred constant or undef values for globals variables, we can delete
-  // the global and any stores that remain to it.
+  // If we inferred constant or undef values for global variables, we can
+  // delete the global and any stores that remain to it.
   const DenseMap<GlobalVariable*, LatticeVal> &TG = Solver.getTrackedGlobals();
   for (DenseMap<GlobalVariable*, LatticeVal>::const_iterator I = TG.begin(),
        E = TG.end(); I != E; ++I) {
diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
index d36a18f..026fea1 100644
--- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp
+++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -13,7 +13,7 @@
 // each member (if possible).  Then, if possible, it transforms the individual
 // alloca instructions into nice clean scalar SSA form.
 //
-// This combines a simple SRoA algorithm with the Mem2Reg algorithm because
+// This combines a simple SRoA algorithm with the Mem2Reg algorithm because they
 // often interact, especially for C++ programs.  As such, iterating between
 // SRoA, then Mem2Reg until we run out of things to promote works well.
 //
@@ -574,8 +574,8 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
     // transform it into a store of the expanded constant value.
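Stepping back to the two Reassociate helpers above: both rewrites are plain algebraic identities, shown here in scalar C++ purely for illustration (not the pass's IR-level code):

    // BreakUpSubtract:  A - B  ==>  A + (0 - B), so the subtraction becomes
    // an add of a negation and can be commuted with neighboring adds.
    long breakUpSubtract(long A, long B) { return A + (0 - B); }

    // ConvertShiftToMul:  X << 3  ==>  X * 8, so a shift by a constant can
    // participate in multiply reassociation.
    long convertShiftToMul(long X) { return X * (1L << 3); }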
if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) { assert(MSI->getRawDest() == Ptr && "Consistency error!"); - signed SNumBytes = cast<ConstantInt>(MSI->getLength())->getSExtValue(); - if (SNumBytes > 0) { + int64_t SNumBytes = cast<ConstantInt>(MSI->getLength())->getSExtValue(); + if (SNumBytes > 0 && (SNumBytes >> 32) == 0) { unsigned NumBytes = static_cast<unsigned>(SNumBytes); unsigned Val = cast<ConstantInt>(MSI->getValue())->getZExtValue(); diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp index 9c49ec1..f7b6941 100644 --- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp +++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp @@ -1583,21 +1583,16 @@ void SimplifyLibCalls::InitOptimizations() { Optimizations["llvm.exp2.f64"] = &Exp2; Optimizations["llvm.exp2.f32"] = &Exp2; -#ifdef HAVE_FLOORF - Optimizations["floor"] = &UnaryDoubleFP; -#endif -#ifdef HAVE_CEILF - Optimizations["ceil"] = &UnaryDoubleFP; -#endif -#ifdef HAVE_ROUNDF - Optimizations["round"] = &UnaryDoubleFP; -#endif -#ifdef HAVE_RINTF - Optimizations["rint"] = &UnaryDoubleFP; -#endif -#ifdef HAVE_NEARBYINTF - Optimizations["nearbyint"] = &UnaryDoubleFP; -#endif + if (TLI->has(LibFunc::floor) && TLI->has(LibFunc::floorf)) + Optimizations["floor"] = &UnaryDoubleFP; + if (TLI->has(LibFunc::ceil) && TLI->has(LibFunc::ceilf)) + Optimizations["ceil"] = &UnaryDoubleFP; + if (TLI->has(LibFunc::round) && TLI->has(LibFunc::roundf)) + Optimizations["round"] = &UnaryDoubleFP; + if (TLI->has(LibFunc::rint) && TLI->has(LibFunc::rintf)) + Optimizations["rint"] = &UnaryDoubleFP; + if (TLI->has(LibFunc::nearbyint) && TLI->has(LibFunc::nearbyintf)) + Optimizations["nearbyint"] = &UnaryDoubleFP; // Integer Optimizations Optimizations["ffs"] = &FFS; diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index 1b28c35..20052a4 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -23,8 +23,11 @@ #include "llvm/LLVMContext.h" #include "llvm/Metadata.h" #include "llvm/Support/CFG.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/DebugInfo.h" #include "llvm/ADT/SmallVector.h" #include <map> @@ -197,7 +200,6 @@ namespace { const Function *OldFunc; ValueToValueMapTy &VMap; bool ModuleLevelChanges; - SmallVectorImpl<ReturnInst*> &Returns; const char *NameSuffix; ClonedCodeInfo *CodeInfo; const TargetData *TD; @@ -205,24 +207,18 @@ namespace { PruningFunctionCloner(Function *newFunc, const Function *oldFunc, ValueToValueMapTy &valueMap, bool moduleLevelChanges, - SmallVectorImpl<ReturnInst*> &returns, const char *nameSuffix, ClonedCodeInfo *codeInfo, const TargetData *td) : NewFunc(newFunc), OldFunc(oldFunc), VMap(valueMap), ModuleLevelChanges(moduleLevelChanges), - Returns(returns), NameSuffix(nameSuffix), CodeInfo(codeInfo), TD(td) { + NameSuffix(nameSuffix), CodeInfo(codeInfo), TD(td) { } /// CloneBlock - The specified block is found to be reachable, clone it and /// anything that it can reach. void CloneBlock(const BasicBlock *BB, std::vector<const BasicBlock*> &ToClone); - - public: - /// ConstantFoldMappedInstruction - Constant fold the specified instruction, - /// mapping its operands through VMap if they are available. 
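On the ScalarReplAggregates fix above: the old code narrowed a 64-bit memset length straight to unsigned, so a length such as 0x100000001 would silently become 1. A minimal standalone illustration of the guarded narrowing, assuming a 32-bit unsigned as the pass does:

    #include <cassert>
    #include <cstdint>

    // Accept a memset length only when it is positive and its upper 32 bits
    // are zero, i.e. when truncation to unsigned is lossless.
    bool lengthFitsInUnsigned(int64_t SNumBytes, unsigned &NumBytes) {
      if (SNumBytes > 0 && (SNumBytes >> 32) == 0) {
        NumBytes = static_cast<unsigned>(SNumBytes);
        return true;
      }
      return false;
    }

    int main() {
      unsigned N;
      assert(lengthFitsInUnsigned(16, N) && N == 16);
      assert(!lengthFitsInUnsigned(INT64_C(0x100000001), N)); // would truncate to 1
    }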
- Constant *ConstantFoldMappedInstruction(const Instruction *I); }; } @@ -230,7 +226,7 @@ namespace { /// anything that it can reach. void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, std::vector<const BasicBlock*> &ToClone){ - TrackingVH<Value> &BBEntry = VMap[BB]; + WeakVH &BBEntry = VMap[BB]; // Have we already cloned this block? if (BBEntry) return; @@ -262,19 +258,33 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, // loop doesn't include the terminator. for (BasicBlock::const_iterator II = BB->begin(), IE = --BB->end(); II != IE; ++II) { - // If this instruction constant folds, don't bother cloning the instruction, - // instead, just add the constant to the value map. - if (Constant *C = ConstantFoldMappedInstruction(II)) { - VMap[II] = C; - continue; + Instruction *NewInst = II->clone(); + + // Eagerly remap operands to the newly cloned instruction, except for PHI + // nodes for which we defer processing until we update the CFG. + if (!isa<PHINode>(NewInst)) { + RemapInstruction(NewInst, VMap, + ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges); + + // If we can simplify this instruction to some other value, simply add + // a mapping to that value rather than inserting a new instruction into + // the basic block. + if (Value *V = SimplifyInstruction(NewInst, TD)) { + // On the off-chance that this simplifies to an instruction in the old + // function, map it back into the new function. + if (Value *MappedV = VMap.lookup(V)) + V = MappedV; + + VMap[II] = V; + delete NewInst; + continue; + } } - Instruction *NewInst = II->clone(); if (II->hasName()) NewInst->setName(II->getName()+NameSuffix); - NewBB->getInstList().push_back(NewInst); VMap[II] = NewInst; // Add instruction map to value. - + NewBB->getInstList().push_back(NewInst); hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II)); if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) { if (isa<ConstantInt>(AI->getArraySize())) @@ -340,33 +350,6 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas && BB != &BB->getParent()->front(); } - - if (ReturnInst *RI = dyn_cast<ReturnInst>(NewBB->getTerminator())) - Returns.push_back(RI); -} - -/// ConstantFoldMappedInstruction - Constant fold the specified instruction, -/// mapping its operands through VMap if they are available. -Constant *PruningFunctionCloner:: -ConstantFoldMappedInstruction(const Instruction *I) { - SmallVector<Constant*, 8> Ops; - for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) - if (Constant *Op = dyn_cast_or_null<Constant>(MapValue(I->getOperand(i), - VMap, - ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges))) - Ops.push_back(Op); - else - return 0; // All operands not constant! - - if (const CmpInst *CI = dyn_cast<CmpInst>(I)) - return ConstantFoldCompareInstOperands(CI->getPredicate(), Ops[0], Ops[1], - TD); - - if (const LoadInst *LI = dyn_cast<LoadInst>(I)) - if (!LI->isVolatile()) - return ConstantFoldLoadFromConstPtr(Ops[0], TD); - - return ConstantFoldInstOperands(I->getOpcode(), I->getType(), Ops, TD); } /// CloneAndPruneFunctionInto - This works exactly like CloneFunctionInto, @@ -393,7 +376,7 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, #endif PruningFunctionCloner PFC(NewFunc, OldFunc, VMap, ModuleLevelChanges, - Returns, NameSuffix, CodeInfo, TD); + NameSuffix, CodeInfo, TD); // Clone the entry block, and anything recursively reachable from it. 
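A side benefit of swapping ConstantFoldMappedInstruction for SimplifyInstruction in the cloner above: folding no longer requires every remapped operand to be a constant. In scalar terms, these are the kinds of identities that now fold during cloning (an illustrative, non-exhaustive list):

    // Pure constant folding needs every operand constant:  3 + 4  ==>  7.
    // Instruction simplification also folds for arbitrary x:
    int addZero(int x) { return x + 0; }  // ==> x
    int andSelf(int x) { return x & x; }  // ==> x
    int subSelf(int x) { return x - x; }  // ==> 0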
  std::vector<const BasicBlock*> CloneWorklist;
@@ -418,25 +401,19 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
 
     // Add the new block to the new function.
     NewFunc->getBasicBlockList().push_back(NewBB);
-
-    // Loop over all of the instructions in the block, fixing up operand
-    // references as we go.  This uses VMap to do all the hard work.
-    //
-    BasicBlock::iterator I = NewBB->begin();
 
     // Handle PHI nodes specially, as we have to remove references to dead
     // blocks.
-    if (PHINode *PN = dyn_cast<PHINode>(I)) {
-      // Skip over all PHI nodes, remembering them for later.
-      BasicBlock::const_iterator OldI = BI->begin();
-      for (; (PN = dyn_cast<PHINode>(I)); ++I, ++OldI)
-        PHIToResolve.push_back(cast<PHINode>(OldI));
-    }
-
-    // Otherwise, remap the rest of the instructions normally.
-    for (; I != NewBB->end(); ++I)
-      RemapInstruction(I, VMap,
-                       ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges);
+    for (BasicBlock::const_iterator I = BI->begin(), E = BI->end(); I != E; ++I)
+      if (const PHINode *PN = dyn_cast<PHINode>(I))
+        PHIToResolve.push_back(PN);
+      else
+        break;
+
+    // Finally, remap the terminator instructions, as those can't be remapped
+    // until all BBs are mapped.
+    RemapInstruction(NewBB->getTerminator(), VMap,
+                     ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges);
   }
 
   // Defer PHI resolution until rest of function is resolved, PHI resolution
@@ -518,31 +495,55 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
           ++OldI;
         }
       }
-    // NOTE: We cannot eliminate single entry phi nodes here, because of
-    // VMap.  Single entry phi nodes can have multiple VMap entries
-    // pointing at them.  Thus, deleting one would require scanning the VMap
-    // to update any entries in it that would require that.  This would be
-    // really slow.
   }
-
+
+  // Make a second pass over the PHINodes now that all of them have been
+  // remapped into the new function, simplifying the PHINode and performing any
+  // recursive simplifications exposed. This will transparently update the
+  // WeakVH in the VMap. Notably, we rely on that so that if we coalesce
+  // two PHINodes, the iteration over the old PHIs remains valid, and the
+  // mapping will just map us to the new node (which may not even be a PHI
+  // node).
+  for (unsigned Idx = 0, Size = PHIToResolve.size(); Idx != Size; ++Idx)
+    if (PHINode *PN = dyn_cast<PHINode>(VMap[PHIToResolve[Idx]]))
+      recursivelySimplifyInstruction(PN, TD);
+
  // Now that the inlined function body has been fully constructed, go through
  // and zap unconditional fall-through branches. This happens all the time when
  // specializing code: code specialization turns conditional branches into
  // uncond branches, and this code folds them.
-  Function::iterator I = cast<BasicBlock>(VMap[&OldFunc->getEntryBlock()]);
+  Function::iterator Begin = cast<BasicBlock>(VMap[&OldFunc->getEntryBlock()]);
+  Function::iterator I = Begin;
   while (I != NewFunc->end()) {
+    // Check if this block has become dead during inlining or other
+    // simplifications. Note that the first block will appear dead, as it has
+    // not yet been wired up properly.
+    if (I != Begin && (pred_begin(I) == pred_end(I) ||
+                       I->getSinglePredecessor() == I)) {
+      BasicBlock *DeadBB = I++;
+      DeleteDeadBlock(DeadBB);
+      continue;
+    }
+
+    // We need to simplify conditional branches and switches with a constant
+    // operand.
+    // We try to prune these out when cloning, but if the simplification
+    // required looking through PHI nodes, those are only available after
+    // forming the full basic block. That may leave some here, and we still
+    // want to prune the dead code as early as possible.
+    ConstantFoldTerminator(I);
+
     BranchInst *BI = dyn_cast<BranchInst>(I->getTerminator());
     if (!BI || BI->isConditional()) { ++I; continue; }
 
-    // Note that we can't eliminate uncond branches if the destination has
-    // single-entry PHI nodes.  Eliminating the single-entry phi nodes would
-    // require scanning the VMap to update any entries that point to the phi
-    // node.
     BasicBlock *Dest = BI->getSuccessor(0);
-    if (!Dest->getSinglePredecessor() || isa<PHINode>(Dest->begin())) {
+    if (!Dest->getSinglePredecessor()) {
       ++I; continue;
     }
-
+
+    // We shouldn't be able to get single-entry PHI nodes here, as instsimplify
+    // above should have zapped all of them.
+    assert(!isa<PHINode>(Dest->begin()));
+
     // We know all single-entry PHI nodes in the inlined function have been
     // removed, so we just need to splice the blocks.
     BI->eraseFromParent();
@@ -558,4 +559,13 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
 
     // Do not increment I, iteratively merge all things this block branches to.
   }
+
+  // Make a final pass over the basic blocks from the old function to gather
+  // any return instructions which survived folding. We have to do this here
+  // because we can iteratively remove and merge returns above.
+  for (Function::iterator I = cast<BasicBlock>(VMap[&OldFunc->getEntryBlock()]),
+                          E = NewFunc->end();
+       I != E; ++I)
+    if (ReturnInst *RI = dyn_cast<ReturnInst>(I->getTerminator()))
+      Returns.push_back(RI);
 }
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index b84de05..d2b167a 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -31,10 +31,12 @@
 #include "llvm/Support/IRBuilder.h"
 using namespace llvm;
 
-bool llvm::InlineFunction(CallInst *CI, InlineFunctionInfo &IFI, bool InsertLifetime) {
+bool llvm::InlineFunction(CallInst *CI, InlineFunctionInfo &IFI,
+                          bool InsertLifetime) {
   return InlineFunction(CallSite(CI), IFI, InsertLifetime);
 }
-bool llvm::InlineFunction(InvokeInst *II, InlineFunctionInfo &IFI, bool InsertLifetime) {
+bool llvm::InlineFunction(InvokeInst *II, InlineFunctionInfo &IFI,
+                          bool InsertLifetime) {
   return InlineFunction(CallSite(II), IFI, InsertLifetime);
 }
 
@@ -434,8 +436,8 @@ static bool hasLifetimeMarkers(AllocaInst *AI) {
   return false;
 }
 
-/// updateInlinedAtInfo - Helper function used by fixupLineNumbers to recursively
-/// update InlinedAtEntry of a DebugLoc.
+/// updateInlinedAtInfo - Helper function used by fixupLineNumbers to
+/// recursively update InlinedAtEntry of a DebugLoc.
 static DebugLoc updateInlinedAtInfo(const DebugLoc &DL,
                                     const DebugLoc &InlinedAtDL,
                                     LLVMContext &Ctx) {
@@ -445,7 +447,7 @@ static DebugLoc updateInlinedAtInfo(const DebugLoc &DL,
     return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(Ctx),
                          NewInlinedAtDL.getAsMDNode(Ctx));
   }
-
+
   return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(Ctx),
                        InlinedAtDL.getAsMDNode(Ctx));
 }
 
@@ -453,7 +455,7 @@ static DebugLoc updateInlinedAtInfo(const DebugLoc &DL,
 /// fixupLineNumbers - Update inlined instructions' line numbers to
 /// encode the location where these instructions are inlined.
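Returning to the dead-block check added to CloneAndPruneFunctionInto above, it reduces to a small predicate. A sketch using the same CFG helpers the patch itself includes (Begin is the cloned entry block, which merely looks dead because its predecessors are wired up later):

    #include "llvm/BasicBlock.h"
    #include "llvm/Support/CFG.h"
    using namespace llvm;

    // A cloned block is prunable if nothing branches to it, or if its only
    // predecessor is itself (an unreachable self-loop). Prunable blocks are
    // then handed to DeleteDeadBlock from BasicBlockUtils.h.
    static bool isPrunableClonedBlock(BasicBlock *BB, BasicBlock *Begin) {
      if (BB == Begin)
        return false; // entry block; not yet wired up
      return pred_begin(BB) == pred_end(BB) || BB->getSinglePredecessor() == BB;
    }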
static void fixupLineNumbers(Function *Fn, Function::iterator FI, - Instruction *TheCall) { + Instruction *TheCall) { DebugLoc TheCallDL = TheCall->getDebugLoc(); if (TheCallDL.isUnknown()) return; @@ -484,7 +486,8 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI, /// instruction 'call B' is inlined, and 'B' calls 'C', then the call to 'C' now /// exists in the instruction stream. Similarly this will inline a recursive /// function by one level. -bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, bool InsertLifetime) { +bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, + bool InsertLifetime) { Instruction *TheCall = CS.getInstruction(); assert(TheCall->getParent() && TheCall->getParent()->getParent() && "Instruction not in function!"); diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 5f895eb..d1c4d59 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -355,22 +355,27 @@ bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN) { /// instructions in other blocks as well in this block. bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const TargetData *TD) { bool MadeChange = false; - for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) { + +#ifndef NDEBUG + // In debug builds, ensure that the terminator of the block is never replaced + // or deleted by these simplifications. The idea of simplification is that it + // cannot introduce new instructions, and there is no way to replace the + // terminator of a block without introducing a new instruction. + AssertingVH<Instruction> TerminatorVH(--BB->end()); +#endif + + for (BasicBlock::iterator BI = BB->begin(), E = --BB->end(); BI != E; ) { + assert(!BI->isTerminator()); Instruction *Inst = BI++; - - if (Value *V = SimplifyInstruction(Inst, TD)) { - WeakVH BIHandle(BI); - ReplaceAndSimplifyAllUses(Inst, V, TD); + + WeakVH BIHandle(BI); + if (recursivelySimplifyInstruction(Inst, TD)) { MadeChange = true; if (BIHandle != BI) BI = BB->begin(); continue; } - if (Inst->isTerminator()) - break; - - WeakVH BIHandle(BI); MadeChange |= RecursivelyDeleteTriviallyDeadInstructions(Inst); if (BIHandle != BI) BI = BB->begin(); @@ -408,17 +413,11 @@ void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred, WeakVH PhiIt = &BB->front(); while (PHINode *PN = dyn_cast<PHINode>(PhiIt)) { PhiIt = &*++BasicBlock::iterator(cast<Instruction>(PhiIt)); + Value *OldPhiIt = PhiIt; - Value *PNV = SimplifyInstruction(PN, TD); - if (PNV == 0) continue; + if (!recursivelySimplifyInstruction(PN, TD)) + continue; - // If we're able to simplify the phi to a single value, substitute the new - // value into all of its uses. - assert(PNV != PN && "SimplifyInstruction broken!"); - - Value *OldPhiIt = PhiIt; - ReplaceAndSimplifyAllUses(PN, PNV, TD); - // If recursive simplification ended up deleting the next PHI node we would // iterate to, then our iterator is invalid, restart scanning from the top // of the block. @@ -763,9 +762,8 @@ unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign, assert(V->getType()->isPointerTy() && "getOrEnforceKnownAlignment expects a pointer!"); unsigned BitWidth = TD ? 
                                          TD->getPointerSizeInBits() : 64;
-  APInt Mask = APInt::getAllOnesValue(BitWidth);
   APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
-  ComputeMaskedBits(V, Mask, KnownZero, KnownOne, TD);
+  ComputeMaskedBits(V, KnownZero, KnownOne, TD);
   unsigned TrailZ = KnownZero.countTrailingOnes();
 
   // Avoid trouble with ridiculously large TrailZ values, such as
diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp
index 512b689..e15497a 100644
--- a/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/lib/Transforms/Utils/LoopUnroll.cpp
@@ -149,6 +149,12 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
     return false;
   }
 
+  // Loops with indirectbr cannot be cloned.
+  if (!L->isSafeToClone()) {
+    DEBUG(dbgs() << "  Can't unroll; Loop body cannot be cloned.\n");
+    return false;
+  }
+
   BasicBlock *Header = L->getHeader();
   BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
 
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index d53a46e..66dd2c9 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -2562,7 +2562,7 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI) {
   Value *Cond = SI->getCondition();
   unsigned Bits = cast<IntegerType>(Cond->getType())->getBitWidth();
   APInt KnownZero(Bits, 0), KnownOne(Bits, 0);
-  ComputeMaskedBits(Cond, APInt::getAllOnesValue(Bits), KnownZero, KnownOne);
+  ComputeMaskedBits(Cond, KnownZero, KnownOne);
 
   // Gather dead cases.
   SmallVector<ConstantInt*, 8> DeadCases;
diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp
index e00565d..4030bef 100644
--- a/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -46,7 +46,6 @@ namespace {
     LoopInfo        *LI;
     DominatorTree   *DT;
     ScalarEvolution *SE;
-    IVUsers         *IU; // NULL for DisableIVRewrite
     const TargetData *TD; // May be NULL
 
     SmallVectorImpl<WeakVH> &DeadInsts;
@@ -59,7 +58,6 @@ namespace {
       L(Loop),
       LI(LPM->getAnalysisIfAvailable<LoopInfo>()),
       SE(SE),
-      IU(IVU),
       TD(LPM->getAnalysisIfAvailable<TargetData>()),
       DeadInsts(Dead),
       Changed(false) {
@@ -229,13 +227,6 @@ void SimplifyIndvar::eliminateIVRemainder(BinaryOperator *Rem,
     Rem->replaceAllUsesWith(Sel);
   }
 
-  // Inform IVUsers about the new users.
-  if (IU) {
-    if (Instruction *I = dyn_cast<Instruction>(Rem->getOperand(0))) {
-      SmallPtrSet<Loop*, 16> SimplifiedLoopNests;
-      IU->AddUsersIfInteresting(I, SimplifiedLoopNests);
-    }
-  }
   DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n');
   ++NumElimRem;
   Changed = true;
@@ -401,36 +392,4 @@ bool simplifyLoopIVs(Loop *L, ScalarEvolution *SE, LPPassManager *LPM,
   return Changed;
 }
 
-/// simplifyIVUsers - Perform simplification on instructions recorded by the
-/// IVUsers pass.
-///
-/// This is the old approach to IV simplification to be replaced by
-/// SimplifyLoopIVs.
-bool simplifyIVUsers(IVUsers *IU, ScalarEvolution *SE, LPPassManager *LPM,
-                     SmallVectorImpl<WeakVH> &Dead) {
-  SimplifyIndvar SIV(IU->getLoop(), SE, LPM, Dead);
-
-  // Each round of simplification involves a round of eliminating operations
-  // followed by a round of widening IVs. A single IVUsers worklist is used
-  // across all rounds. The inner loop advances the user. If widening exposes
-  // more uses, then another pass through the outer loop is triggered.
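On the getOrEnforceKnownAlignment change above (ComputeMaskedBits losing its explicit mask argument): the alignment itself still comes from counting known-zero low bits. A standalone model of that derivation; the clamp constant here is illustrative, chosen only to keep the shift defined:

    #include <cstdint>

    // KnownZero has a 1 for every bit proven zero. If the low TrailZ bits of
    // a pointer are known zero, the pointer is 2^TrailZ-byte aligned.
    unsigned alignmentFromKnownZeroBits(uint64_t KnownZero) {
      unsigned TrailZ = 0;
      while (TrailZ < 31 && (KnownZero & (UINT64_C(1) << TrailZ)))
        ++TrailZ;              // count trailing known-zero bits, clamped
      return 1u << TrailZ;     // e.g. low 4 bits known zero => 16-byte aligned
    }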
- for (IVUsers::iterator I = IU->begin(); I != IU->end(); ++I) { - Instruction *UseInst = I->getUser(); - Value *IVOperand = I->getOperandValToReplace(); - - if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) { - SIV.eliminateIVComparison(ICmp, IVOperand); - continue; - } - if (BinaryOperator *Rem = dyn_cast<BinaryOperator>(UseInst)) { - bool IsSigned = Rem->getOpcode() == Instruction::SRem; - if (IsSigned || Rem->getOpcode() == Instruction::URem) { - SIV.eliminateIVRemainder(Rem, IVOperand, IsSigned); - continue; - } - } - } - return SIV.hasChanged(); -} - } // namespace llvm diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp index 32eec79..9d62306 100644 --- a/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/lib/Transforms/Vectorize/BBVectorize.cpp @@ -84,6 +84,10 @@ NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize floating-point values")); static cl::opt<bool> +NoPointers("bb-vectorize-no-pointers", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize pointer values")); + +static cl::opt<bool> NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize casting (conversion) operations")); @@ -96,6 +100,14 @@ NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize the fused-multiply-add intrinsic")); static cl::opt<bool> +NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize select instructions")); + +static cl::opt<bool> +NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize getelementptr instructions")); + +static cl::opt<bool> NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize loads and stores")); @@ -140,10 +152,21 @@ STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize"); namespace { struct BBVectorize : public BasicBlockPass { static char ID; // Pass identification, replacement for typeid - BBVectorize() : BasicBlockPass(ID) { + + const VectorizeConfig Config; + + BBVectorize(const VectorizeConfig &C = VectorizeConfig()) + : BasicBlockPass(ID), Config(C) { initializeBBVectorizePass(*PassRegistry::getPassRegistry()); } + BBVectorize(Pass *P, const VectorizeConfig &C) + : BasicBlockPass(ID), Config(C) { + AA = &P->getAnalysis<AliasAnalysis>(); + SE = &P->getAnalysis<ScalarEvolution>(); + TD = P->getAnalysisIfAvailable<TargetData>(); + } + typedef std::pair<Value *, Value *> ValuePair; typedef std::pair<ValuePair, size_t> ValuePairWithDepth; typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair @@ -280,18 +303,15 @@ namespace { Instruction *&InsertionPt, Instruction *I, Instruction *J); - virtual bool runOnBasicBlock(BasicBlock &BB) { - AA = &getAnalysis<AliasAnalysis>(); - SE = &getAnalysis<ScalarEvolution>(); - TD = getAnalysisIfAvailable<TargetData>(); - + bool vectorizeBB(BasicBlock &BB) { bool changed = false; // Iterate a sufficient number of times to merge types of size 1 bit, // then 2 bits, then 4, etc. up to half of the target vector width of the // target vector register. 
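The new BBVectorize(Pass *, const VectorizeConfig &) constructor above exists so another pass can drive the vectorizer in-process, together with the vectorizeBasicBlock and createBBVectorizePass entry points added at the end of this file. A hypothetical caller, assuming the usual Vectorize.h header; the config fields are the ones this patch introduces:

    #include "llvm/Transforms/Vectorize.h"  // assumed home of VectorizeConfig
    using namespace llvm;

    // Call from inside a pass P that has already required AliasAnalysis and
    // ScalarEvolution, as the new constructor expects.
    bool vectorizeWithCustomConfig(Pass *P, BasicBlock &BB) {
      VectorizeConfig C;        // fields default to the command-line flags
      C.VectorBits = 256;       // pretend the target has 256-bit registers
      C.VectorizeGEP = false;   // keep address computations scalar
      return vectorizeBasicBlock(P, BB, C);
    }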
-    for (unsigned v = 2, n = 1; v <= VectorBits && (!MaxIter || n <= MaxIter);
+    for (unsigned v = 2, n = 1;
+         v <= Config.VectorBits && (!Config.MaxIter || n <= Config.MaxIter);
          v *= 2, ++n) {
-      DEBUG(dbgs() << "BBV: fusing loop #" << n <<
+      DEBUG(dbgs() << "BBV: fusing loop #" << n <<
             " for " << BB.getName() << " in " <<
             BB.getParent()->getName() << "...\n");
       if (vectorizePairs(BB))
@@ -304,6 +324,14 @@ namespace {
       return changed;
     }
 
+    virtual bool runOnBasicBlock(BasicBlock &BB) {
+      AA = &getAnalysis<AliasAnalysis>();
+      SE = &getAnalysis<ScalarEvolution>();
+      TD = getAnalysisIfAvailable<TargetData>();
+
+      return vectorizeBB(BB);
+    }
+
     virtual void getAnalysisUsage(AnalysisUsage &AU) const {
       BasicBlockPass::getAnalysisUsage(AU);
       AU.addRequired<AliasAnalysis>();
@@ -333,7 +361,7 @@ namespace {
     // candidate chains where longer chains are considered to be better.
     // Note: when this function returns 0, the resulting instructions are
     // not actually fused.
-    static inline size_t getDepthFactor(Value *V) {
+    inline size_t getDepthFactor(Value *V) {
       // InsertElement and ExtractElement have a depth factor of zero. This is
       // for two reasons: First, they cannot be usefully fused. Second, because
       // the pass generates a lot of these, they can confuse the simple metric
@@ -347,8 +375,8 @@ namespace {
 
       // Give a load or store half of the required depth so that load/store
      // pairs will vectorize.
-      if (!NoMemOpBoost && (isa<LoadInst>(V) || isa<StoreInst>(V)))
-        return ReqChainDepth/2;
+      if (!Config.NoMemOpBoost && (isa<LoadInst>(V) || isa<StoreInst>(V)))
+        return Config.ReqChainDepth/2;
 
       return 1;
     }
@@ -421,9 +449,9 @@ namespace {
         case Intrinsic::exp:
         case Intrinsic::exp2:
         case Intrinsic::pow:
-          return !NoMath;
+          return Config.VectorizeMath;
         case Intrinsic::fma:
-          return !NoFMA;
+          return Config.VectorizeFMA;
         }
       }
 
@@ -517,24 +545,34 @@ namespace {
       } else if (LoadInst *L = dyn_cast<LoadInst>(I)) {
         // Vectorize simple loads if possible:
         IsSimpleLoadStore = L->isSimple();
-        if (!IsSimpleLoadStore || NoMemOps)
+        if (!IsSimpleLoadStore || !Config.VectorizeMemOps)
           return false;
       } else if (StoreInst *S = dyn_cast<StoreInst>(I)) {
         // Vectorize simple stores if possible:
         IsSimpleLoadStore = S->isSimple();
-        if (!IsSimpleLoadStore || NoMemOps)
+        if (!IsSimpleLoadStore || !Config.VectorizeMemOps)
           return false;
       } else if (CastInst *C = dyn_cast<CastInst>(I)) {
         // We can vectorize casts, but not casts of pointer types, etc.
-        if (NoCasts)
+        if (!Config.VectorizeCasts)
           return false;
         Type *SrcTy = C->getSrcTy();
-        if (!SrcTy->isSingleValueType() || SrcTy->isPointerTy())
+        if (!SrcTy->isSingleValueType())
           return false;
 
         Type *DestTy = C->getDestTy();
-        if (!DestTy->isSingleValueType() || DestTy->isPointerTy())
+        if (!DestTy->isSingleValueType())
+          return false;
+      } else if (isa<SelectInst>(I)) {
+        if (!Config.VectorizeSelect)
+          return false;
+      } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(I)) {
+        if (!Config.VectorizeGEP)
+          return false;
+
+        // Currently, vector GEPs exist only with one index.
+ if (G->getNumIndices() != 1) return false; } else if (!(I->isBinaryOp() || isa<ShuffleVectorInst>(I) || isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) { @@ -566,14 +604,21 @@ namespace { !(VectorType::isValidElementType(T2) || T2->isVectorTy())) return false; - if (NoInts && (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy())) + if (!Config.VectorizeInts + && (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy())) + return false; + + if (!Config.VectorizeFloats + && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy())) return false; - if (NoFloats && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy())) + if ((!Config.VectorizePointers || TD == 0) && + (T1->getScalarType()->isPointerTy() || + T2->getScalarType()->isPointerTy())) return false; - if (T1->getPrimitiveSizeInBits() > VectorBits/2 || - T2->getPrimitiveSizeInBits() > VectorBits/2) + if (T1->getPrimitiveSizeInBits() > Config.VectorBits/2 || + T2->getPrimitiveSizeInBits() > Config.VectorBits/2) return false; return true; @@ -601,7 +646,7 @@ namespace { LI->isVolatile() != LJ->isVolatile() || LI->getOrdering() != LJ->getOrdering() || LI->getSynchScope() != LJ->getSynchScope()) - return false; + return false; } else if ((SI = dyn_cast<StoreInst>(I)) && (SJ = dyn_cast<StoreInst>(J))) { if (SI->getValueOperand()->getType() != SJ->getValueOperand()->getType() || @@ -622,7 +667,7 @@ namespace { int64_t OffsetInElmts = 0; if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, OffsetInElmts) && abs64(OffsetInElmts) == 1) { - if (AlignedOnly) { + if (Config.AlignedOnly) { Type *aType = isa<StoreInst>(I) ? cast<StoreInst>(I)->getValueOperand()->getType() : I->getType(); // An aligned load or store is possible only if the instruction @@ -647,6 +692,20 @@ namespace { // FIXME: We may want to vectorize non-constant shuffles also. } + // The powi intrinsic is special because only the first argument is + // vectorized, the second arguments must be equal. + CallInst *CI = dyn_cast<CallInst>(I); + Function *FI; + if (CI && (FI = CI->getCalledFunction()) && + FI->getIntrinsicID() == Intrinsic::powi) { + + Value *A1I = CI->getArgOperand(1), + *A1J = cast<CallInst>(J)->getArgOperand(1); + const SCEV *A1ISCEV = SE->getSCEV(A1I), + *A1JSCEV = SE->getSCEV(A1J); + return (A1ISCEV == A1JSCEV); + } + return true; } @@ -729,12 +788,12 @@ namespace { AliasSetTracker WriteSet(*AA); bool JAfterStart = IAfterStart; BasicBlock::iterator J = llvm::next(I); - for (unsigned ss = 0; J != E && ss <= SearchLimit; ++J, ++ss) { + for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) { if (J == Start) JAfterStart = true; // Determine if J uses I, if so, exit the loop. - bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !FastDep); - if (FastDep) { + bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !Config.FastDep); + if (Config.FastDep) { // Note: For this heuristic to be effective, independent operations // must tend to be intermixed. This is likely to be true from some // kinds of grouped loop unrolling (but not the generic LLVM pass), @@ -772,7 +831,7 @@ namespace { // If we have already found too many pairs, break here and this function // will be called again starting after the last instruction selected // during this invocation. 
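On the powi special case above: only the first operand is vectorized, while the exponent stays scalar and is shared between both fused calls, so fusion is only sound when the two exponents are provably equal, which is what the SCEV comparison establishes. A scalar model of the constraint:

    #include <cmath>

    // A fused two-lane powi evaluates both lanes with one shared exponent N,
    // so fusing powi(a, n) with powi(b, m) is only correct when n == m.
    void fusedPowi2(const double A[2], int N, double Out[2]) {
      Out[0] = std::pow(A[0], N);
      Out[1] = std::pow(A[1], N);
    }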
- if (PairableInsts.size() >= MaxInsts) { + if (PairableInsts.size() >= Config.MaxInsts) { ShouldContinue = true; break; } @@ -796,16 +855,33 @@ namespace { std::vector<Value *> &PairableInsts, std::multimap<ValuePair, ValuePair> &ConnectedPairs, ValuePair P) { + StoreInst *SI, *SJ; + // For each possible pairing for this variable, look at the uses of // the first value... for (Value::use_iterator I = P.first->use_begin(), E = P.first->use_end(); I != E; ++I) { + if (isa<LoadInst>(*I)) { + // A pair cannot be connected to a load because the load only takes one + // operand (the address) and it is a scalar even after vectorization. + continue; + } else if ((SI = dyn_cast<StoreInst>(*I)) && + P.first == SI->getPointerOperand()) { + // Similarly, a pair cannot be connected to a store through its + // pointer operand. + continue; + } + VPIteratorPair IPairRange = CandidatePairs.equal_range(*I); // For each use of the first variable, look for uses of the second // variable... for (Value::use_iterator J = P.second->use_begin(), E2 = P.second->use_end(); J != E2; ++J) { + if ((SJ = dyn_cast<StoreInst>(*J)) && + P.second == SJ->getPointerOperand()) + continue; + VPIteratorPair JPairRange = CandidatePairs.equal_range(*J); // Look for <I, J>: @@ -817,23 +893,37 @@ namespace { ConnectedPairs.insert(VPPair(P, ValuePair(*J, *I))); } - if (SplatBreaksChain) continue; + if (Config.SplatBreaksChain) continue; // Look for cases where just the first value in the pair is used by // both members of another pair (splatting). for (Value::use_iterator J = P.first->use_begin(); J != E; ++J) { + if ((SJ = dyn_cast<StoreInst>(*J)) && + P.first == SJ->getPointerOperand()) + continue; + if (isSecondInIteratorPair<Value*>(*J, IPairRange)) ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); } } - if (SplatBreaksChain) return; + if (Config.SplatBreaksChain) return; // Look for cases where just the second value in the pair is used by // both members of another pair (splatting). 
for (Value::use_iterator I = P.second->use_begin(), E = P.second->use_end(); I != E; ++I) { + if (isa<LoadInst>(*I)) + continue; + else if ((SI = dyn_cast<StoreInst>(*I)) && + P.second == SI->getPointerOperand()) + continue; + VPIteratorPair IPairRange = CandidatePairs.equal_range(*I); for (Value::use_iterator J = P.second->use_begin(); J != E; ++J) { + if ((SJ = dyn_cast<StoreInst>(*J)) && + P.second == SJ->getPointerOperand()) + continue; + if (isSecondInIteratorPair<Value*>(*J, IPairRange)) ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); } @@ -1256,7 +1346,7 @@ namespace { << *J->first << " <-> " << *J->second << "} of depth " << MaxDepth << " and size " << PrunedTree.size() << " (effective size: " << EffSize << ")\n"); - if (MaxDepth >= ReqChainDepth && EffSize > BestEffSize) { + if (MaxDepth >= Config.ReqChainDepth && EffSize > BestEffSize) { BestMaxDepth = MaxDepth; BestEffSize = EffSize; BestTree = PrunedTree; @@ -1272,7 +1362,8 @@ namespace { std::multimap<ValuePair, ValuePair> &ConnectedPairs, DenseSet<ValuePair> &PairableInstUsers, DenseMap<Value *, Value *>& ChosenPairs) { - bool UseCycleCheck = CandidatePairs.size() <= MaxCandPairsForCycleCheck; + bool UseCycleCheck = + CandidatePairs.size() <= Config.MaxCandPairsForCycleCheck; std::multimap<ValuePair, ValuePair> PairableInstUserMap; for (std::vector<Value *>::iterator I = PairableInsts.begin(), E = PairableInsts.end(); I != E; ++I) { @@ -1518,19 +1609,27 @@ namespace { ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o, FlipMemInputs); continue; - } else if (isa<CallInst>(I) && o == NumOperands-1) { + } else if (isa<CallInst>(I)) { Function *F = cast<CallInst>(I)->getCalledFunction(); unsigned IID = F->getIntrinsicID(); - BasicBlock &BB = *I->getParent(); + if (o == NumOperands-1) { + BasicBlock &BB = *I->getParent(); - Module *M = BB.getParent()->getParent(); - Type *ArgType = I->getType(); - Type *VArgType = getVecTypeForPair(ArgType); + Module *M = BB.getParent()->getParent(); + Type *ArgType = I->getType(); + Type *VArgType = getVecTypeForPair(ArgType); - // FIXME: is it safe to do this here? - ReplacedOperands[o] = Intrinsic::getDeclaration(M, - (Intrinsic::ID) IID, VArgType); - continue; + // FIXME: is it safe to do this here? + ReplacedOperands[o] = Intrinsic::getDeclaration(M, + (Intrinsic::ID) IID, VArgType); + continue; + } else if (IID == Intrinsic::powi && o == 1) { + // The second argument of powi is a single integer and we've already + // checked that both arguments are equal. As a result, we just keep + // I's second argument. 
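Back on the load/store filtering added to the pair-connection search above, the rule being enforced is that a use can connect two pairs only if that use could itself become a vector operand. A compact restatement (illustrative, not the pass's exact code):

    #include "llvm/Instructions.h"
    using namespace llvm;

    // The address operand of a load or store stays scalar after
    // vectorization, so a use of V as a pointer operand never connects
    // pairs; a use as a store's value operand still can.
    static bool useCanConnectPairs(Instruction *User, Value *V) {
      if (isa<LoadInst>(User))
        return false;
      if (StoreInst *SI = dyn_cast<StoreInst>(User))
        return V != SI->getPointerOperand();
      return true;
    }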
+ ReplacedOperands[o] = I->getOperand(o); + continue; + } } else if (isa<ShuffleVectorInst>(I) && o == NumOperands-1) { ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J); continue; @@ -1835,7 +1934,35 @@ INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) -BasicBlockPass *llvm::createBBVectorizePass() { - return new BBVectorize(); +BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) { + return new BBVectorize(C); +} + +bool +llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) { + BBVectorize BBVectorizer(P, C); + return BBVectorizer.vectorizeBB(BB); } +//===----------------------------------------------------------------------===// +VectorizeConfig::VectorizeConfig() { + VectorBits = ::VectorBits; + VectorizeInts = !::NoInts; + VectorizeFloats = !::NoFloats; + VectorizePointers = !::NoPointers; + VectorizeCasts = !::NoCasts; + VectorizeMath = !::NoMath; + VectorizeFMA = !::NoFMA; + VectorizeSelect = !::NoSelect; + VectorizeGEP = !::NoGEP; + VectorizeMemOps = !::NoMemOps; + AlignedOnly = ::AlignedOnly; + ReqChainDepth= ::ReqChainDepth; + SearchLimit = ::SearchLimit; + MaxCandPairsForCycleCheck = ::MaxCandPairsForCycleCheck; + SplatBreaksChain = ::SplatBreaksChain; + MaxInsts = ::MaxInsts; + MaxIter = ::MaxIter; + NoMemOpBoost = ::NoMemOpBoost; + FastDep = ::FastDep; +} diff --git a/lib/VMCore/AutoUpgrade.cpp b/lib/VMCore/AutoUpgrade.cpp index ea3d4ba..2e16372 100644 --- a/lib/VMCore/AutoUpgrade.cpp +++ b/lib/VMCore/AutoUpgrade.cpp @@ -18,9 +18,6 @@ #include "llvm/LLVMContext.h" #include "llvm/Module.h" #include "llvm/IntrinsicInst.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/CFG.h" #include "llvm/Support/ErrorHandling.h" @@ -59,7 +56,8 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { if (Name.startswith("x86.sse2.pcmpeq.") || Name.startswith("x86.sse2.pcmpgt.") || Name.startswith("x86.avx2.pcmpeq.") || - Name.startswith("x86.avx2.pcmpgt.")) { + Name.startswith("x86.avx2.pcmpgt.") || + Name.startswith("x86.avx.vpermil.")) { NewFn = 0; return true; } @@ -121,7 +119,42 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { // need to sign extend since icmp returns vector of i1 Rep = Builder.CreateSExt(Rep, CI->getType(), ""); } else { - llvm_unreachable("Unknown function for CallInst upgrade."); + bool PD128 = false, PD256 = false, PS128 = false, PS256 = false; + if (Name.startswith("llvm.x86.avx.vpermil.pd.256")) + PD256 = true; + else if (Name.startswith("llvm.x86.avx.vpermil.pd")) + PD128 = true; + else if (Name.startswith("llvm.x86.avx.vpermil.ps.256")) + PS256 = true; + else if (Name.startswith("llvm.x86.avx.vpermil.ps")) + PS128 = true; + + if (PD256 || PD128 || PS256 || PS128) { + Value *Op0 = CI->getArgOperand(0); + unsigned Imm = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue(); + SmallVector<Constant*, 8> Idxs; + + if (PD128) + for (unsigned i = 0; i != 2; ++i) + Idxs.push_back(Builder.getInt32((Imm >> i) & 0x1)); + else if (PD256) + for (unsigned l = 0; l != 4; l+=2) + for (unsigned i = 0; i != 2; ++i) + Idxs.push_back(Builder.getInt32(((Imm >> (l+i)) & 0x1) + l)); + else if (PS128) + for (unsigned i = 0; i != 4; ++i) + Idxs.push_back(Builder.getInt32((Imm >> (2 * i)) & 0x3)); + else if (PS256) + for (unsigned l = 0; l != 8; l+=4) + for (unsigned i = 0; i != 4; 
++i) + Idxs.push_back(Builder.getInt32(((Imm >> (2 * i)) & 0x3) + l)); + else + llvm_unreachable("Unexpected function"); + + Rep = Builder.CreateShuffleVector(Op0, Op0, ConstantVector::get(Idxs)); + } else { + llvm_unreachable("Unknown function for CallInst upgrade."); + } } CI->replaceAllUsesWith(Rep); diff --git a/lib/VMCore/CMakeLists.txt b/lib/VMCore/CMakeLists.txt index 99eeba1..e1efcda 100644 --- a/lib/VMCore/CMakeLists.txt +++ b/lib/VMCore/CMakeLists.txt @@ -8,7 +8,6 @@ add_llvm_library(LLVMCore ConstantFold.cpp Constants.cpp Core.cpp - DebugInfoProbe.cpp DebugLoc.cpp Dominators.cpp Function.cpp diff --git a/lib/VMCore/Core.cpp b/lib/VMCore/Core.cpp index e86d805..a9cca22 100644 --- a/lib/VMCore/Core.cpp +++ b/lib/VMCore/Core.cpp @@ -2066,6 +2066,20 @@ LLVMValueRef LLVMBuildGlobalStringPtr(LLVMBuilderRef B, const char *Str, return wrap(unwrap(B)->CreateGlobalStringPtr(Str, Name)); } +LLVMBool LLVMGetVolatile(LLVMValueRef MemAccessInst) { + Value *P = unwrap<Value>(MemAccessInst); + if (LoadInst *LI = dyn_cast<LoadInst>(P)) + return LI->isVolatile(); + return cast<StoreInst>(P)->isVolatile(); +} + +void LLVMSetVolatile(LLVMValueRef MemAccessInst, LLVMBool isVolatile) { + Value *P = unwrap<Value>(MemAccessInst); + if (LoadInst *LI = dyn_cast<LoadInst>(P)) + return LI->setVolatile(isVolatile); + return cast<StoreInst>(P)->setVolatile(isVolatile); +} + /*--.. Casts ...............................................................--*/ LLVMValueRef LLVMBuildTrunc(LLVMBuilderRef B, LLVMValueRef Val, diff --git a/lib/VMCore/DebugInfoProbe.cpp b/lib/VMCore/DebugInfoProbe.cpp deleted file mode 100644 index d1275ff..0000000 --- a/lib/VMCore/DebugInfoProbe.cpp +++ /dev/null @@ -1,225 +0,0 @@ -//===-- DebugInfoProbe.cpp - DebugInfo Probe ------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements DebugInfoProbe. This probe can be used by a pass -// manager to analyze how optimizer is treating debugging information. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "debuginfoprobe" -#include "llvm/DebugInfoProbe.h" -#include "llvm/Function.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Metadata.h" -#include "llvm/PassManager.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/DebugLoc.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/StringRef.h" -#include <set> -#include <string> - -using namespace llvm; - -static cl::opt<bool> -EnableDebugInfoProbe("enable-debug-info-probe", cl::Hidden, - cl::desc("Enable debug info probe")); - -// CreateInfoOutputFile - Return a file stream to print our output on. -namespace llvm { extern raw_ostream *CreateInfoOutputFile(); } - -//===----------------------------------------------------------------------===// -// DebugInfoProbeImpl - This class implements a interface to monitor -// how an optimization pass is preserving debugging information. 
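The two C API entry points added to Core.cpp above expose the volatile bit on loads and stores to language bindings. A hypothetical client snippet (valid C++; LLVMBuildLoad is the existing C API builder call):

    #include "llvm-c/Core.h"

    // Emit a load through the C API and mark it volatile if it is not already.
    void buildVolatileLoad(LLVMBuilderRef B, LLVMValueRef Ptr) {
      LLVMValueRef L = LLVMBuildLoad(B, Ptr, "val");
      if (!LLVMGetVolatile(L))
        LLVMSetVolatile(L, 1);  // LLVMBool: nonzero means volatile
    }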
- -namespace llvm { - - class DebugInfoProbeImpl { - public: - DebugInfoProbeImpl() : NumDbgLineLost(0),NumDbgValueLost(0) {} - void initialize(StringRef PName, Function &F); - void finalize(Function &F); - void report(); - private: - unsigned NumDbgLineLost, NumDbgValueLost; - std::string PassName; - Function *TheFn; - std::set<MDNode *> DbgVariables; - std::set<Instruction *> MissingDebugLoc; - }; -} - -//===----------------------------------------------------------------------===// -// DebugInfoProbeImpl - -/// initialize - Collect information before running an optimization pass. -void DebugInfoProbeImpl::initialize(StringRef PName, Function &F) { - if (!EnableDebugInfoProbe) return; - PassName = PName; - - DbgVariables.clear(); - MissingDebugLoc.clear(); - TheFn = &F; - - for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) - for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); - BI != BE; ++BI) { - if (!isa<PHINode>(BI) && BI->getDebugLoc().isUnknown()) - MissingDebugLoc.insert(BI); - if (!isa<DbgInfoIntrinsic>(BI)) continue; - Value *Addr = NULL; - MDNode *Node = NULL; - if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(BI)) { - Addr = DDI->getAddress(); - Node = DDI->getVariable(); - } else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(BI)) { - Addr = DVI->getValue(); - Node = DVI->getVariable(); - } - if (Addr) - DbgVariables.insert(Node); - } -} - -/// report - Report findings. This should be invoked after finalize. -void DebugInfoProbeImpl::report() { - if (!EnableDebugInfoProbe) return; - if (NumDbgLineLost || NumDbgValueLost) { - raw_ostream *OutStream = CreateInfoOutputFile(); - if (NumDbgLineLost) - *OutStream << NumDbgLineLost - << "\t times line number info lost by " - << PassName << "\n"; - if (NumDbgValueLost) - *OutStream << NumDbgValueLost - << "\t times variable info lost by " - << PassName << "\n"; - delete OutStream; - } - NumDbgLineLost = 0; - NumDbgValueLost = 0; -} - -/// finalize - Collect information after running an optimization pass. This -/// must be used after initialization. 
-void DebugInfoProbeImpl::finalize(Function &F) { - if (!EnableDebugInfoProbe) return; - assert (TheFn == &F && "Invalid function to measure!"); - - std::set<MDNode *>DbgVariables2; - for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) - for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); - BI != BE; ++BI) { - if (!isa<PHINode>(BI) && BI->getDebugLoc().isUnknown() && - MissingDebugLoc.count(BI) == 0) { - ++NumDbgLineLost; - DEBUG(dbgs() << "DebugInfoProbe (" << PassName << "): --- "); - DEBUG(BI->print(dbgs())); - DEBUG(dbgs() << "\n"); - } - if (!isa<DbgInfoIntrinsic>(BI)) continue; - Value *Addr = NULL; - MDNode *Node = NULL; - if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(BI)) { - Addr = DDI->getAddress(); - Node = DDI->getVariable(); - } else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(BI)) { - Addr = DVI->getValue(); - Node = DVI->getVariable(); - } - if (Addr) - DbgVariables2.insert(Node); - } - - for (std::set<MDNode *>::iterator I = DbgVariables.begin(), - E = DbgVariables.end(); I != E; ++I) { - if (DbgVariables2.count(*I) == 0 && (*I)->getNumOperands() >= 2) { - DEBUG(dbgs() - << "DebugInfoProbe(" - << PassName - << "): Losing dbg info for variable: "; - if (MDString *MDS = dyn_cast_or_null<MDString>( - (*I)->getOperand(2))) - dbgs() << MDS->getString(); - else - dbgs() << "..."; - dbgs() << "\n"); - ++NumDbgValueLost; - } - } -} - -//===----------------------------------------------------------------------===// -// DebugInfoProbe - -DebugInfoProbe::DebugInfoProbe() { - pImpl = new DebugInfoProbeImpl(); -} - -DebugInfoProbe::~DebugInfoProbe() { - delete pImpl; -} - -/// initialize - Collect information before running an optimization pass. -void DebugInfoProbe::initialize(StringRef PName, Function &F) { - pImpl->initialize(PName, F); -} - -/// finalize - Collect information after running an optimization pass. This -/// must be used after initialization. -void DebugInfoProbe::finalize(Function &F) { - pImpl->finalize(F); -} - -/// report - Report findings. This should be invoked after finalize. -void DebugInfoProbe::report() { - pImpl->report(); -} - -//===----------------------------------------------------------------------===// -// DebugInfoProbeInfo - -/// ~DebugInfoProbeInfo - Report data collected by all probes before deleting -/// them. -DebugInfoProbeInfo::~DebugInfoProbeInfo() { - if (!EnableDebugInfoProbe) return; - for (StringMap<DebugInfoProbe*>::iterator I = Probes.begin(), - E = Probes.end(); I != E; ++I) { - I->second->report(); - delete I->second; - } - } - -/// initialize - Collect information before running an optimization pass. -void DebugInfoProbeInfo::initialize(Pass *P, Function &F) { - if (!EnableDebugInfoProbe) return; - if (P->getAsPMDataManager()) - return; - - StringMapEntry<DebugInfoProbe *> &Entry = - Probes.GetOrCreateValue(P->getPassName()); - DebugInfoProbe *&Probe = Entry.getValue(); - if (!Probe) - Probe = new DebugInfoProbe(); - Probe->initialize(P->getPassName(), F); -} - -/// finalize - Collect information after running an optimization pass. This -/// must be used after initialization. 
-void DebugInfoProbeInfo::finalize(Pass *P, Function &F) { - if (!EnableDebugInfoProbe) return; - if (P->getAsPMDataManager()) - return; - StringMapEntry<DebugInfoProbe *> &Entry = - Probes.GetOrCreateValue(P->getPassName()); - DebugInfoProbe *&Probe = Entry.getValue(); - assert (Probe && "DebugInfoProbe is not initialized!"); - Probe->finalize(F); -} diff --git a/lib/VMCore/DebugLoc.cpp b/lib/VMCore/DebugLoc.cpp index 328244f..9013d28 100644 --- a/lib/VMCore/DebugLoc.cpp +++ b/lib/VMCore/DebugLoc.cpp @@ -173,10 +173,7 @@ DebugLoc DenseMapInfo<DebugLoc>::getTombstoneKey() { } unsigned DenseMapInfo<DebugLoc>::getHashValue(const DebugLoc &Key) { - FoldingSetNodeID ID; - ID.AddInteger(Key.LineCol); - ID.AddInteger(Key.ScopeIdx); - return ID.ComputeHash(); + return static_cast<unsigned>(hash_combine(Key.LineCol, Key.ScopeIdx)); } bool DenseMapInfo<DebugLoc>::isEqual(const DebugLoc &LHS, const DebugLoc &RHS) { diff --git a/lib/VMCore/Dominators.cpp b/lib/VMCore/Dominators.cpp index af51a05..219e631 100644 --- a/lib/VMCore/Dominators.cpp +++ b/lib/VMCore/Dominators.cpp @@ -88,8 +88,13 @@ bool DominatorTree::dominates(const Instruction *Def, const BasicBlock *UseBB = User->getParent(); const BasicBlock *DefBB = Def->getParent(); - assert(isReachableFromEntry(DefBB) && isReachableFromEntry(UseBB) && - "We only handle reachable blocks"); + // Any unreachable use is dominated, even if Def == User. + if (!isReachableFromEntry(UseBB)) + return true; + + // Unreachable definitions don't dominate anything. + if (!isReachableFromEntry(DefBB)) + return false; // An instruction doesn't dominate a use in itself. if (Def == User) @@ -119,8 +124,13 @@ bool DominatorTree::dominates(const Instruction *Def, const BasicBlock *UseBB) const { const BasicBlock *DefBB = Def->getParent(); - assert(isReachableFromEntry(DefBB) && isReachableFromEntry(UseBB) && - "We only handle reachable blocks"); + // Any unreachable use is dominated, even if DefBB == UseBB. + if (!isReachableFromEntry(UseBB)) + return true; + + // Unreachable definitions don't dominate anything. + if (!isReachableFromEntry(DefBB)) + return false; if (DefBB == UseBB) return false; @@ -174,3 +184,83 @@ bool DominatorTree::dominates(const Instruction *Def, } return true; } + +bool DominatorTree::dominates(const Instruction *Def, + const Use &U) const { + Instruction *UserInst = dyn_cast<Instruction>(U.getUser()); + + // Instructions do not dominate non-instructions. + if (!UserInst) + return false; + + const BasicBlock *DefBB = Def->getParent(); + + // Determine the block in which the use happens. PHI nodes use + // their operands on edges; simulate this by thinking of the use + // happening at the end of the predecessor block. + const BasicBlock *UseBB; + if (PHINode *PN = dyn_cast<PHINode>(UserInst)) + UseBB = PN->getIncomingBlock(U); + else + UseBB = UserInst->getParent(); + + // Any unreachable use is dominated, even if Def == User. + if (!isReachableFromEntry(UseBB)) + return true; + + // Unreachable definitions don't dominate anything. + if (!isReachableFromEntry(DefBB)) + return false; + + // Invoke instructions define their return values on the edges + // to their normal successors, so we have to handle them specially. + // Among other things, this means they don't dominate anything in + // their own block, except possibly a phi, so we don't need to + // walk the block in any case. 
+ if (const InvokeInst *II = dyn_cast<InvokeInst>(Def)) { + // A PHI in the normal successor using the invoke's return value is + // dominated by the invoke's return value. + if (isa<PHINode>(UserInst) && + UserInst->getParent() == II->getNormalDest() && + cast<PHINode>(UserInst)->getIncomingBlock(U) == DefBB) + return true; + + // Otherwise use the instruction-dominates-block query, which + // handles the crazy case of an invoke with a critical edge + // properly. + return dominates(Def, UseBB); + } + + // If the def and use are in different blocks, do a simple CFG dominator + // tree query. + if (DefBB != UseBB) + return dominates(DefBB, UseBB); + + // Ok, def and use are in the same block. If the def is an invoke, it + // doesn't dominate anything in the block. If it's a PHI, it dominates + // everything in the block. + if (isa<PHINode>(UserInst)) + return true; + + // Otherwise, just loop through the basic block until we find Def or User. + BasicBlock::const_iterator I = DefBB->begin(); + for (; &*I != Def && &*I != UserInst; ++I) + /*empty*/; + + return &*I != UserInst; +} + +bool DominatorTree::isReachableFromEntry(const Use &U) const { + Instruction *I = dyn_cast<Instruction>(U.getUser()); + + // ConstantExprs aren't really reachable from the entry block, but they + // don't need to be treated like unreachable code either. + if (!I) return true; + + // PHI nodes use their operands on their incoming edges. + if (PHINode *PN = dyn_cast<PHINode>(I)) + return isReachableFromEntry(PN->getIncomingBlock(U)); + + // Everything else uses their operands in their own block. + return isReachableFromEntry(I->getParent()); +} diff --git a/lib/VMCore/Instructions.cpp b/lib/VMCore/Instructions.cpp index 8db6ac9..6c5db32 100644 --- a/lib/VMCore/Instructions.cpp +++ b/lib/VMCore/Instructions.cpp @@ -2003,6 +2003,23 @@ bool BinaryOperator::isExact() const { } //===----------------------------------------------------------------------===// +// FPMathOperator Class +//===----------------------------------------------------------------------===// + +/// getFPAccuracy - Get the maximum error permitted by this operation in ULPs. +/// An accuracy of 0.0 means that the operation should be performed with the +/// default precision. +float FPMathOperator::getFPAccuracy() const { + const MDNode *MD = + cast<Instruction>(this)->getMetadata(LLVMContext::MD_fpmath); + if (!MD) + return 0.0; + ConstantFP *Accuracy = cast<ConstantFP>(MD->getOperand(0)); + return Accuracy->getValueAPF().convertToFloat(); +} + + +//===----------------------------------------------------------------------===// // CastInst Class //===----------------------------------------------------------------------===// diff --git a/lib/VMCore/LLVMContext.cpp b/lib/VMCore/LLVMContext.cpp index d77e996..f07f0b3 100644 --- a/lib/VMCore/LLVMContext.cpp +++ b/lib/VMCore/LLVMContext.cpp @@ -44,10 +44,15 @@ LLVMContext::LLVMContext() : pImpl(new LLVMContextImpl(*this)) { unsigned ProfID = getMDKindID("prof"); assert(ProfID == MD_prof && "prof kind id drifted"); (void)ProfID; - // Create the 'fpaccuracy' metadata kind. - unsigned FPAccuracyID = getMDKindID("fpaccuracy"); - assert(FPAccuracyID == MD_fpaccuracy && "fpaccuracy kind id drifted"); + // Create the 'fpmath' metadata kind. + unsigned FPAccuracyID = getMDKindID("fpmath"); + assert(FPAccuracyID == MD_fpmath && "fpmath kind id drifted"); (void)FPAccuracyID; + + // Create the 'range' metadata kind. 
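The use-based dominance query added above is exactly what the ObjCARCContract change earlier in this patch relies on: a PHI operand is tested on its incoming edge rather than in the PHI's own block, and unreachable self-dominating code is filtered out first. The intended call pattern:

    // Rewrite use U to Replacement (computed by Inst) only when the
    // definition dominates that particular use.
    if (DT->isReachableFromEntry(U) && DT->dominates(Inst, U))
      U.set(Replacement);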
+ unsigned RangeID = getMDKindID("range"); + assert(RangeID == MD_range && "range kind id drifted"); + (void)RangeID; } LLVMContext::~LLVMContext() { delete pImpl; } diff --git a/lib/VMCore/LLVMContextImpl.h b/lib/VMCore/LLVMContextImpl.h index f98526d..2252028 100644 --- a/lib/VMCore/LLVMContextImpl.h +++ b/lib/VMCore/LLVMContextImpl.h @@ -194,6 +194,26 @@ struct FunctionTypeKeyInfo { } }; +// Provide a FoldingSetTrait::Equals specialization for MDNode that can use a +// shortcut to avoid comparing all operands. +template<> struct FoldingSetTrait<MDNode> : DefaultFoldingSetTrait<MDNode> { + static bool Equals(const MDNode &X, const FoldingSetNodeID &ID, + unsigned IDHash, FoldingSetNodeID &TempID) { + assert(!X.isNotUniqued() && "Non-uniqued MDNode in FoldingSet?"); + // First, check if the cached hashes match. If they don't we can skip the + // expensive operand walk. + if (X.Hash != IDHash) + return false; + + // If they match we have to compare the operands. + X.Profile(TempID); + return TempID == ID; + } + static unsigned ComputeHash(const MDNode &X, FoldingSetNodeID &) { + return X.Hash; // Return cached hash. + } +}; + /// DebugRecVH - This is a CallbackVH used to keep the Scope -> index maps /// up to date as MDNodes mutate. This class is implemented in DebugLoc.cpp. class DebugRecVH : public CallbackVH { @@ -234,7 +254,7 @@ public: DenseMapAPFloatKeyInfo> FPMapTy; FPMapTy FPConstants; - StringMap<MDString*> MDStringCache; + StringMap<Value*> MDStringCache; FoldingSet<MDNode> MDNodeSet; // MDNodes may be uniqued or not uniqued. When they're not uniqued, they diff --git a/lib/VMCore/Metadata.cpp b/lib/VMCore/Metadata.cpp index 0fc2a25..090b09a 100644 --- a/lib/VMCore/Metadata.cpp +++ b/lib/VMCore/Metadata.cpp @@ -31,16 +31,17 @@ using namespace llvm; void MDString::anchor() { } -MDString::MDString(LLVMContext &C, StringRef S) - : Value(Type::getMetadataTy(C), Value::MDStringVal), Str(S) {} +MDString::MDString(LLVMContext &C) + : Value(Type::getMetadataTy(C), Value::MDStringVal) {} MDString *MDString::get(LLVMContext &Context, StringRef Str) { LLVMContextImpl *pImpl = Context.pImpl; - StringMapEntry<MDString *> &Entry = + StringMapEntry<Value*> &Entry = pImpl->MDStringCache.GetOrCreateValue(Str); - MDString *&S = Entry.getValue(); - if (!S) S = new MDString(Context, Entry.getKey()); - return S; + Value *&S = Entry.getValue(); + if (!S) S = new MDString(Context); + S->setValueName(&Entry); + return cast<MDString>(S); } //===----------------------------------------------------------------------===// @@ -50,14 +51,26 @@ MDString *MDString::get(LLVMContext &Context, StringRef Str) { // Use CallbackVH to hold MDNode operands. namespace llvm { class MDNodeOperand : public CallbackVH { - MDNode *Parent; + MDNode *getParent() { + MDNodeOperand *Cur = this; + + while (Cur->getValPtrInt() != 1) + --Cur; + + assert(Cur->getValPtrInt() == 1 && + "Couldn't find the beginning of the operand list!"); + return reinterpret_cast<MDNode*>(Cur) - 1; + } + public: - MDNodeOperand(Value *V, MDNode *P) : CallbackVH(V), Parent(P) {} + MDNodeOperand(Value *V) : CallbackVH(V) {} ~MDNodeOperand() {} - void set(Value *V) { - setValPtr(V); - } + void set(Value *V) { this->setValPtr(V); } + + /// setAsFirstOperand - Accessor method to mark the operand as the first in + /// the list. 
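+  /// (Operands are co-allocated in a single block directly after their
+  /// owning MDNode, so getParent() above can recover the node by walking
+  /// back to the operand tagged as first and stepping over it, instead of
+  /// spending a Parent pointer in every operand.)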
+ void setAsFirstOperand(unsigned V) { this->setValPtrInt(V); } virtual void deleted(); virtual void allUsesReplacedWith(Value *NV); @@ -66,15 +79,13 @@ public: void MDNodeOperand::deleted() { - Parent->replaceOperand(this, 0); + getParent()->replaceOperand(this, 0); } void MDNodeOperand::allUsesReplacedWith(Value *NV) { - Parent->replaceOperand(this, NV); + getParent()->replaceOperand(this, NV); } - - //===----------------------------------------------------------------------===// // MDNode implementation. // @@ -102,8 +113,13 @@ MDNode::MDNode(LLVMContext &C, ArrayRef<Value*> Vals, bool isFunctionLocal) // Initialize the operand list, which is co-allocated on the end of the node. unsigned i = 0; for (MDNodeOperand *Op = getOperandPtr(this, 0), *E = Op+NumOperands; - Op != E; ++Op, ++i) - new (Op) MDNodeOperand(Vals[i], this); + Op != E; ++Op, ++i) { + new (Op) MDNodeOperand(Vals[i]); + + // Mark the first MDNodeOperand as being the first in the list of operands. + if (i == 0) + Op->setAsFirstOperand(1); + } } @@ -205,11 +221,11 @@ MDNode *MDNode::getMDNode(LLVMContext &Context, ArrayRef<Value*> Vals, ID.AddPointer(Vals[i]); void *InsertPoint; - MDNode *N = NULL; - - if ((N = pImpl->MDNodeSet.FindNodeOrInsertPos(ID, InsertPoint))) + MDNode *N = pImpl->MDNodeSet.FindNodeOrInsertPos(ID, InsertPoint); + + if (N || !Insert) return N; - + bool isFunctionLocal = false; switch (FL) { case FL_Unknown: @@ -234,6 +250,9 @@ MDNode *MDNode::getMDNode(LLVMContext &Context, ArrayRef<Value*> Vals, void *Ptr = malloc(sizeof(MDNode)+Vals.size()*sizeof(MDNodeOperand)); N = new (Ptr) MDNode(Context, Vals, isFunctionLocal); + // Cache the operand hash. + N->Hash = ID.ComputeHash(); + // InsertPoint will have been set by the FindNodeOrInsertPos call. pImpl->MDNodeSet.InsertNode(N, InsertPoint); @@ -357,6 +376,8 @@ void MDNode::replaceOperand(MDNodeOperand *Op, Value *To) { return; } + // Cache the operand hash. + Hash = ID.ComputeHash(); // InsertPoint will have been set by the FindNodeOrInsertPos call. pImpl->MDNodeSet.InsertNode(this, InsertPoint); @@ -551,17 +572,15 @@ getAllMetadataOtherThanDebugLocImpl(SmallVectorImpl<std::pair<unsigned, getContext().pImpl->MetadataStore.count(this) && "Shouldn't have called this"); const LLVMContextImpl::MDMapTy &Info = - getContext().pImpl->MetadataStore.find(this)->second; + getContext().pImpl->MetadataStore.find(this)->second; assert(!Info.empty() && "Shouldn't have called this"); - Result.append(Info.begin(), Info.end()); - + // Sort the resulting array so it is stable. if (Result.size() > 1) array_pod_sort(Result.begin(), Result.end()); } - /// clearMetadataHashEntries - Clear all hashtable-based metadata from /// this instruction. void Instruction::clearMetadataHashEntries() { diff --git a/lib/VMCore/Module.cpp b/lib/VMCore/Module.cpp index e8bc6db..3c67191 100644 --- a/lib/VMCore/Module.cpp +++ b/lib/VMCore/Module.cpp @@ -434,7 +434,7 @@ bool Module::MaterializeAllPermanently(std::string *ErrInfo) { // -// dropAllReferences() - This function causes all the subelementss to "let go" +// dropAllReferences() - This function causes all the subelements to "let go" // of all references that they are maintaining. This allows one to 'delete' a // whole module at a time, even though there may be circular references... first // all references are dropped, and all use counts go to zero. 
Then everything diff --git a/lib/VMCore/PassManager.cpp b/lib/VMCore/PassManager.cpp index 773862d..28fbaa6 100644 --- a/lib/VMCore/PassManager.cpp +++ b/lib/VMCore/PassManager.cpp @@ -14,7 +14,6 @@ #include "llvm/PassManagers.h" #include "llvm/PassManager.h" -#include "llvm/DebugInfoProbe.h" #include "llvm/Assembly/PrintModulePass.h" #include "llvm/Assembly/Writer.h" #include "llvm/Support/CommandLine.h" @@ -26,7 +25,6 @@ #include "llvm/Support/PassNameParser.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/Mutex.h" -#include "llvm/ADT/StringMap.h" #include <algorithm> #include <map> using namespace llvm; @@ -422,20 +420,6 @@ char PassManagerImpl::ID = 0; namespace { //===----------------------------------------------------------------------===// -// DebugInfoProbe - -static DebugInfoProbeInfo *TheDebugProbe; -static void createDebugInfoProbe() { - if (TheDebugProbe) return; - - // Constructed the first time this is called. This guarantees that the - // object will be constructed, if -enable-debug-info-probe is set, - // before static globals, thus it will be destroyed before them. - static ManagedStatic<DebugInfoProbeInfo> DIP; - TheDebugProbe = &*DIP; -} - -//===----------------------------------------------------------------------===// /// TimingInfo Class - This class is used to calculate information about the /// amount of time each pass takes to execute. This only happens when /// -time-passes is enabled on the command line. @@ -1440,7 +1424,6 @@ void FunctionPassManagerImpl::releaseMemoryOnTheFly() { bool FunctionPassManagerImpl::run(Function &F) { bool Changed = false; TimingInfo::createTheTimeInfo(); - createDebugInfoProbe(); initializeAllAnalysisInfo(); for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index) @@ -1488,16 +1471,13 @@ bool FPPassManager::runOnFunction(Function &F) { dumpRequiredSet(FP); initializeAnalysisImpl(FP); - if (TheDebugProbe) - TheDebugProbe->initialize(FP, F); + { PassManagerPrettyStackEntry X(FP, F); TimeRegion PassTimer(getPassTimer(FP)); LocalChanged |= FP->runOnFunction(F); } - if (TheDebugProbe) - TheDebugProbe->finalize(FP, F); Changed |= LocalChanged; if (LocalChanged) @@ -1647,7 +1627,6 @@ Pass* MPPassManager::getOnTheFlyPass(Pass *MP, AnalysisID PI, Function &F){ bool PassManagerImpl::run(Module &M) { bool Changed = false; TimingInfo::createTheTimeInfo(); - createDebugInfoProbe(); dumpArguments(); dumpPasses(); diff --git a/lib/VMCore/Use.cpp b/lib/VMCore/Use.cpp index 359a151..0128adc 100644 --- a/lib/VMCore/Use.cpp +++ b/lib/VMCore/Use.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Value.h" +#include <new> namespace llvm { diff --git a/lib/VMCore/Value.cpp b/lib/VMCore/Value.cpp index 41cc38c..4006b2c 100644 --- a/lib/VMCore/Value.cpp +++ b/lib/VMCore/Value.cpp @@ -76,7 +76,7 @@ Value::~Value() { // If this value is named, destroy the name. This should not be in a symtab // at this point. - if (Name) + if (Name && SubclassID != MDStringVal) Name->Destroy(); // There should be no uses of this object anymore, remove it. @@ -170,6 +170,9 @@ StringRef Value::getName() const { } void Value::setName(const Twine &NewName) { + assert(SubclassID != MDStringVal && + "Cannot set the name of MDString with this method!"); + // Fast path for common IRBuilder case of setName("") when there is no name. 
if (NewName.isTriviallyEmpty() && !hasName()) return; @@ -228,6 +231,8 @@ void Value::setName(const Twine &NewName) { /// takeName - transfer the name from V to this value, setting V's name to /// empty. It is an error to call V->takeName(V). void Value::takeName(Value *V) { + assert(SubclassID != MDStringVal && "Cannot take the name of an MDString!"); + ValueSymbolTable *ST = 0; // If this value has a name, drop it. if (hasName()) { @@ -477,7 +482,7 @@ void ValueHandleBase::AddToExistingUseList(ValueHandleBase **List) { setPrevPtr(List); if (Next) { Next->setPrevPtr(&Next); - assert(VP == Next->VP && "Added to wrong list?"); + assert(VP.getPointer() == Next->VP.getPointer() && "Added to wrong list?"); } } @@ -493,14 +498,14 @@ void ValueHandleBase::AddToExistingUseListAfter(ValueHandleBase *List) { /// AddToUseList - Add this ValueHandle to the use list for VP. void ValueHandleBase::AddToUseList() { - assert(VP && "Null pointer doesn't have a use list!"); + assert(VP.getPointer() && "Null pointer doesn't have a use list!"); - LLVMContextImpl *pImpl = VP->getContext().pImpl; + LLVMContextImpl *pImpl = VP.getPointer()->getContext().pImpl; - if (VP->HasValueHandle) { + if (VP.getPointer()->HasValueHandle) { // If this value already has a ValueHandle, then it must be in the // ValueHandles map already. - ValueHandleBase *&Entry = pImpl->ValueHandles[VP]; + ValueHandleBase *&Entry = pImpl->ValueHandles[VP.getPointer()]; assert(Entry != 0 && "Value doesn't have any handles?"); AddToExistingUseList(&Entry); return; @@ -514,10 +519,10 @@ void ValueHandleBase::AddToUseList() { DenseMap<Value*, ValueHandleBase*> &Handles = pImpl->ValueHandles; const void *OldBucketPtr = Handles.getPointerIntoBucketsArray(); - ValueHandleBase *&Entry = Handles[VP]; + ValueHandleBase *&Entry = Handles[VP.getPointer()]; assert(Entry == 0 && "Value really did already have handles?"); AddToExistingUseList(&Entry); - VP->HasValueHandle = true; + VP.getPointer()->HasValueHandle = true; // If reallocation didn't happen or if this was the first insertion, don't // walk the table. @@ -529,14 +534,16 @@ void ValueHandleBase::AddToUseList() { // Okay, reallocation did happen. Fix the Prev Pointers. for (DenseMap<Value*, ValueHandleBase*>::iterator I = Handles.begin(), E = Handles.end(); I != E; ++I) { - assert(I->second && I->first == I->second->VP && "List invariant broken!"); + assert(I->second && I->first == I->second->VP.getPointer() && + "List invariant broken!"); I->second->setPrevPtr(&I->second); } } /// RemoveFromUseList - Remove this ValueHandle from its current use list. void ValueHandleBase::RemoveFromUseList() { - assert(VP && VP->HasValueHandle && "Pointer doesn't have a use list!"); + assert(VP.getPointer() && VP.getPointer()->HasValueHandle && + "Pointer doesn't have a use list!"); // Unlink this from its use list. ValueHandleBase **PrevPtr = getPrevPtr(); @@ -552,11 +559,11 @@ void ValueHandleBase::RemoveFromUseList() { // If the Next pointer was null, then it is possible that this was the last // ValueHandle watching VP. If so, delete its entry from the ValueHandles // map. 
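+  // (VP is now a PointerIntPair<Value*, 2>: the spare low bits back the
+  // getValPtrInt()/setValPtrInt() tag that MDNodeOperand uses to mark the
+  // first operand, so every raw use of the Value* goes through
+  // getPointer().)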
- LLVMContextImpl *pImpl = VP->getContext().pImpl; + LLVMContextImpl *pImpl = VP.getPointer()->getContext().pImpl; DenseMap<Value*, ValueHandleBase*> &Handles = pImpl->ValueHandles; if (Handles.isPointerIntoBucketsArray(PrevPtr)) { - Handles.erase(VP); - VP->HasValueHandle = false; + Handles.erase(VP.getPointer()); + VP.getPointer()->HasValueHandle = false; } } diff --git a/lib/VMCore/Verifier.cpp b/lib/VMCore/Verifier.cpp index 5b9b2a5..47baef3 100644 --- a/lib/VMCore/Verifier.cpp +++ b/lib/VMCore/Verifier.cpp @@ -51,6 +51,7 @@ #include "llvm/DerivedTypes.h" #include "llvm/InlineAsm.h" #include "llvm/IntrinsicInst.h" +#include "llvm/LLVMContext.h" #include "llvm/Metadata.h" #include "llvm/Module.h" #include "llvm/Pass.h" @@ -117,7 +118,6 @@ namespace { struct Verifier : public FunctionPass, public InstVisitor<Verifier> { static char ID; // Pass ID, replacement for typeid bool Broken; // Is this module found to be broken? - bool RealPass; // Are we not being run by a PassManager? VerifierFailureAction action; // What to do if verification fails. Module *Mod; // Module we are verifying right now @@ -143,13 +143,13 @@ namespace { const Value *PersonalityFn; Verifier() - : FunctionPass(ID), Broken(false), RealPass(true), + : FunctionPass(ID), Broken(false), action(AbortProcessAction), Mod(0), Context(0), DT(0), MessagesStr(Messages), PersonalityFn(0) { initializeVerifierPass(*PassRegistry::getPassRegistry()); } explicit Verifier(VerifierFailureAction ctn) - : FunctionPass(ID), Broken(false), RealPass(true), action(ctn), Mod(0), + : FunctionPass(ID), Broken(false), action(ctn), Mod(0), Context(0), DT(0), MessagesStr(Messages), PersonalityFn(0) { initializeVerifierPass(*PassRegistry::getPassRegistry()); } @@ -158,17 +158,14 @@ namespace { Mod = &M; Context = &M.getContext(); - // If this is a real pass, in a pass manager, we must abort before - // returning back to the pass manager, or else the pass manager may try to - // run other passes on the broken module. - if (RealPass) - return abortIfBroken(); - return false; + // We must abort before returning back to the pass manager, or else the + // pass manager may try to run other passes on the broken module. + return abortIfBroken(); } bool runOnFunction(Function &F) { // Get dominator information if we are being run by PassManager - if (RealPass) DT = &getAnalysis<DominatorTree>(); + DT = &getAnalysis<DominatorTree>(); Mod = F.getParent(); if (!Context) Context = &F.getContext(); @@ -177,13 +174,9 @@ namespace { InstsInThisBlock.clear(); PersonalityFn = 0; - // If this is a real pass, in a pass manager, we must abort before - // returning back to the pass manager, or else the pass manager may try to - // run other passes on the broken module. - if (RealPass) - return abortIfBroken(); - - return false; + // We must abort before returning back to the pass manager, or else the + // pass manager may try to run other passes on the broken module. 
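+    // (Verification outside a pass manager goes through verifyFunction()
+    // and verifyModule(), which build a pass manager of their own, so the
+    // old RealPass escape hatch is no longer needed.)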
+ return abortIfBroken(); } bool doFinalization(Module &M) { @@ -214,8 +207,7 @@ namespace { virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); AU.addRequiredID(PreVerifyID); - if (RealPass) - AU.addRequired<DominatorTree>(); + AU.addRequired<DominatorTree>(); } /// abortIfBroken - If the module is broken and we are supposed to abort on @@ -1369,6 +1361,25 @@ void Verifier::visitLoadInst(LoadInst &LI) { Assert1(LI.getSynchScope() == CrossThread, "Non-atomic load cannot have SynchronizationScope specified", &LI); } + + if (MDNode *Range = LI.getMetadata(LLVMContext::MD_range)) { + unsigned NumOperands = Range->getNumOperands(); + Assert1(NumOperands % 2 == 0, "Unfinished range!", Range); + unsigned NumRanges = NumOperands / 2; + Assert1(NumRanges >= 1, "It should have at least one range!", Range); + for (unsigned i = 0; i < NumRanges; ++i) { + ConstantInt *Low = dyn_cast<ConstantInt>(Range->getOperand(2*i)); + Assert1(Low, "The lower limit must be an integer!", Low); + ConstantInt *High = dyn_cast<ConstantInt>(Range->getOperand(2*i + 1)); + Assert1(High, "The upper limit must be an integer!", High); + Assert1(High->getType() == Low->getType() && + High->getType() == ElTy, "Range types must match load type!", + &LI); + Assert1(High->getValue() != Low->getValue(), "Range must not be empty!", + Range); + } + } + visitInstruction(LI); } @@ -1641,6 +1652,24 @@ void Verifier::visitInstruction(Instruction &I) { "Cannot take the address of an inline asm!", &I); } } + + if (MDNode *MD = I.getMetadata(LLVMContext::MD_fpmath)) { + Assert1(I.getType()->isFPOrFPVectorTy(), + "fpmath requires a floating point result!", &I); + Assert1(MD->getNumOperands() == 1, "fpmath takes one operand!", &I); + Value *Op0 = MD->getOperand(0); + if (ConstantFP *CFP0 = dyn_cast_or_null<ConstantFP>(Op0)) { + APFloat Accuracy = CFP0->getValueAPF(); + Assert1(Accuracy.isNormal() && !Accuracy.isNegative(), + "fpmath accuracy not a positive number!", &I); + } else { + Assert1(false, "invalid fpmath accuracy!", &I); + } + } + + MDNode *MD = I.getMetadata(LLVMContext::MD_range); + Assert1(!MD || isa<LoadInst>(I), "Ranges are only for loads!", &I); + InstsInThisBlock.insert(&I); }
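The two verifier hunks above pin down the shape of the new metadata: !fpmath carries a single positive ConstantFP accuracy in ULPs, while !range carries one or more [Low, High) ConstantInt pairs of the loaded type and may only appear on loads. A minimal sketch of builder-side code that satisfies both checks, assuming a tree contemporary with this merge (old pre-llvm/IR header paths); markByteRange and addFpmath are illustrative helper names, not LLVM API:

#include "llvm/Constants.h"
#include "llvm/Instructions.h"
#include "llvm/LLVMContext.h"
#include "llvm/Metadata.h"
#include "llvm/Type.h"
using namespace llvm;

// Tag an i8 load as producing only ASCII, i.e. the half-open range
// [0, 128). Low and High must be ConstantInts of the loaded type and the
// range must not be empty, exactly as visitLoadInst now verifies.
static void markByteRange(LoadInst *LI) {
  LLVMContext &Ctx = LI->getContext();
  Type *I8 = Type::getInt8Ty(Ctx);
  Value *Pair[] = { ConstantInt::get(I8, 0), ConstantInt::get(I8, 128) };
  LI->setMetadata(LLVMContext::MD_range, MDNode::get(Ctx, Pair));
}

// Permit 2.5 ULPs of error on a floating-point instruction. The verifier
// demands exactly one operand, a normal, non-negative ConstantFP, and a
// floating-point result type on the tagged instruction.
static void addFpmath(Instruction *FPOp) {
  LLVMContext &Ctx = FPOp->getContext();
  Value *Acc = ConstantFP::get(Type::getFloatTy(Ctx), 2.5);
  FPOp->setMetadata(LLVMContext::MD_fpmath, MDNode::get(Ctx, Acc));
}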
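The MDNodeOperand::getParent() change in the Metadata.cpp hunk is an instance of a general co-allocation trick: when children live in the same allocation immediately after their parent, a one-bit "first child" tag replaces a back-pointer in every child. Below is a toy, self-contained sketch of the pattern; Parent, Operand and createParent are made-up names, and a plain bool stands in for the pointer low bit that the real code keeps in its CallbackVH:

#include <cassert>
#include <cstddef>
#include <cstdlib>
#include <new>

struct Operand;

struct Parent {
  std::size_t NumOperands;
  // The operands start immediately after this object in memory.
  Operand *op_begin() { return reinterpret_cast<Operand *>(this + 1); }
};

struct Operand {
  void *Val;
  bool IsFirst; // plays the role of getValPtrInt() == 1 above

  Parent *getParent() {
    Operand *Cur = this;
    while (!Cur->IsFirst)
      --Cur;                                    // scan back to the first operand
    return reinterpret_cast<Parent *>(Cur) - 1; // the parent sits just before it
  }
};

// Co-allocate a Parent and its N operands in one block, as
// MDNode::getMDNode does with malloc plus placement new.
static Parent *createParent(std::size_t N) {
  void *Mem = std::malloc(sizeof(Parent) + N * sizeof(Operand));
  Parent *P = new (Mem) Parent();
  P->NumOperands = N;
  for (std::size_t i = 0; i != N; ++i) {
    Operand *Op = new (P->op_begin() + i) Operand();
    Op->Val = 0;
    Op->IsFirst = (i == 0);
  }
  return P;
}

int main() {
  Parent *P = createParent(3);
  assert(P->op_begin()[2].getParent() == P); // any operand finds its parent
  std::free(P);
  return 0;
}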